In [41]:
import pandas as pd
import numpy as np
import nltk
import gensim
from langdetect import detect
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

In [42]:
def import_data(path, sep="|"):
    """Read a delimited text file into a DataFrame.

    Parameters
    ----------
    path : str, path-like or file-like
        Location of the file (anything ``pandas.read_csv`` accepts).
    sep : str, default "|"
        Field delimiter.

    Returns
    -------
    pandas.DataFrame
    """
    # `sep` must be passed by keyword: recent pandas releases accept only the
    # file path positionally and raise TypeError otherwise.
    return pd.read_csv(path, sep=sep)

def save_trained_model(model, path):
    """Persist ``model`` to ``path`` using joblib.

    Parameters
    ----------
    model : object
        The fitted estimator (or any object joblib can serialize).
    path : str
        Destination file name.
    """
    joblib.dump(value=model, filename=path)


def __drop_duplicates(df):
    return df.drop_duplicates()


def __balance_data(df):
    df_positive = df[df.label == 1]
    df_negative = df[df.label == 0]
    pos_vs_neg = len(df_positive) - len(df_negative)
    if pos_vs_neg > 0:
        drop_indices = np.random.choice(df_positive.index, pos_vs_neg, replace=False)
        df_positive = df_positive.drop (drop_indices)
    elif pos_vs_neg < 0:
        pos_vs_neg *= -1
        drop_indices = np.random.choice(df_negative.index, pos_vs_neg, replace=False)
        df_negative = df_negative.drop (drop_indices)
    else:
         return df   
    return pd.concat([df_positive, df_negative])


def __perform_stemming(review):
    stemmer = PorterStemmer()
    return "".join([stemmer.stem(word) for word in review.split()])


def __drop_non_english(df):
    drop_indices= []
    for index, row in df.iterrows():
        try:
            if detect(row['text']) != "en":
                drop_indices.append (index)
        except Exception:
             drop_indices.append (index)
    return df.drop(drop_indices)

    
def __is_numerical(token):
    return str.isnumeric(token)


def preprocessing(df, balance_data=False, drop_non_english=False, drop_duplicates=False, stemming=False,
                  replace_numerical=False, replacement_of_numerical="NUMERICAL_TOKEN", lowercase=False):
    """Apply the selected cleaning steps to a review DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column (and a ``label`` column when
        ``balance_data`` is requested).
    balance_data : bool
        Down-sample the larger class so both labels have equal counts.
    drop_non_english : bool
        Remove rows whose text is not detected as English.
    drop_duplicates : bool
        Remove fully duplicated rows.
    stemming : bool
        Replace every token by its Porter stem.
    replace_numerical : bool
        Replace numeric tokens by ``replacement_of_numerical``.
    replacement_of_numerical : str
        Placeholder written in place of numeric tokens.
    lowercase : bool
        Lower-case every token.

    Returns
    -------
    pandas.DataFrame
        The processed frame. The caller's frame is never modified; when no
        option is enabled the input object itself is returned.
    """
    # Row filters run first and balancing last: filtering after balancing
    # could remove more rows from one class and leave the result unbalanced.
    if drop_duplicates:
        df = __drop_duplicates(df)
    if drop_non_english:
        df = __drop_non_english(df)
    if balance_data:
        df = __balance_data(df)

    if not (stemming or replace_numerical or lowercase):
        return df

    # One stemmer for the whole frame instead of one per review.
    stemmer = PorterStemmer() if stemming else None

    def tbt_cleaning(review):
        """Clean a single review token by token and re-join it with spaces."""
        cleaned_tokens = []
        for token in nltk.word_tokenize(review):
            if stemming:
                token = stemmer.stem(token)
            if lowercase:
                token = token.lower()
            # Substituted after lower-casing so the placeholder is emitted
            # exactly as the caller supplied it.
            if replace_numerical and __is_numerical(token):
                token = replacement_of_numerical
            cleaned_tokens.append(token)
        return ' '.join(cleaned_tokens)

    # assign() builds a new frame: the caller's DataFrame is left untouched
    # and no write ever lands on a filtered slice (SettingWithCopyWarning).
    return df.assign(text=df.text.apply(tbt_cleaning))

In [43]:
# Load the review dataset and display how many rows it contains.
df = import_data(path='reviews_rt_all.csv')
df.shape[0]

102610

In [53]:
# Takes the last n words if naive is True. Otherwise, takes the first ceil(n/2) words and the last floor(n/2) words.
def take_n_words(review, naive=False, n=20):
    """Tokenize a review and keep at most ``n`` of its tokens.

    Parameters
    ----------
    review : str or iterable of str
        Raw text (tokenized with ``nltk.word_tokenize``) or an already
        tokenized sequence, which is used as is.
    naive : bool, default False
        If True keep the last ``n`` tokens. If False keep the first
        ``ceil(n/2)`` and the last ``floor(n/2)`` tokens.
    n : int, default 20
        Maximum number of tokens to return.

    Returns
    -------
    list of str
        All tokens when there are at most ``n`` of them, otherwise exactly
        ``n`` tokens in their original order.
    """
    tokens = nltk.word_tokenize(review) if isinstance(review, str) else list(review)
    total = len(tokens)
    if total <= n:
        return tokens
    if n <= 0:
        return []
    if naive:
        # Slice from an explicit start offset; total > n here, so it is positive.
        return tokens[total - n:]
    head = (n + 1) // 2  # the extra token of an odd n goes to the front
    tail = n // 2
    return tokens[:head] + tokens[total - tail:]

In [45]:
# Keep the first and last tokens of every review (non-naive selection).
df['n_words'] = df['text'].apply(take_n_words, naive=False)

In [6]:
# Load the pretrained word vectors stored in binary word2vec format.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [46]:
def vectorize(review_tokens, n=20, embeddings=None):
    """Turn a token list into one fixed-length feature vector.

    Each of the first ``n`` tokens contributes its embedding; tokens missing
    from the vocabulary, and unused trailing slots, contribute zeros. The
    per-token vectors are laid out one after another.

    Parameters
    ----------
    review_tokens : sequence of str
        Tokens of one review. Tokens beyond the first ``n`` are ignored so
        that every review yields a vector of the same length.
    n : int, default 20
        Number of token slots in the output.
    embeddings : mapping, optional
        Object supporting ``embeddings[token]`` and raising ``KeyError`` for
        unknown tokens. Defaults to the module-level ``word2vec`` model.

    Returns
    -------
    numpy.ndarray
        1-D float32 array of length ``n * dim``, where ``dim`` is
        ``embeddings.vector_size`` when available and 300 otherwise.
    """
    if embeddings is None:
        embeddings = word2vec
    dim = getattr(embeddings, "vector_size", 300)

    # Pre-allocate once instead of re-concatenating an ever-growing array.
    slots = np.zeros((n, dim), dtype=np.float32)
    for position, token in enumerate(review_tokens[:n]):
        try:
            slots[position] = embeddings[token]
        except KeyError:
            # Out-of-vocabulary token: leave its slot as zeros.
            pass
    return slots.reshape(-1)

In [47]:
# Build one fixed-length feature vector per review from its selected tokens.
df['features'] = df['n_words'].map(vectorize)

In [49]:
# Split into training and held-out sets: 20% is held out, and the fixed seed
# makes the split repeatable.
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(df.features),
    y,
    test_size=0.2,
    random_state=0,
)

In [50]:
# Build and fit the linear classifier on the training split.
# NOTE(review): `n_iter` appears to have been replaced by `max_iter` in later
# scikit-learn releases -- confirm against the installed version before upgrading.
classifier = PassiveAggressiveClassifier(
    C=0.001,
    fit_intercept=False,
    shuffle=False,
    n_iter=91,
    n_jobs=-1,
)
classifier.fit(X_train, y_train)

PassiveAggressiveClassifier(C=0.001, class_weight=None, fit_intercept=False,
              loss='hinge', n_iter=91, n_jobs=-1, random_state=None,
              shuffle=False, verbose=0, warm_start=False)

In [51]:
# Predict a label for every sample of the held-out test split
predictions = classifier.predict(X_test)

In [52]:
# Compare the predictions with the true test labels and report the accuracy.
accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.734869895722


In [216]:
# Persist the fitted classifier to disk.
save_trained_model(model=classifier, path='model.pkl')