In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
import re
from langdetect import detect

In [21]:
# remove non-English reviews
def remove_nonenglish_reviews(df, detector=None):
    """Return the rows of ``df`` whose ``text`` value is detected as English.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding the review bodies.
    detector : callable, optional
        Function mapping one text to a language code. When omitted, the
        ``detect`` function imported at the top of the file is used.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` whose detected language is ``'en'``. The
        original index labels are preserved.

    Notes
    -----
    Any text the detector fails on (it raises) is labelled ``'none'`` and is
    therefore dropped, matching the original best-effort behaviour.
    """
    detect_language = detect if detector is None else detector

    def _language_of(text):
        # Best effort: a text the detector cannot handle is simply treated as
        # non-English instead of aborting the whole clean-up.
        try:
            return detect_language(text)
        except Exception:
            return 'none'

    # Map over the column itself rather than looking rows up by label
    # 0..n-1, so frames with a non-default index are handled correctly.
    languages = df['text'].map(_language_of)

    # Positional boolean mask: no index alignment is involved.
    return df[(languages == 'en').values]

# remove duplicates
def remove_duplicates(df):
    """Drop rows whose ``text`` value already appeared in an earlier row.

    The first occurrence of each text is kept; the input frame is not
    modified and the surviving rows keep their original index labels.
    """
    repeated = df.duplicated(subset='text')
    return df[~repeated]

In [22]:
#tokenize text
def tokenize_text(text, lemmatizer=None):
    """Split ``text`` into lower-cased alphabetic tokens and lemmatize them.

    Parameters
    ----------
    text : str
        Raw document text.
    lemmatizer : object, optional
        Object exposing ``lemmatize(word, pos=...)``. When omitted, a single
        ``WordNetLemmatizer`` is created for this call.

    Returns
    -------
    list of str
        One entry per run of ASCII letters in ``text``. Each token is
        lemmatized with the lemmatizer's default part of speech first and
        then again as a verb (``pos='v'``).
    """
    if lemmatizer is None:
        # One instance per call instead of two fresh instances per token.
        lemmatizer = WordNetLemmatizer()

    words = re.findall('[A-Za-z]+', text.lower())
    # Default-POS pass first, verb pass second -- same order as before,
    # just fused into a single traversal.
    return [lemmatizer.lemmatize(lemmatizer.lemmatize(w), pos='v') for w in words]

In [23]:
#vectorization
def build_feature_matrices(X, max_features=3000):
    """Fit a bag-of-words vectorizer on ``X`` and return counts plus vocabulary.

    Parameters
    ----------
    X : iterable of str
        Documents to vectorize.
    max_features : int, optional
        Upper bound on the vocabulary size passed to ``CountVectorizer``
        (default 3000).

    Returns
    -------
    X_transform : numpy.ndarray
        Dense document-term count matrix.
    features_voc : list of str
        Vocabulary terms, in the column order of ``X_transform``.
    """
    vectorizer = CountVectorizer(tokenizer=tokenize_text, max_features=max_features)
    X_transform = vectorizer.fit_transform(X).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when present and fall back on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features_voc = vectorizer.get_feature_names_out().tolist()
    else:
        features_voc = vectorizer.get_feature_names()
    return X_transform, features_voc

def build_feature_matrices_voc(X, vocabulary=None):
    """Vectorize ``X`` into term counts over a fixed vocabulary.

    Parameters
    ----------
    X : iterable of str
        Documents to vectorize.
    vocabulary : list of str, optional
        Terms that define the columns of the result. When omitted, the
        module-level ``new_features`` list is used, so that global must
        already exist (a ``NameError`` is raised otherwise).

    Returns
    -------
    numpy.ndarray
        Dense document-term count matrix with one column per vocabulary term.
    """
    if vocabulary is None:
        # Backward-compatible fallback to the notebook-level global.
        vocabulary = new_features
    vectorizer = CountVectorizer(tokenizer=tokenize_text, vocabulary=vocabulary)
    return vectorizer.fit_transform(X).toarray()


def build_feature_matrices_test(X, vocabulary=None):
    """Vectorize held-out documents over a fixed vocabulary.

    Kept as a separate name for existing callers; it simply delegates to
    ``build_feature_matrices_voc`` so both paths share one implementation.
    """
    return build_feature_matrices_voc(X, vocabulary=vocabulary)

In [27]:
def clean_features(features_voc):
    """Keep only vocabulary words whose first SentiWordNet sense is not fully objective.

    Parameters
    ----------
    features_voc : iterable of str
        Candidate vocabulary words.

    Returns
    -------
    list of str
        Words, in their original order, that have at least one SentiWordNet
        entry and whose first entry has an objectivity score other than 1.
    """
    kept = []
    for word in features_voc:
        # Query SentiWordNet once per word and look at the first sense only.
        first_sense = next(iter(swn.senti_synsets(word)), None)
        if first_sense is None:
            continue  # word is unknown to SentiWordNet
        if first_sense.obj_score() == 1:
            continue  # first sense carries no positive/negative weight
        kept.append(word)
    return kept

In [24]:
# train classifier
def build_clf(X, Y, random_state=111):
    """Fit a random forest on ``(X, Y)`` and report its training accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    Y : array-like
        Target labels, one per row of ``X``.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier`` so repeated runs build
        the same forest. Defaults to 111; pass ``None`` for an unseeded
        forest.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.

    Notes
    -----
    The printed accuracy is computed on the same data the model was fitted
    on, so it is a training score rather than a generalization estimate.
    """
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(X, Y)
    print("Accuracy: ",clf.score(X, Y))
    return clf

#predictions
def predict_clf(X, Y, model=None):
    """Predict labels for ``X``, print the accuracy against ``Y``, and return the predictions.

    Parameters
    ----------
    X : array-like
        Feature matrix to predict on.
    Y : array-like
        True labels used for the printed accuracy.
    model : object, optional
        Fitted estimator exposing ``predict`` and ``score``. When omitted,
        the module-level ``clf`` is used, so that global must already exist.

    Returns
    -------
    The value returned by ``model.predict(X)``.
    """
    if model is None:
        # Backward-compatible fallback to the notebook-level classifier.
        model = clf
    predictions = model.predict(X)
    print("Accuracy: ",model.score(X, Y))
    # Previously the predictions were computed and then discarded.
    return predictions

## Main

In [26]:
# Load the raw reviews: pipe-delimited, first row is the header, and
# malformed rows are skipped (error_bad_lines=False) instead of raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0 in favour of on_bad_lines='skip' -- confirm the pinned pandas version.
df = pd.read_table('reviews.csv', header=0, error_bad_lines=False, delimiter='|')
# Clean the dataset: keep English reviews only, then drop repeated review texts.
df = remove_nonenglish_reviews(df)
df = remove_duplicates(df)
# Hold out 30% of the cleaned rows as the test set; the fixed seed keeps the
# split reproducible across runs.
train, test = train_test_split(df, test_size = 0.3, random_state = 111)

In [28]:
# Vectorize the training texts in two passes: first learn a size-capped
# vocabulary, then filter it with clean_features and re-vectorize the same
# texts over the filtered list (overwriting the first-pass matrix).
# `new_features` is read as a global by build_feature_matrices_voc and
# build_feature_matrices_test, so this cell must run before either is called.
X_transform_train, features_voc = build_feature_matrices(train['text'])
new_features = clean_features(features_voc)
X_transform_train = build_feature_matrices_voc(train['text'])

In [29]:
# Fit the classifier on the training matrix. build_clf prints the accuracy
# measured on this same training data. `clf` is read as a global by predict_clf.
clf = build_clf(X_transform_train, train['label'])  

Accuracy:  0.909193457417


In [30]:
# Vectorize the held-out texts over the vocabulary stored in `new_features`,
# so the columns line up with the training matrix.
X_transform_test = build_feature_matrices_test(test['text'])

In [31]:
# Evaluate the fitted classifier on the held-out set; predict_clf prints the
# test accuracy.
predictions = predict_clf(X_transform_test,test['label'])

Accuracy:  0.685203306901
