In [1]:
import random
import os
import re
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import 
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors
# import nltk
# import gensim
# from gensim.models import Word2Vec
# import xgboost as xgb

In [2]:
# Loading the GLOVE model
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r',encoding='utf-8')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [3]:
# Loading pre-trained word2vec model
word2vec = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
# Loading pretrained glove model
glove = loadGloveModel("../../glove.6B/glove.6B.300d.txt")

Loading Glove Model
Done. 400000  words loaded!


In [4]:
# This code is taken from  https://github.com/marcotcr/lime-experiments/blob/master/load_datasets.py and modified a bit
def LoadMultiDomainDataset(path_data, remove_bigrams=True):
    random.seed(1)
    pos = []
    neg = []
    def get_words(line, remove_bigrams=True):
        z = [tuple(x.split(':')) for x in re.findall('\w*?:\d', line)]
        if remove_bigrams:
            z = ' '.join([' '.join([x[0]] * int(x[1])) for x in z if '_' not in x[0]])
        else:
            z = ' '.join([' '.join([x[0]] * int(x[1])) for x in z])
        return z
    for line in open(os.path.join(path_data, 'books_negative.review')):
        neg.append(get_words(line, remove_bigrams))
    for line in open(os.path.join(path_data, 'books_positive.review')):
        pos.append(get_words(line, remove_bigrams))
    data = pos+neg
    labels = [1] * len(pos) + [0]* len(neg)
    return data, np.array(labels).reshape(len(labels),1)

In [5]:
def rf_model(df):
    # seleting X and y and splitting data into train and test datasets
    X = df.iloc[:, :-1]
    y = books_df.iloc[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Grid search for best parameters
    rfc = RandomForestClassifier(random_state=42) 
    param_grid = {
        'n_estimators': [1000],
        'max_depth':[6,8],
        'max_features': ['auto']
    }
    CV = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, verbose=3)
    
    # fitting to trian data and calculating training accuarcy
    CV.fit(X_train, y_train)   
    print('best train accuracy: {:.2f}% & best parameter combination: {}'.format(CV.best_score_*100,CV.best_params_))
    
    # Calculating test accuracy
    y_pred = CV.best_estimator_.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    print('test accuracy: {:.2f}%'.format(test_acc*100))

In [6]:
# Load data
books_reviews, books_targets = LoadMultiDomainDataset('./data')

# classification using word2vec

In [7]:
# Creating mean word2vec embedding for each review
def mean_word2vec_vector(reviews):
    mean_vecs=[]
    for review in reviews:
        vecs=[]
        tokens = review.split()
        for token in tokens:
            if token in word2vec:
                vec = word2vec.word_vec(token)
                vecs.append(vec)
            mean_vec = np.mean(vecs,axis=0)
        mean_vecs.append(mean_vec)
    return np.array(mean_vecs)

In [8]:
# creating dataframe
books_vecs = mean_word2vec_vector(books_reviews)
books_arr = np.hstack((books_vecs,books_targets))
books_df = pd.DataFrame(books_arr)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [9]:
# Building RF model
rf_model(books_df)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.775, total=  11.4s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.3s remaining:    0.0s


[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.831, total=  12.4s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   23.7s remaining:    0.0s


[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.812, total=  14.3s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.787, total=  15.3s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.766, total=  17.0s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.787, total=  13.0s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.822, total=  12.1s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.797, total=  11.6s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.794, 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  2.2min finished


best train accuracy: 79.69% & best parameter combination: {'max_depth': 8, 'max_features': 'auto', 'n_estimators': 1000}
test accuracy: 79.75%


# classification using GLove Embeddings

In [10]:
# creating mean glove embeddings for each review
def mean_glove_vector(reviews):
    mean_vecs=[]
    for review in reviews:
        vecs=[]
        tokens = review.split()
        for token in tokens:
            if token in glove.keys():
                vec = glove[token]
                vecs.append(vec)
            mean_vec = np.mean(vecs,axis=0)
        mean_vecs.append(mean_vec)
    return np.array(mean_vecs)

In [11]:
# creating dataframe
books_vecs = mean_glove_vector(books_reviews)
books_arr = np.hstack((books_vecs,books_targets))
books_df = pd.DataFrame(books_arr)

# Building RF model
rf_model(books_df)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.775, total=   8.9s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.8s remaining:    0.0s


[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.806, total=   9.8s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.6s remaining:    0.0s


[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.787, total=  10.4s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.756, total=   9.1s
[CV] max_depth=6, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=6, max_features=auto, n_estimators=1000, score=0.762, total=  10.3s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.766, total=  12.1s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.800, total=  13.2s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.781, total=  14.0s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.772, 

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.9min finished


best train accuracy: 77.88% & best parameter combination: {'max_depth': 8, 'max_features': 'auto', 'n_estimators': 1000}
test accuracy: 77.00%
