In [78]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from nltk.corpus import stopwords


class Model:
    def __init__(self, vector_size=100, window=5, min_count=1):

        self.w2v_model = None
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.classifier = LogisticRegression()

    # tokenize a review
    def tokenize(self, text):

        stop_words = stopwords.words('english')
        wrds = str(text).split()
        wrds = [word for word in wrds if word not in stop_words]
        return wrds
    
    # w2v train
    def train_word2vec(self, review_tokens):

        self.w2v_model = Word2Vec(sentences=review_tokens, 
                         vector_size=self.vector_size, 
                         window=self.window, 
                         min_count=self.window,  
                         workers=4)
        
    # w2v conversion
    def w2v(self, word):
        if word in self.w2v_model.wv:
            return self.w2v_model.wv[word]
        else:
            return np.zeros(100)
    
    # train classifier (reviews allready tokenized)
    def train_classifier(self, reviews, labels):
        
        x = self.average_reviews(reviews)

        self.classifier.fit(x, labels)

    # predict
    def predict(self, reviews):

        x = self.average_reviews([self.tokenize(review) for review in reviews])

        predicted = []
        for review in x:
            rev = np.array(review)
            predicted += [self.classifier.predict(rev.reshape(1, -1))]
        
        return predicted

        
    # averages all w2v of a review to obtain a vector representing the review
    def rev2vec(self, review):

        average = np.zeros(100)
        word_count = 0
        
        for word in review:
            if len(average) == 0:
                average = word
            else:
                average = np.add(average, word)
            if np.all(word == 0):
                word_count += 1
        if word_count > 0:
            average = average/word_count

        return average

    # average reviews
    def average_reviews(self, reviews):

        averaged_reviews = []

        for review in reviews:
            w2v_mat = []
            for word in review:
                w2v_mat += [self.w2v(word)]
            averaged_reviews += [self.rev2vec(w2v_mat)]

        return averaged_reviews

Test and Train Sets

In [79]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

X_train, y_train, X_test, y_test = train.iloc[:,1], train.iloc[:,0], test.iloc[:,1], test.iloc[:,0] 

Train the model (w2v and LR with the same data)

In [80]:
model = Model()

w2v_tokens = [model.tokenize(review) for review in X_train]

model.train_word2vec(w2v_tokens)

Train the Logistic Regression classifier

In [81]:
model.train_classifier(w2v_tokens, y_train)

Prediction

In [82]:
pred = model.predict(X_test)

In [84]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, pred)


0.7832569581423954