In [0]:
import re
import pandas as pd
import numpy as np
import json
import logging
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [0]:
# Input CSV location, relative to the notebook's working directory.
DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

# quoting=3 is csv.QUOTE_NONE: quote characters in the file are read as ordinary text.
train_data = pd.read_csv(
    DATA_IN_PATH + TRAIN_CLEAN_DATA,
    header=0,
    delimiter=",",
    quoting=3,
)

# Pull the two columns used downstream out of the frame as plain lists.
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

# Word2Vec consumes token lists, so split every review on whitespace.
sentences = [review.split() for review in reviews]

In [0]:
# Word2Vec hyperparameters; each one is forwarded to the Word2Vec constructor
# in the training cell under the keyword noted on its line.
num_features = 300     # `size`: length of each word vector
min_word_count = 40    # `min_count`: words seen fewer times than this are dropped
num_workers = 4        # `workers`: number of training threads
context = 10           # `window`: context window width, in words
downsampling = 0.001   # `sample`: down-sampling threshold for frequent words

In [0]:
# Route INFO-and-above log records to the default handler, formatted as
# "<timestamp>: <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s: %(levelname)s : %(message)s',
)

In [0]:
# Train word embeddings on the tokenized reviews with the hyperparameters
# defined earlier in the notebook.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` -- confirm the installed version before upgrading.
print("Training model...")
model = word2vec.Word2Vec(
    sentences=sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

In [0]:
# Build the file name from the hyperparameters that were actually used, so the
# saved artifact cannot end up mislabeled when one of them is changed. With the
# current settings this yields '300features_40minwords_10context'.
model_name = '{}features_{}minwords_{}context'.format(
    num_features, min_word_count, context
)
model.save(model_name)

2019-05-19 15:53:00,001: INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2019-05-19 15:53:00,003: INFO : not storing attribute vectors_norm
2019-05-19 15:53:00,005: INFO : not storing attribute cum_table
2019-05-19 15:53:00,329: INFO : saved 300features_40minwords_10context


In [0]:
def get_features(words, model, num_features):
  """Average the word vectors of the in-vocabulary tokens of one review.

  Args:
    words: iterable of string tokens.
    model: trained Word2Vec-style model; only its `wv` keyed-vector store is
      used, for membership tests (`in`) and vector lookup (`[]`).
    num_features: length of each word vector, and of the returned vector.

  Returns:
    A 1-D array of length `num_features` holding the mean of the vectors of
    the tokens found in the vocabulary. When no token is in the vocabulary
    (or `words` is empty) an all-zero float32 vector is returned instead of
    dividing by zero and producing NaNs.
  """
  # Membership is tested directly on the keyed vectors: this is a constant-time
  # lookup, whereas rebuilding a set of the whole vocabulary on every call
  # costs time proportional to the vocabulary size for each review.
  keyed_vectors = model.wv

  feature_vector = np.zeros(num_features, dtype=np.float32)
  num_words = 0

  for w in words:
    if w in keyed_vectors:
      num_words += 1
      # Index the keyed vectors, not the model: `model[w]` is deprecated.
      feature_vector = np.add(feature_vector, keyed_vectors[w])

  if num_words == 0:
    # Nothing to average; dividing would fill the vector with NaN.
    return feature_vector

  return np.divide(feature_vector, num_words)

In [0]:
def get_dataset(reviews, model, num_features):
  """Turn a collection of tokenized reviews into one stacked feature array.

  Each review is converted with `get_features`, and the per-review vectors
  are stacked along a new leading axis, so row i belongs to `reviews[i]`.

  Args:
    reviews: iterable of token lists, one per review.
    model: model object forwarded unchanged to `get_features`.
    num_features: vector length forwarded unchanged to `get_features`.

  Returns:
    The array produced by `np.stack` over the per-review feature vectors.
  """
  per_review_vectors = [
      get_features(tokens, model, num_features) for tokens in reviews
  ]
  return np.stack(per_review_vectors)

In [0]:
# One averaged word-vector row per tokenized review.
train_data_vecs = get_dataset(
    reviews=sentences,
    model=model,
    num_features=num_features,
)

  del sys.path[0]


In [0]:
# Split settings: a fixed seed makes the shuffle repeatable, and 20% of the
# rows are held out for evaluation.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

# Features are the averaged review vectors; targets are the sentiment labels.
X = train_data_vecs
y = np.array(sentiments)

X_train, X_eval, y_train, y_eval = train_test_split(
    X,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [0]:
# Fit a logistic-regression classifier on the training split. `fit` returns
# the estimator itself, so construction and fitting can be chained.
lgs = LogisticRegression(class_weight='balanced').fit(X_train, y_train)

# `score` is evaluated on the held-out split and printed.
print("Accuracy {}".format(lgs.score(X_eval, y_eval)))



Accuracy 0.866
