In [None]:
import gensim
# Show which gensim version this notebook runs against.
gensim.__version__

'4.3.2'

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored on Drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Make the Drive "Datasets" folder the working directory, so files there can be
# opened by bare file name.
os.chdir("/content/drive/MyDrive/Datasets")

In [None]:
import numpy as np
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from gensim.models import  Word2Vec
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Load the tab-separated reviews file; the path is relative to the current working directory.
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t')

In [None]:
# Preview the first rows to check the columns that were read.
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# Download NLTK's stop-word corpus, which stopwords.words() reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop-word list.
# NOTE(review): in the visible cells this is only referenced from commented-out
# code inside preprocess() - confirm whether it is still needed.
stops = stopwords.words('english')

In [None]:
def preprocess(text_col, size=None):
    """Tokenise raw review texts into lists of lower-case alphabetic words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace. No stop-word removal or
    stemming is performed.

    Parameters
    ----------
    text_col : iterable of str
        The texts to tokenise (for example a list or a pandas Series). Items
        are read in iteration order, so a Series with a non-default index is
        handled the same way as a plain list.
    size : int, optional
        Process at most the first ``size`` texts. ``None`` (the default)
        processes all of them.

    Returns
    -------
    list of list of str
        One token list per processed text. A text that yields no tokens is
        represented by the placeholder ``["anything"]`` so that no entry is
        ever empty.
    """
    # Materialise the values once: indexing a Series with ``text_col[i]`` is a
    # label lookup and fails when the index is not 0..n-1.
    texts = list(text_col)
    if size is not None:
        # max() keeps a negative size meaning "nothing", as range(0, size) did.
        texts = texts[:max(size, 0)]

    corpus = []
    for text in texts:
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        # Downstream averaging of word vectors needs at least one token.
        corpus.append(tokens if tokens else ["anything"])
    return corpus

In [None]:
# Tokenise every review in the 'Review' column (size = number of rows in the frame).
corpus = preprocess(text_col=dataset['Review'],
                    size=dataset.shape[0])

In [None]:
# Sanity check: number of tokenised reviews.
len(corpus)

1000

In [None]:
# Peek at the first three token lists.
corpus[:3]

[['wow', 'loved', 'this', 'place'],
 ['crust', 'is', 'not', 'good'],
 ['not', 'tasty', 'and', 'the', 'texture', 'was', 'just', 'nasty']]

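The Word2Vec training algorithm is chosen with the `sg` parameter:
- CBOW (continuous bag of words): `sg = 0`
- Skip-Gram: `sg = 1`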

In [None]:
# Train word embeddings on the tokenised reviews (sg=0 selects CBOW).
# min_count=1 keeps every word, so each token in `corpus` gets a vector.
# seed + workers=1 make training repeatable from run to run; per the gensim
# docs, reproducibility across interpreter launches also requires a fixed
# PYTHONHASHSEED.
model_r = Word2Vec(corpus, min_count=1, vector_size=100, sg=0,
                   seed=23, workers=1)

In [None]:
# Represent each review by the element-wise mean of its (raw) word vectors:
# one row per review, one column per embedding dimension.
means = np.array([
    np.mean([model_r.wv.get_vector(word) for word in sentence], axis=0)
    for sentence in corpus
])

X = means
# Select the label column by name so the target does not silently change if
# the column order of the file ever does.
y = dataset['Liked']

In [None]:
# Feature matrix dimensions: (number of reviews, embedding size).
X.shape

(1000, 100)

In [None]:
# Label vector length; should match the number of rows in X.
y.shape

(1000,)

In [None]:
# Hold out 20% of the reviews for evaluation, keeping the class balance of y
# in both parts (stratify) and fixing the shuffle for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=23, stratify=y
)

# Baseline model: a 25-tree random forest fitted on the training part only.
classifier = RandomForestClassifier(n_estimators=25, random_state=23)
classifier.fit(X_train, y_train)

**Model Evaluation**


In [None]:
# Probability column 1 of predict_proba, i.e. the second entry of classifier.classes_
# (presumably Liked == 1 for 0/1 labels - confirm), scored with ROC AUC on the held-out split.
y_pred_prob = classifier.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))

0.61885


In [None]:
# Hard class predictions on the held-out split and the fraction that are correct.
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.59


#### Grid Search CV

In [None]:
# 5-fold stratified splitter (shuffled, fixed seed) used for the grid search.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
# List the classifier's current hyper-parameters to see what can be tuned.
print(classifier.get_params())

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 25, 'n_jobs': None, 'oob_score': False, 'random_state': 23, 'verbose': 0, 'warm_start': False}


In [None]:
# Candidate values for the number of features considered at each split.
params = {'max_features': [2, 5, 10, 20, 50]}

# Cross-validated search over the candidates using the stratified 5-fold
# splitter; verbose=3 logs the score of every individual fit.
gcv = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    cv=kfold,
    verbose=3,
)
gcv.fit(X, y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ....................max_features=2;, score=0.570 total time=   0.1s
[CV 2/5] END ....................max_features=2;, score=0.580 total time=   0.1s
[CV 3/5] END ....................max_features=2;, score=0.580 total time=   0.1s
[CV 4/5] END ....................max_features=2;, score=0.520 total time=   0.1s
[CV 5/5] END ....................max_features=2;, score=0.625 total time=   0.1s
[CV 1/5] END ....................max_features=5;, score=0.550 total time=   0.1s
[CV 2/5] END ....................max_features=5;, score=0.620 total time=   0.1s
[CV 3/5] END ....................max_features=5;, score=0.570 total time=   0.1s
[CV 4/5] END ....................max_features=5;, score=0.570 total time=   0.1s
[CV 5/5] END ....................max_features=5;, score=0.535 total time=   0.1s
[CV 1/5] END ...................max_features=10;, score=0.555 total time=   0.1s
[CV 2/5] END ...................max_features=10;,

In [None]:
# Parameter setting that achieved the best cross-validated score.
print(gcv.best_params_)

{'max_features': 50}


In [None]:
# Mean cross-validated score of that best candidate.
print(gcv.best_score_)

0.5860000000000001


In [None]:
# A few hand-written phrases to try the tuned model on.
test_corp = ['bad taste', 'horrible','love']
# Derive the size from the list itself so adding or removing a phrase cannot
# leave the count out of sync.
tst_corpus = preprocess(text_col=test_corp,
                        size=len(test_corp))

print(tst_corpus)

[['bad', 'taste'], ['horrible'], ['love']]


In [None]:
# Build features for the hand-written phrases exactly as for the training
# data: the mean of the raw (un-normalised) word vectors of each phrase.
# Using unit-normalised vectors here would feed the classifier inputs on a
# different scale from the ones it was fitted on.
test_means = []
for sentence in tst_corpus:
    # Words never seen by the Word2Vec model have no vector; skip them
    # instead of failing with a KeyError.
    word_vects = [model_r.wv.get_vector(word)
                  for word in sentence
                  if word in model_r.wv.key_to_index]
    if word_vects:
        row_means = np.mean(word_vects, axis=0)
    else:
        # No known word at all: fall back to an all-zero feature row of the
        # right width so the batch can still be scored.
        row_means = np.zeros(model_r.wv.vector_size, dtype=np.float32)
    test_means.append(row_means)
test_means = np.array(test_means)

# Predict with the best estimator refitted by the grid search.
y_pred = gcv.predict(test_means)
print(y_pred)

[1 1 1]
