In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import spacy

In [2]:
nlp=spacy.load('en_core_web_md')

In [3]:
df=pd.read_csv('amazonreviews.tsv',sep='\t')

In [4]:
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [5]:
from sklearn.preprocessing import LabelEncoder 

In [6]:
enc=LabelEncoder()

In [7]:
y=enc.fit_transform(df.label)[:1000]

In [8]:
X=df.review.values[:1000]

In [9]:
from gensim.parsing.preprocessing import strip_punctuation
reviews=[]
for t in X:
        reviews.append([i.lemma_ for i in nlp(strip_punctuation(t.lower())) if not i.is_space and not i.like_url and not i.is_stop and not i.like_email and len(i.lemma_)>1])



In [10]:
import pickle as pkl
from sklearn.linear_model import LogisticRegression
def loadData(file): 
    # for reading also binary mode is important 
    dbfile = open(file, 'rb')      
    db = pkl.load(dbfile) 
    dbfile.close()
    return db
def saveData(file,data): 
    # for reading also binary mode is important 
    dbfile = open(file, 'ab')      
    db = pkl.dump(data,dbfile) 
    dbfile.close()
    return db

### TFIDF 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
#TF-IDF # needs a list of lists for words and docs along with a fasttext 'model'
text = []
for i in reviews:
    string = ' '.join(i)
    text.append(string)
tf_idf_vect = TfidfVectorizer(stop_words=None)
final_tf_idf = tf_idf_vect.fit_transform(text)
tfidf_feat = tf_idf_vect.get_feature_names()

In [12]:
final_tf_idf

<1000x6646 sparse matrix of type '<class 'numpy.float64'>'
	with 30365 stored elements in Compressed Sparse Row format>

## Training FastText Model

In [13]:
from gensim.models import FastText
model_ft = FastText(reviews, size=20, window=5, min_count=1, iter=10, sorted_vocab=1)

### Averaging Fast Text Embeddings to get Doc Embeddings

In [14]:
from sklearn.model_selection import train_test_split
X_g=[]
for r in reviews:
    num_words=len(r)
    sum_words=0
    for w in r:
        sum_words+=model_ft.wv[w]
    X_g.append(sum_words/num_words)
X_g=np.array(X_g)
X_train,X_test,y_train,y_test=train_test_split(X_g,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))



              precision    recall  f1-score   support

           0       0.56      0.59      0.57        87
           1       0.48      0.45      0.46        73

   micro avg       0.53      0.53      0.53       160
   macro avg       0.52      0.52      0.52       160
weighted avg       0.52      0.53      0.52       160

              precision    recall  f1-score   support

           0       0.57      0.55      0.56        87
           1       0.49      0.51      0.50        73

   micro avg       0.53      0.53      0.53       160
   macro avg       0.53      0.53      0.53       160
weighted avg       0.53      0.53      0.53       160



  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.54      1.00      0.70        87
           1       0.00      0.00      0.00        73

   micro avg       0.54      0.54      0.54       160
   macro avg       0.27      0.50      0.35       160
weighted avg       0.30      0.54      0.38       160



### Combining TF-IDF with FastText Word Vectors

In [16]:
tfidf_sent_vectors_ft = [] # the tfidf-ft for each sentence/review is stored in this list
row=0
errors=0
for sent in reviews: # for each review/sentence
    sent_vec = np.zeros(20) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = model_ft.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            errors =+1
            pass
    sent_vec /= weight_sum
    #print(np.isnan(np.sum(sent_vec)))

    tfidf_sent_vectors_ft.append(sent_vec)
    row += 1
print('errors noted: '+str(errors))

errors noted: 1


In [17]:
X_ft=np.array(tfidf_sent_vectors_ft)

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X_ft,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

In [19]:
lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))



              precision    recall  f1-score   support

           0       0.50      0.91      0.65        77
           1       0.67      0.17      0.27        83

   micro avg       0.53      0.53      0.53       160
   macro avg       0.59      0.54      0.46       160
weighted avg       0.59      0.53      0.45       160



In [20]:
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.50      0.64      0.56        77
           1       0.55      0.41      0.47        83

   micro avg       0.52      0.52      0.52       160
   macro avg       0.52      0.52      0.51       160
weighted avg       0.53      0.52      0.51       160



In [21]:
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))

              precision    recall  f1-score   support

           0       0.53      0.66      0.59        77
           1       0.59      0.45      0.51        83

   micro avg       0.55      0.55      0.55       160
   macro avg       0.56      0.55      0.55       160
weighted avg       0.56      0.55      0.55       160



### Training Skip Gram Model

In [22]:
from gensim.models import Word2Vec
w2v=Word2Vec(reviews,size=20,window=5,sg=1,min_count=1)

W0924 15:23:28.380835 12412 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


### Averaging SkipGram Text Embeddings to get Doc Embeddings

In [23]:
X_g=[]
for r in reviews:
    num_words=len(r)
    sum_words=0
    for w in r:
        sum_words+=w2v.wv[w]
    X_g.append(sum_words/num_words)
X_g=np.array(X_g)
X_train,X_test,y_train,y_test=train_test_split(X_g,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

In [24]:
lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))



              precision    recall  f1-score   support

           0       0.56      1.00      0.72        89
           1       1.00      0.01      0.03        71

   micro avg       0.56      0.56      0.56       160
   macro avg       0.78      0.51      0.37       160
weighted avg       0.76      0.56      0.41       160



In [25]:
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.61      0.71      0.66        89
           1       0.54      0.44      0.48        71

   micro avg       0.59      0.59      0.59       160
   macro avg       0.58      0.57      0.57       160
weighted avg       0.58      0.59      0.58       160

              precision    recall  f1-score   support

           0       0.64      0.69      0.66        89
           1       0.56      0.51      0.53        71

   micro avg       0.61      0.61      0.61       160
   macro avg       0.60      0.60      0.60       160
weighted avg       0.60      0.61      0.60       160



### Combining TF-IDF with Word2Vec SkipGram

In [26]:
tfidf_sent_vectors_sk = [] # the tfidf-sk for each sentence/review is stored in this list
row=0
errors=0
for sent in reviews: # for each review/sentence
    sent_vec = np.zeros(20) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            errors =+1
            pass
    sent_vec /= weight_sum
    #print(np.isnan(np.sum(sent_vec)))

    tfidf_sent_vectors_sk.append(sent_vec)
    row += 1
print('errors noted: '+str(errors))

X_sk=np.array(tfidf_sent_vectors_sk)

X_train,X_test,y_train,y_test=train_test_split(X_sk,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))

errors noted: 1




              precision    recall  f1-score   support

           0       0.52      0.99      0.68        83
           1       0.50      0.01      0.03        77

   micro avg       0.52      0.52      0.52       160
   macro avg       0.51      0.50      0.35       160
weighted avg       0.51      0.52      0.37       160





              precision    recall  f1-score   support

           0       0.57      0.64      0.60        83
           1       0.55      0.48      0.51        77

   micro avg       0.56      0.56      0.56       160
   macro avg       0.56      0.56      0.56       160
weighted avg       0.56      0.56      0.56       160

              precision    recall  f1-score   support

           0       0.57      0.57      0.57        83
           1       0.54      0.55      0.54        77

   micro avg       0.56      0.56      0.56       160
   macro avg       0.56      0.56      0.56       160
weighted avg       0.56      0.56      0.56       160



### Training Word2Vec CBOW

In [27]:
w2v_cbow=Word2Vec(reviews,size=20,window=5,sg=0,min_count=1)

W0924 15:23:37.351999 12412 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


### Averaging CBOW Vectors

In [28]:
X_g=[]
for r in reviews:
    num_words=len(r)
    sum_words=0
    for w in r:
        sum_words+=w2v_cbow.wv[w]
    X_g.append(sum_words/num_words)
X_g=np.array(X_g)
X_train,X_test,y_train,y_test=train_test_split(X_g,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)


lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.57      1.00      0.73        91
           1       0.00      0.00      0.00        69

   micro avg       0.57      0.57      0.57       160
   macro avg       0.28      0.50      0.36       160
weighted avg       0.32      0.57      0.41       160





              precision    recall  f1-score   support

           0       0.61      0.62      0.61        91
           1       0.49      0.48      0.48        69

   micro avg       0.56      0.56      0.56       160
   macro avg       0.55      0.55      0.55       160
weighted avg       0.56      0.56      0.56       160

              precision    recall  f1-score   support

           0       0.57      0.57      0.57        91
           1       0.43      0.42      0.42        69

   micro avg       0.51      0.51      0.51       160
   macro avg       0.50      0.50      0.50       160
weighted avg       0.51      0.51      0.51       160



### Combine Word Vectors with TF-IDF

In [29]:
tfidf_sent_vectors_cbow = [] # the tfidf-sk for each sentence/review is stored in this list
row=0
errors=0
for sent in reviews: # for each review/sentence
    sent_vec = np.zeros(20) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_cbow.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            errors =+1
            pass
    sent_vec /= weight_sum
    #print(np.isnan(np.sum(sent_vec)))

    tfidf_sent_vectors_cbow.append(sent_vec)
    row += 1
print('errors noted: '+str(errors))

X_cbow=np.array(tfidf_sent_vectors_cbow)

X_train,X_test,y_train,y_test=train_test_split(X_cbow,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)


lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))

errors noted: 1


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.51      1.00      0.68        82
           1       0.00      0.00      0.00        78

   micro avg       0.51      0.51      0.51       160
   macro avg       0.26      0.50      0.34       160
weighted avg       0.26      0.51      0.35       160





              precision    recall  f1-score   support

           0       0.50      0.65      0.56        82
           1       0.46      0.32      0.38        78

   micro avg       0.49      0.49      0.49       160
   macro avg       0.48      0.48      0.47       160
weighted avg       0.48      0.49      0.47       160

              precision    recall  f1-score   support

           0       0.55      0.66      0.60        82
           1       0.54      0.42      0.47        78

   micro avg       0.54      0.54      0.54       160
   macro avg       0.54      0.54      0.54       160
weighted avg       0.54      0.54      0.54       160



### Processing Text for Embedding Layer

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [31]:
tokenizer=Tokenizer()

In [32]:
tokenizer.fit_on_texts(reviews)

In [33]:
V=len(tokenizer.word_index)+1

In [34]:
tokenizer.word_index['PAD']=0

In [35]:
max_len=max([len(s) for s in reviews])

In [36]:
word2id = tokenizer.word_index
word2id['PAD']=0
id2word={v:k for k,v in word2id.items()}

In [37]:
revs=tokenizer.texts_to_sequences(reviews)
X=pad_sequences(revs,maxlen=max_len)

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score

In [39]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

### GloVe Embedding

In [35]:
rev_idx=X

In [124]:
# co-occurence matrix
X = np.zeros((V, V))
N = 1000

In [126]:
it = 0
context_size=5
for s in rev_idx:
    for i in range(len(s)):
        wi=s[i] # select current word
        start= max(0,i-context_size) # define start index
        end = min(100,i+context_size) # define end index of the context

        if i - context_size < 0:
            points = 1.0/(i+1) # calculate context distances 
            X[wi,0]+=points
            X[0,wi]+=points
            
        if i + context_size > 100:
            points = 1.0 / (100 - i)
            X[wi,1] += points
            X[1,wi] += points

        for j in range(start,i):
            wj = s[j]
            points = 1.0 / (i - j) # this is +ve
            X[wi,wj] += points
            X[wj,wi] += points
            
        # right side
        for j in range(i + 1, end):
            wj = s[j]
            points = 1.0 / (j - i) # this is +ve
            X[wi,wj] += points
            X[wj,wi] += points

In [134]:
# initialize weight matrix
fX=np.zeros((V,V))
fX[X<100]=(X[X<100]/float(100))**0.75
fX[X>=100]=1
# target
logX = np.log(X + 1)
D=20

In [142]:
class Glove(tf.keras.Model):
    def __init__(self, num_dims, vocab_size,mu):
        super(Glove, self).__init__()
        # initialize weights
        W = np.random.randn(V, D) / np.sqrt(V + D)
        b = np.zeros(V)
        U = np.random.randn(V, D) / np.sqrt(V + D)
        c = np.zeros(V)
        self.mu = mu
        # initialize weights, inputs, targets placeholders
        self.W = tf.Variable(W.astype(np.float32))
        self.b = tf.Variable(b.reshape(V, 1).astype(np.float32))
        self.U = tf.Variable(U.astype(np.float32))
        self.c = tf.Variable(c.reshape(1, V).astype(np.float32))
        self.params = [self.W, self.b,self.U,self.c]

    def call(self,inputs):
        return tf.matmul(self.W, tf.transpose(self.U)) + self.b + self.c + self.mu

# Define the loss
def get_loss(model, inputs, targets):
    predictions = model(inputs)
    delta = targets - predictions
    return tf.reduce_sum(inputs * delta * delta)

# Gradient function
def get_grad(model, inputs, targets):
    with tf.GradientTape() as tape:
        # calculate the loss
        loss_value = get_loss(model, inputs, targets)
        # return gradient
        return tape.gradient(loss_value, model.params)

In [143]:
mu = logX.mean()
glove_model=Glove(20,V,mu)

In [None]:
# Store the losses here
losses = []

In [144]:
# Create an optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Run the training loop
for i in range(200):
    # Get gradients
    grads = get_grad(glove_model, fX, logX)

    # Do one step of gradient descent: param <- param - learning_rate * grad
    optimizer.apply_gradients(zip(grads, glove_model.params))

    # Store the loss
    loss = get_loss(glove_model, fX, logX)
    losses.append(loss)
    print(i," ",loss)

W0916 18:33:45.195358 14080 base_layer.py:1772] Layer glove_3 is casting an input tensor from dtype float64 to the layer's dtype of float32, which is new behavior in TensorFlow 2.  The layer has dtype float32 because it's dtype defaults to floatx.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



0   tf.Tensor(13577.893, shape=(), dtype=float32)
1   tf.Tensor(13189.344, shape=(), dtype=float32)
2   tf.Tensor(12783.364, shape=(), dtype=float32)
3   tf.Tensor(12351.918, shape=(), dtype=float32)
4   tf.Tensor(11891.551, shape=(), dtype=float32)
5   tf.Tensor(11401.018, shape=(), dtype=float32)
6   tf.Tensor(10880.328, shape=(), dtype=float32)
7   tf.Tensor(10330.77, shape=(), dtype=float32)
8   tf.Tensor(9755.085, shape=(), dtype=float32)
9   tf.Tensor(9157.549, shape=(), dtype=float32)
10   tf.Tensor(8544.055, shape=(), dtype=float32)
11   tf.Tensor(7922.18, shape=(), dtype=float32)
12   tf.Tensor(7301.2637, shape=(), dtype=float32)
13   tf.Tensor(6692.4316, shape=(), dtype=float32)
14   tf.Tensor(6108.5107, shape=(), dtype=float32)
15   tf.Tensor(5563.7354, shape=(), dtype=float32)
16   tf.Tensor(5073.0103, shape=(), dtype=float32)
17   tf.Tensor(4650.4575, shape=(), dtype=float32)
18   tf.Tensor(4306.875, shape=(), dtype=float32)
19   tf.Tensor(4046.212, shape=(), dtype=float32

In [149]:
W_f,_,U_f,_=glove_model.params
W1,W2=[W_f.numpy(),U_f.numpy().T]
We = np.hstack([W1, W2.T])
We_avg = (W1 + W2.T) / 2

In [153]:
We_avg.shape

(6647, 20)

In [40]:
glove_emb_avg=loadData('glove_avg.pkl')
glove_emb_concat=loadData('glove_concat.pkl')

### Averaging GloVe Embeddings to get Sentence Embeddings

In [41]:
X_g=[]
for r in revs:
    num_words=len(r)
    sum_words=0
    for w in r:
        sum_words+=glove_emb_avg[w]
    X_g.append(sum_words/num_words)
X_g=np.array(X_g)
X_train,X_test,y_train,y_test=train_test_split(X_g,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

In [42]:
lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.58      0.81      0.68        81
           1       0.68      0.41      0.51        79

   micro avg       0.61      0.61      0.61       160
   macro avg       0.63      0.61      0.59       160
weighted avg       0.63      0.61      0.60       160





              precision    recall  f1-score   support

           0       0.60      0.78      0.68        81
           1       0.67      0.47      0.55        79

   micro avg       0.62      0.62      0.62       160
   macro avg       0.64      0.62      0.61       160
weighted avg       0.64      0.62      0.62       160

              precision    recall  f1-score   support

           0       0.59      0.64      0.62        81
           1       0.60      0.54      0.57        79

   micro avg       0.59      0.59      0.59       160
   macro avg       0.59      0.59      0.59       160
weighted avg       0.59      0.59      0.59       160



### Concated GloVe Embeddings which are averaged to get Sentence Embeddings

In [43]:
X_g=[]
for r in revs:
    num_words=len(r)
    sum_words=0
    for w in r:
        sum_words+=glove_emb_concat[w]
    X_g.append(sum_words/num_words)
X_g=np.array(X_g)
X_train,X_test,y_train,y_test=train_test_split(X_g,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.65      0.73      0.69        84
           1       0.65      0.57      0.61        76

   micro avg       0.65      0.65      0.65       160
   macro avg       0.65      0.65      0.65       160
weighted avg       0.65      0.65      0.65       160





              precision    recall  f1-score   support

           0       0.64      0.75      0.69        84
           1       0.66      0.54      0.59        76

   micro avg       0.65      0.65      0.65       160
   macro avg       0.65      0.64      0.64       160
weighted avg       0.65      0.65      0.65       160

              precision    recall  f1-score   support

           0       0.65      0.71      0.68        84
           1       0.65      0.58      0.61        76

   micro avg       0.65      0.65      0.65       160
   macro avg       0.65      0.65      0.65       160
weighted avg       0.65      0.65      0.65       160



### Combining GloVe with TF-IDF

#### Using Averaged GloVe Embeddings

In [44]:
tfidf_sent_vectors_gl = [] # the tfidf-sk for each sentence/review is stored in this list
row=0
errors=0
for sent in reviews: # for each review/sentence
    sent_vec = np.zeros(20) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = glove_emb_avg[word2id[word]]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            errors =+1
            pass
    sent_vec /= weight_sum
    #print(np.isnan(np.sum(sent_vec)))

    tfidf_sent_vectors_gl.append(sent_vec)
    row += 1
print('errors noted: '+str(errors))

errors noted: 1


In [45]:
X_skip=np.array(tfidf_sent_vectors_gl)

X_train,X_test,y_train,y_test=train_test_split(X_skip,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.57      0.79      0.66        82
           1       0.62      0.36      0.46        78

   micro avg       0.58      0.58      0.58       160
   macro avg       0.59      0.58      0.56       160
weighted avg       0.59      0.58      0.56       160





              precision    recall  f1-score   support

           0       0.60      0.76      0.67        82
           1       0.65      0.47      0.55        78

   micro avg       0.62      0.62      0.62       160
   macro avg       0.63      0.62      0.61       160
weighted avg       0.62      0.62      0.61       160

              precision    recall  f1-score   support

           0       0.57      0.66      0.61        82
           1       0.58      0.49      0.53        78

   micro avg       0.57      0.57      0.57       160
   macro avg       0.58      0.57      0.57       160
weighted avg       0.58      0.57      0.57       160



#### Using Concatenated GloVe Embeddings

In [46]:
tfidf_sent_vectors_gl = [] # the tfidf-sk for each sentence/review is stored in this list
row=0
errors=0
for sent in reviews: # for each review/sentence
    sent_vec = np.zeros(40) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = glove_emb_concat[word2id[word]]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            errors =+1
            pass
    sent_vec /= weight_sum
    #print(np.isnan(np.sum(sent_vec)))

    tfidf_sent_vectors_gl.append(sent_vec)
    row += 1
print('errors noted: '+str(errors))

errors noted: 1


In [47]:
X_skip=np.array(tfidf_sent_vectors_gl)

X_train,X_test,y_train,y_test=train_test_split(X_skip,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.57      0.71      0.63        85
           1       0.54      0.39      0.45        75

   micro avg       0.56      0.56      0.56       160
   macro avg       0.55      0.55      0.54       160
weighted avg       0.55      0.56      0.54       160





              precision    recall  f1-score   support

           0       0.56      0.69      0.62        85
           1       0.52      0.37      0.43        75

   micro avg       0.54      0.54      0.54       160
   macro avg       0.54      0.53      0.53       160
weighted avg       0.54      0.54      0.53       160

              precision    recall  f1-score   support

           0       0.66      0.74      0.70        85
           1       0.66      0.57      0.61        75

   micro avg       0.66      0.66      0.66       160
   macro avg       0.66      0.66      0.66       160
weighted avg       0.66      0.66      0.66       160



### Jointly Learnt Embeddings

In [48]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

In [51]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Flatten,LSTM,Dropout,GlobalMaxPool1D
from tensorflow.keras.optimizers import Adam

In [52]:
model=Sequential()
model.add(Embedding(input_dim=V,output_dim=20,input_length=max_len))
model.add(LSTM(5,return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(15, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(5, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
opt=tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss = "mse", optimizer = opt, metrics=["accuracy"])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 20)           132940    
_________________________________________________________________
lstm (LSTM)                  (None, 100, 5)            520       
_________________________________________________________________
global_max_pooling1d (Global (None, 5)                 0         
_________________________________________________________________
flatten (Flatten)            (None, 5)                 0         
_________________________________________________________________
dropout (Dropout)            (None, 5)                 0         
_________________________________________________________________
dense (Dense)                (None, 15)                90        
_________________________________________________________________
dropout_1 (Dropout)          (None, 15)                0

In [53]:
model.fit(X_train,y_train,batch_size=32,epochs=20,validation_data=[X_val,y_val])

Train on 640 samples, validate on 160 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2aa9d622198>

In [54]:
emb=model.layers[0].get_weights()[0]

In [55]:
preds=np.round(model.predict(X_test))
print(classification_report(y_test,preds))
print(accuracy_score(y_test,preds))

              precision    recall  f1-score   support

           0       0.77      0.79      0.78       106
           1       0.76      0.73      0.75        94

   micro avg       0.77      0.77      0.77       200
   macro avg       0.76      0.76      0.76       200
weighted avg       0.76      0.77      0.76       200

0.765


In [56]:
saveData('LearntEmb.pkl',emb)

### Combining Jointly Learnt Word Embeddings to Form Sentence Embeddings

In [49]:
emb=loadData('LearntEmb.pkl')
X=[]
for r in revs:
    num_words=len(r)
    sum_words=0
    for w in r:
        sum_words+=emb[w]
    X.append(sum_words/num_words)
X=np.array(X)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

IndexError: index 6646 is out of bounds for axis 0 with size 6646

In [50]:
lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.69      0.57      0.62       100
           1       0.44      0.57      0.50        60

   micro avg       0.57      0.57      0.57       160
   macro avg       0.56      0.57      0.56       160
weighted avg       0.59      0.57      0.58       160





              precision    recall  f1-score   support

           0       0.61      0.65      0.63       100
           1       0.34      0.30      0.32        60

   micro avg       0.52      0.52      0.52       160
   macro avg       0.47      0.47      0.47       160
weighted avg       0.51      0.52      0.51       160

              precision    recall  f1-score   support

           0       0.64      0.63      0.64       100
           1       0.40      0.42      0.41        60

   micro avg       0.55      0.55      0.55       160
   macro avg       0.52      0.52      0.52       160
weighted avg       0.55      0.55      0.55       160



### Combining Word Vectors with TF-IDF to form Sentence Vectors

In [51]:
tfidf_sent_vectors_learnt = [] # the tfidf-sk for each sentence/review is stored in this list
row=0
errors=0
for sent in reviews: # for each review/sentence
    sent_vec = np.zeros(20) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = emb[word2id[word]]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            errors =+1
            pass
    sent_vec /= weight_sum
    #print(np.isnan(np.sum(sent_vec)))

    tfidf_sent_vectors_learnt.append(sent_vec)
    row += 1
print('errors noted: '+str(errors))

errors noted: 1


In [52]:
X_skip=np.array(tfidf_sent_vectors_learnt)

X_train,X_test,y_train,y_test=train_test_split(X_skip,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.89      0.92      0.91        92
           1       0.89      0.85      0.87        68

   micro avg       0.89      0.89      0.89       160
   macro avg       0.89      0.89      0.89       160
weighted avg       0.89      0.89      0.89       160





              precision    recall  f1-score   support

           0       0.83      0.92      0.87        92
           1       0.88      0.74      0.80        68

   micro avg       0.84      0.84      0.84       160
   macro avg       0.85      0.83      0.84       160
weighted avg       0.85      0.84      0.84       160

              precision    recall  f1-score   support

           0       0.84      0.90      0.87        92
           1       0.85      0.76      0.81        68

   micro avg       0.84      0.84      0.84       160
   macro avg       0.85      0.83      0.84       160
weighted avg       0.84      0.84      0.84       160



### Using just TF-IDF Features for Classification

In [53]:
X_tfidf=final_tf_idf

In [54]:
X_train,X_test,y_train,y_test=train_test_split(X_tfidf,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

In [55]:
lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.78      0.88      0.83        90
           1       0.81      0.69      0.74        70

   micro avg       0.79      0.79      0.79       160
   macro avg       0.80      0.78      0.79       160
weighted avg       0.80      0.79      0.79       160





              precision    recall  f1-score   support

           0       0.71      0.83      0.77        90
           1       0.73      0.57      0.64        70

   micro avg       0.72      0.72      0.72       160
   macro avg       0.72      0.70      0.70       160
weighted avg       0.72      0.72      0.71       160

              precision    recall  f1-score   support

           0       0.83      0.86      0.84        90
           1       0.81      0.77      0.79        70

   micro avg       0.82      0.82      0.82       160
   macro avg       0.82      0.81      0.81       160
weighted avg       0.82      0.82      0.82       160



### Training Doc2Vec

In [56]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews)]

### PV-DM

In [57]:
model_d2v = Doc2Vec(documents,dm=1,vector_size=20, window=5, min_count=1, workers=4)

W0924 15:24:56.224439 12412 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


In [58]:
X_docs=model_d2v.docvecs.vectors_docs

In [59]:
X_train,X_test,y_train,y_test=train_test_split(X_docs,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.54      1.00      0.70        86
           1       0.00      0.00      0.00        74

   micro avg       0.54      0.54      0.54       160
   macro avg       0.27      0.50      0.35       160
weighted avg       0.29      0.54      0.38       160





              precision    recall  f1-score   support

           0       0.54      0.78      0.64        86
           1       0.49      0.24      0.32        74

   micro avg       0.53      0.53      0.53       160
   macro avg       0.52      0.51      0.48       160
weighted avg       0.52      0.53      0.49       160

              precision    recall  f1-score   support

           0       0.56      0.72      0.63        86
           1       0.52      0.35      0.42        74

   micro avg       0.55      0.55      0.55       160
   macro avg       0.54      0.54      0.53       160
weighted avg       0.54      0.55      0.53       160



### PV-DBOW

In [60]:
model_d2v = Doc2Vec(documents,dm=0,vector_size=20, window=5, min_count=1, workers=4)

W0924 15:25:02.013295 12412 base_any2vec.py:686] under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay


In [61]:
X_docs=model_d2v.docvecs.vectors_docs

In [62]:
X_train,X_test,y_train,y_test=train_test_split(X_docs,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)


lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.56      1.00      0.71        89
           1       0.00      0.00      0.00        71

   micro avg       0.56      0.56      0.56       160
   macro avg       0.28      0.50      0.36       160
weighted avg       0.31      0.56      0.40       160





              precision    recall  f1-score   support

           0       0.55      0.74      0.63        89
           1       0.42      0.24      0.31        71

   micro avg       0.52      0.52      0.52       160
   macro avg       0.49      0.49      0.47       160
weighted avg       0.49      0.52      0.49       160

              precision    recall  f1-score   support

           0       0.56      0.64      0.60        89
           1       0.45      0.37      0.40        71

   micro avg       0.52      0.52      0.52       160
   macro avg       0.50      0.50      0.50       160
weighted avg       0.51      0.52      0.51       160



### Elmo Embeddings

In [20]:
import tensorflow_hub as hub

In [21]:
tf.compat.v1.disable_eager_execution()
sess = tf.compat.v1.Session()
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
def elmo_vectors(x):
    embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(tf.compat.v1.tables_initializer())
        # return average of ELMo features
        return sess.run(tf.reduce_mean(embeddings,1))

In [176]:
words=list(tokenizer.index_word.values())

In [130]:
#saveData('Words.pkl',words)

In [63]:
words=loadData('Words.pkl')

In [26]:
elmo_word_embs=[]

In [27]:
b_s=32
for i in range(177,208):
    elmo_word_embs.append(elmo_vectors(words[i*b_s:i*b_s+b_s]))
    print(i)

177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207


In [31]:
elmo_word_embs=np.concatenate(elmo_word_embs)

In [45]:
#saveData('ElmoWordEmb.pkl',word_embs_full)

In [66]:
word_embs_full=loadData('ElmoWordEmb.pkl')

In [67]:
texts=[" ".join(i) for i in reviews]

In [68]:
word_embs_full.shape

(6646, 1024)

In [69]:
word_embs_full[words.index('hi')]

array([-0.47999403, -0.00448739, -0.13489588, ..., -0.2204245 ,
       -0.19793135,  0.01283459], dtype=float32)

### Averaging ELMo word representations

In [70]:
final_tf_idf

<1000x6646 sparse matrix of type '<class 'numpy.float64'>'
	with 30365 stored elements in Compressed Sparse Row format>

In [71]:
X_g=[]
for r in reviews:
    num_words=len(r)
    sum_words=0
    for w in r:
        if w != '-PRON-':
            sum_words+=word_embs_full[words.index(w)]
    X_g.append(sum_words/num_words) 
X_g=np.array(X_g)
X_train,X_test,y_train,y_test=train_test_split(X_g,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

In [72]:
lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.74      0.77      0.75        81
           1       0.75      0.72      0.74        79

   micro avg       0.74      0.74      0.74       160
   macro avg       0.74      0.74      0.74       160
weighted avg       0.74      0.74      0.74       160





              precision    recall  f1-score   support

           0       0.62      0.84      0.71        81
           1       0.74      0.47      0.57        79

   micro avg       0.66      0.66      0.66       160
   macro avg       0.68      0.65      0.64       160
weighted avg       0.68      0.66      0.64       160

              precision    recall  f1-score   support

           0       0.75      0.81      0.78        81
           1       0.79      0.72      0.75        79

   micro avg       0.77      0.77      0.77       160
   macro avg       0.77      0.77      0.77       160
weighted avg       0.77      0.77      0.77       160



### Combining ELMo word representations with TF-IDF

In [73]:
tfidf_sent_vectors_elmo= [] # the tfidf-sk for each sentence/review is stored in this list
row=0
errors=0
for sent in reviews: # for each review/sentence
    sent_vec = np.zeros(1024) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = word_embs_full[words.index(word)]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tfidf)
            weight_sum += tfidf
        except:
            errors =+1
            pass
    sent_vec /= weight_sum
    #print(np.isnan(np.sum(sent_vec)))

    tfidf_sent_vectors_elmo.append(sent_vec)
    row += 1
print('errors noted: '+str(errors))

errors noted: 1


In [74]:
X_skip=np.array(tfidf_sent_vectors_elmo)
X_train,X_test,y_train,y_test=train_test_split(X_skip,y,test_size=0.2)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)

lreg = LogisticRegression()
lreg.fit(X_train,y_train)
preds_valid = lreg.predict(X_val)
print(classification_report(y_val,preds_valid))

rf=RandomForestClassifier()
rf.fit(X_train,y_train)
preds=rf.predict(X_val)
print(classification_report(y_val,preds))

xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds=xgb.predict(X_val)
print(classification_report(y_val,preds))



              precision    recall  f1-score   support

           0       0.72      0.84      0.78        77
           1       0.83      0.70      0.76        83

   micro avg       0.77      0.77      0.77       160
   macro avg       0.78      0.77      0.77       160
weighted avg       0.78      0.77      0.77       160





              precision    recall  f1-score   support

           0       0.56      0.75      0.64        77
           1       0.67      0.46      0.54        83

   micro avg       0.60      0.60      0.60       160
   macro avg       0.61      0.61      0.59       160
weighted avg       0.62      0.60      0.59       160

              precision    recall  f1-score   support

           0       0.67      0.82      0.74        77
           1       0.79      0.63      0.70        83

   micro avg       0.72      0.72      0.72       160
   macro avg       0.73      0.72      0.72       160
weighted avg       0.73      0.72      0.72       160



### ELMo Doc Representations

In [75]:
elm_emb=loadData('ElmoEmbeddings.pkl')
from sklearn.model_selection import train_test_split
xtrain, xvalid, ytrain, yvalid = train_test_split(elm_emb, y, random_state=42, test_size=0.2)

In [76]:
lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)
preds_valid = lreg.predict(xvalid)
print(classification_report(yvalid,preds_valid))

xgb=XGBClassifier()
xgb.fit(xtrain,ytrain)
preds=xgb.predict(xvalid)
print(classification_report(yvalid,preds))

rf=RandomForestClassifier()
rf.fit(xtrain,ytrain)
preds=rf.predict(xvalid)
print(classification_report(yvalid,preds))



              precision    recall  f1-score   support

           0       0.88      0.90      0.89       115
           1       0.86      0.84      0.85        85

   micro avg       0.87      0.87      0.87       200
   macro avg       0.87      0.87      0.87       200
weighted avg       0.87      0.87      0.87       200

              precision    recall  f1-score   support

           0       0.87      0.82      0.84       115
           1       0.77      0.84      0.80        85

   micro avg       0.82      0.82      0.82       200
   macro avg       0.82      0.83      0.82       200
weighted avg       0.83      0.82      0.83       200





              precision    recall  f1-score   support

           0       0.72      0.79      0.76       115
           1       0.68      0.59      0.63        85

   micro avg       0.70      0.70      0.70       200
   macro avg       0.70      0.69      0.69       200
weighted avg       0.70      0.70      0.70       200



### Bert Doc Representations
Requires Pytorch and pytorch BERT model.

#### Text Preprocessing for BERT

In [77]:
# !pip install pytorch-pretrained-bert

In [78]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
X=df.review[:1000].apply(str.replace,args=('.','. [SEP] ')).values
marked_X=[]
for x in X:
    marked_X.append('[CLS] '+x+ (' [SEP]' if x[-6:]!='[SEP] ' else " "))
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenized_texts=[]
for x in marked_X:
    tokenized_text = tokenizer.tokenize(x)
    tokenized_texts.append(tokenized_text)

tokenized_indexed_texts=[]
for x in tokenized_texts:
    indexed_tokens = tokenizer.convert_tokens_to_ids(x)
    tokenized_indexed_texts.append(indexed_tokens)

In [79]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

#### Document Embedding using average of the second last Layer

In [None]:
doc_embeddings=[]
ctr=0
for t in np.array(tokenized_indexed_texts):
    ctr+=1
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([t])
    segments_tensors = torch.tensor([[1]*len(t)])
    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    sentence_embedding = torch.mean(encoded_layers[11], 1)
    doc_embeddings.append(np.array(sentence_embedding.tolist()))
    if ctr%30==0:
        print(ctr)
#saveData('BertEmbWOPad.pkl',doc_embeddings)

In [80]:
doc_embeddings=loadData('BertEmbWOPad.pkl')
xtrain, xvalid, ytrain, yvalid = train_test_split(doc_embeddings, y, random_state=42, test_size=0.2)

lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)
preds_valid = lreg.predict(xvalid)
print(classification_report(yvalid,preds_valid))

rf=RandomForestClassifier()
rf.fit(xtrain, ytrain)
preds=rf.predict(xvalid)
print(classification_report(yvalid,preds))

xgb=XGBClassifier()
xgb.fit(xtrain, ytrain)
preds=xgb.predict(xvalid)
print(classification_report(yvalid,preds))



              precision    recall  f1-score   support

           0       0.89      0.93      0.91       115
           1       0.90      0.85      0.87        85

   micro avg       0.90      0.90      0.90       200
   macro avg       0.90      0.89      0.89       200
weighted avg       0.90      0.90      0.89       200





              precision    recall  f1-score   support

           0       0.82      0.88      0.85       115
           1       0.82      0.74      0.78        85

   micro avg       0.82      0.82      0.82       200
   macro avg       0.82      0.81      0.81       200
weighted avg       0.82      0.82      0.82       200

              precision    recall  f1-score   support

           0       0.88      0.89      0.88       115
           1       0.85      0.84      0.84        85

   micro avg       0.86      0.86      0.86       200
   macro avg       0.86      0.86      0.86       200
weighted avg       0.86      0.86      0.86       200



#### Document Embedding using average of Token Embeddings (formed by concatenating last 4 layers)

In [None]:
doc_token_embeddings=[]
ctr=0
for t in np.array(tokenized_indexed_texts):
    ctr+=1
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([t])
    segments_tensors = torch.tensor([[1]*len(t)])
    # Predict hidden states features for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    token_embeddings = [] 
    # For each token in the sentence...
    for token_i in range(len(t)):
        # Holds 12 layers of hidden states for each token 
        hidden_layers = [] 
        # For each of the 12 layers...
        for layer_i in range(len(encoded_layers)):
            # Lookup the vector for `token_i` in `layer_i`
            vec = encoded_layers[layer_i][batch_i][token_i]
            hidden_layers.append(vec)
        token_embeddings.append(hidden_layers)
    # Stores the token vectors, with shape [22 x 3,072]
    token_vecs_cat = []
    # For each token in the sentence...
    for token in token_embeddings:
        # Concatenate the vectors (that is, append them together) from the last 
        # four layers.
        # Each layer vector is 768 values, so `cat_vec` is length 3,072.
        cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), 0)
        # Use `cat_vec` to represent `token`.
        token_vecs_cat.append(np.array([cat_vec.tolist()]))
    doc_token_embeddings.append(np.concatenate(token_vecs_cat).mean(axis=0))
    if ctr%30==0:
        print(ctr)
saveData('DocTokenWOPadBert.pkl',np.array(doc_token_embeddings))

In [81]:
doc_token_embeddings=loadData('DocTokenWOPadBert.pkl')
xtrain, xvalid, ytrain, yvalid = train_test_split(doc_token_embeddings, y, random_state=42, test_size=0.2)

lreg = LogisticRegression()
lreg.fit(xtrain, ytrain)
preds_valid = lreg.predict(xvalid)
print(classification_report(yvalid,preds_valid))


rf=RandomForestClassifier()
rf.fit(xtrain, ytrain)
preds=rf.predict(xvalid)
print(classification_report(yvalid,preds))

xgb=XGBClassifier()
xgb.fit(xtrain, ytrain)
preds=xgb.predict(xvalid)
print(classification_report(yvalid,preds))



              precision    recall  f1-score   support

           0       0.89      0.93      0.91       115
           1       0.90      0.85      0.87        85

   micro avg       0.90      0.90      0.90       200
   macro avg       0.90      0.89      0.89       200
weighted avg       0.90      0.90      0.89       200





              precision    recall  f1-score   support

           0       0.81      0.86      0.84       115
           1       0.79      0.73      0.76        85

   micro avg       0.81      0.81      0.81       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.81      0.80       200

              precision    recall  f1-score   support

           0       0.92      0.90      0.91       115
           1       0.86      0.89      0.88        85

   micro avg       0.90      0.90      0.90       200
   macro avg       0.89      0.89      0.89       200
weighted avg       0.90      0.90      0.90       200

