In [1]:
!pip install contractions
!pip install textsearch
!pip install tqdm
import nltk
nltk.download('punkt')

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/f5/2a/ba0a3812e2a1de2cc4ee0ded0bdb750a7cef1631c13c78a4fc4ab042adec/contractions-0.0.21-py2.py3-none-any.whl
Installing collected packages: contractions
Successfully installed contractions-0.0.21
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting Unidecode (from textsearch)
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 4.1MB/s 
[?25hCollecting pyahocorasick (from textsearch)
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 52.1MB/s 
[?25h

True

# Load and View Dataset

In [2]:
import pandas as pd
train_df=pd.read_csv('/content/review_train.csv')
test_df=pd.read_csv('/content/review_test.csv')
train_df["source"] = "train"
test_df["source"] = "test"
dataset = pd.concat([train_df,test_df])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18532 entries, 0 to 3706
Data columns (total 4 columns):
Text         18532 non-null object
Score        18532 non-null int64
Sentiment    18532 non-null int64
source       18532 non-null object
dtypes: int64(2), object(2)
memory usage: 723.9+ KB


In [3]:
dataset.head()

Unnamed: 0,Text,Score,Sentiment,source
0,"I got a free sample of these once, and now--we...",5,1,train
1,I used to get this Tea when I lived in Washing...,4,1,train
2,This is my all time favorite 'grab and go' sna...,5,1,train
3,This flavor is very good and unexpected. The ...,4,1,train
4,thrilled to have this assortment as i got the ...,4,1,train


# Build Train and Test Datasets

In [0]:
# build train and test datasets
texts = dataset['Text'].values
sentiments = dataset['Sentiment'].values
scores = dataset['Score'].values


train_texts = texts[:17000]
train_sentiments = sentiments[:17000]
train_scores = scores[:17000]


test_texts = texts[1532:]
test_sentiments = sentiments[1532:]
test_scores = scores[1532:]

# Text Wrangling & Normalization

In [0]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
  soup = BeautifulSoup(text, "html.parser")
  [s.extract() for s in soup(['iframe', 'script'])]
  stripped_text = soup.get_text()
  stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
  return stripped_text

def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

def pre_process_corpus(docs):
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(doc.maketrans("\n\t\r", "   "))
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    doc = contractions.fix(doc)
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()  
    norm_docs.append(doc)
  
  return norm_docs

In [23]:
%%time

norm_train_reviews = pre_process_corpus(train_texts)
norm_test_reviews = pre_process_corpus(test_texts)

100%|██████████| 17000/17000 [00:04<00:00, 3836.83it/s]
100%|██████████| 17000/17000 [00:04<00:00, 3822.97it/s]

CPU times: user 8.76 s, sys: 99.7 ms, total: 8.86 s
Wall time: 8.89 s





In [24]:
norm_test_reviews

['i have 3 subscriptions to 3 different kinds of coffeei like them allthe coffee is always fresh andwith amazon shipping for freea bargain',
 'she my cat is happy with this stuff and the price was good for my budget so that makes two of us',
 'if you are a dirty martini drinker then this is the best olive juice to use we like them really cold and really dirtylove the dirty sue well packed and always arrives in good time',
 'for years the only lus i could find at my local market were the usa lus which are not as good as the belgium biscuits so i was happy to find them at amazon this past july 2012 and bought the case of six boxes unfortunately they did not taste fresh and had an expiration date of december 2012 that explained the stale taste but i chose not to return them instead i nuke two in the microwave for 5 or 6 seconds and it sort of revives the flavorwell last week sept 14 2012 i found the belgium lu at my local supermarket and bought a single box that box expires december 2013 

# Traditional Supervised Machine Learning Models

## Feature Engineering

In [25]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)


# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

CPU times: user 6.92 s, sys: 58.1 ms, total: 6.98 s
Wall time: 6.99 s


In [26]:
%%time

# transform test reviews into features
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

CPU times: user 4.25 s, sys: 4.1 ms, total: 4.25 s
Wall time: 4.27 s


In [27]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (17000, 41090)  Test features shape: (17000, 41090)
TFIDF model:> Train features shape: (17000, 41090)  Test features shape: (17000, 41090)


## Model Training, Prediction and Performance Evaluation

### Try out Logistic Regression

The logistic regression model is actually a statistical model developed by statistician
David Cox in 1958. It is also known as the logit or logistic model since it uses the
logistic (popularly also known as sigmoid) mathematical function to estimate the
parameter values. These are the coefficients of all our features such that the overall loss
is minimized when predicting the outcome—

In [28]:
%%time

# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)

# train model
lr.fit(cv_train_features, train_sentiments)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)

CPU times: user 5.57 s, sys: 3.93 s, total: 9.5 s
Wall time: 4.85 s


In [29]:
from sklearn.metrics import confusion_matrix, classification_report

labels = ['negative', 'positive']
print(classification_report(test_sentiments, lr_bow_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, lr_bow_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2652
           1       0.99      1.00      1.00     14348

    accuracy                           0.99     17000
   macro avg       0.99      0.98      0.99     17000
weighted avg       0.99      0.99      0.99     17000



Unnamed: 0,negative,positive
negative,2564,88
positive,32,14316


In [30]:
%%time

# Logistic Regression model on TF-IDF features

# train model
lr.fit(tv_train_features, train_sentiments)

# predict on test data
lr_tfidf_predictions = lr.predict(tv_test_features)

CPU times: user 845 ms, sys: 567 ms, total: 1.41 s
Wall time: 738 ms


In [31]:
labels = ['negative', 'positive']
print(classification_report(test_sentiments, lr_tfidf_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, lr_tfidf_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.97      0.52      0.68      2652
           1       0.92      1.00      0.96     14348

    accuracy                           0.92     17000
   macro avg       0.95      0.76      0.82     17000
weighted avg       0.93      0.92      0.91     17000



Unnamed: 0,negative,positive
negative,1389,1263
positive,41,14307


### Try out Random Forest

Decision trees are a family of supervised machine learning algorithms that can represent
and interpret sets of rules automatically from the underlying data. They use metrics like
information gain and gini-index to build the tree. However, a major drawback of decision
trees is that since they are non-parametric, the more data there is, greater the depth of
the tree. We can end up with really huge and deep trees that are prone to overfitting. The
model might work really well on training data, but instead of learning, it just memorizes
all the training samples and builds very specific rules to them. Hence, it performs really
poorly on the test data. Random forests try to tackle this problem.

A random forest is a meta-estimator or an ensemble model that fits a number of
decision tree classifiers on various sub-samples of the dataset and uses averaging to
improve the predictive accuracy and control over-fitting. The sub-sample size is always
the same as the original input sample size, but the samples are drawn with replacement
(bootstrap samples). In random forests, all the trees are trained in parallel (bagging
model/bootstrap aggregation). Besides this, each tree in the ensemble is built from a
sample drawn with replacement (i.e., a bootstrap sample) from the training set. Also,
when splitting a node during the construction of the tree, the split that is chosen is no
longer the best split among all features. Instead, the split that is picked is the best split
among a random subset of the features. T

In [32]:
%%time

# Random Forest model on BOW features
from sklearn.ensemble import RandomForestClassifier

# instantiate model
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# train model
rf.fit(cv_train_features, train_sentiments)

# predict on test data
rf_bow_predictions = rf.predict(cv_test_features)

CPU times: user 32.8 s, sys: 42.6 ms, total: 32.9 s
Wall time: 17 s


In [33]:
labels = ['negative', 'positive']
print(classification_report(test_sentiments, rf_bow_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, rf_bow_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      2652
           1       0.99      1.00      0.99     14348

    accuracy                           0.99     17000
   macro avg       0.99      0.96      0.98     17000
weighted avg       0.99      0.99      0.99     17000



Unnamed: 0,negative,positive
negative,2445,207
positive,1,14347


In [34]:
%%time

# Random Forest model on TF-IDF features

# train model
rf.fit(tv_train_features, train_sentiments)

# predict on test data
rf_tfidf_predictions = rf.predict(tv_test_features)

CPU times: user 34.8 s, sys: 51.3 ms, total: 34.9 s
Wall time: 18 s


In [36]:
labels = ['negative', 'positive']
print(classification_report(test_sentiments, rf_tfidf_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, rf_tfidf_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       1.00      0.92      0.96      2652
           1       0.99      1.00      0.99     14348

    accuracy                           0.99     17000
   macro avg       0.99      0.96      0.98     17000
weighted avg       0.99      0.99      0.99     17000



Unnamed: 0,negative,positive
negative,2447,205
positive,0,14348


# Newer Supervised Deep Learning Models

In [0]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder

## Prediction class label encoding

In [0]:
le = LabelEncoder()
# tokenize train reviews & encode train labels
tokenized_train = [nltk.word_tokenize(text)
                       for text in norm_train_reviews]
y_train = le.fit_transform(train_sentiments)
# tokenize test reviews & encode test labels
tokenized_test = [nltk.word_tokenize(text)
                       for text in norm_test_reviews]
y_test = le.fit_transform(test_sentiments)

In [39]:
# print class label encoding map and encoded labels
print('Sentiment class label map:', dict(zip(le.classes_, le.transform(le.classes_))))
print('Sample test label transformation:\n'+'-'*35,
      '\nActual Labels:', test_sentiments[:3], '\nEncoded Labels:', y_test[:3])

Sentiment class label map: {0: 0, 1: 1}
Sample test label transformation:
----------------------------------- 
Actual Labels: [1 1 1] 
Encoded Labels: [1 1 1]


## Feature Engineering with word embeddings

In [0]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [41]:
%%time
# build word2vec model
w2v_num_features = 300
w2v_model = gensim.models.Word2Vec(tokenized_train, size=w2v_num_features, window=150,
                                   min_count=10, workers=4, iter=5)    

2019-09-16 14:07:43,035 : INFO : collecting all words and their counts
2019-09-16 14:07:43,036 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-09-16 14:07:43,201 : INFO : PROGRESS: at sentence #10000, processed 786775 words, keeping 29049 word types
2019-09-16 14:07:43,314 : INFO : collected 39884 word types from a corpus of 1339532 raw words and 17000 sentences
2019-09-16 14:07:43,316 : INFO : Loading a fresh vocabulary
2019-09-16 14:07:43,355 : INFO : effective_min_count=10 retains 5534 unique words (13% of original 39884, drops 34350)
2019-09-16 14:07:43,357 : INFO : effective_min_count=10 leaves 1275473 word corpus (95% of original 1339532, drops 64059)
2019-09-16 14:07:43,379 : INFO : deleting the raw counts dictionary of 39884 items
2019-09-16 14:07:43,381 : INFO : sample=0.001 downsamples 57 most-common words
2019-09-16 14:07:43,382 : INFO : downsampling leaves estimated 912293 word corpus (71.5% of prior 1275473)
2019-09-16 14:07:43,399 : INFO : 

CPU times: user 1min 25s, sys: 160 ms, total: 1min 25s
Wall time: 43.9 s


In [0]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [0]:
# generate averaged word vector features from word2vec model
avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,
                                                     num_features=w2v_num_features)
avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,
                                                    num_features=w2v_num_features)

In [44]:
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)

Word2Vec model:> Train features shape: (17000, 300)  Test features shape: (17000, 300)


## Modeling with deep neural networks 

### Building Deep neural network architecture

In [0]:
def construct_deepnn_architecture(num_input_features):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, input_shape=(num_input_features,)))
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256))
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256))
    dnn_model.add(Activation('relu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(1))
    dnn_model.add(Activation('sigmoid'))

    dnn_model.compile(loss='binary_crossentropy', optimizer='adam',                 
                      metrics=['accuracy'])
    return dnn_model

In [46]:
w2v_dnn = construct_deepnn_architecture(num_input_features=w2v_num_features)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


### Visualize sample deep architecture

In [47]:
w2v_dnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               154112    
_________________________________________________________________
activation (Activation)      (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               6

### Model Training, Prediction and Performance Evaluation

In [48]:
batch_size = 100
w2v_dnn.fit(avg_wv_train_features, y_train, epochs=10, batch_size=batch_size, 
            shuffle=True, validation_split=0.1, verbose=1)

Train on 15300 samples, validate on 1700 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fa4f8c01cc0>

In [49]:
y_pred = w2v_dnn.predict_classes(avg_wv_test_features)
predictions = le.inverse_transform(y_pred) 

  y = column_or_1d(y, warn=True)


In [51]:
labels = ['negative', 'positive']
print(classification_report(test_sentiments, predictions))
pd.DataFrame(confusion_matrix(test_sentiments, predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

           0       0.76      0.57      0.65      2652
           1       0.92      0.97      0.94     14348

    accuracy                           0.90     17000
   macro avg       0.84      0.77      0.80     17000
weighted avg       0.90      0.90      0.90     17000



Unnamed: 0,negative,positive
negative,1511,1141
positive,482,13866
