In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy 

In [3]:
print(np.__version__)


1.25.2


In [2]:
df=pd.read_csv("./reviews_badminton/data.csv")

In [3]:
df.shape

(8518, 8)

In [4]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside cover was Yonex Ad inside was a cheapest.... Sad to hear this.READ MORE,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in new box. It's not a original yonex product. Don't buy.flipkart platform is chosen to fraud the buyers.READ MORE,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the corks like not as before 3 to 5 years back.. I am using MAVIS 350 for more than 15 years quality of corks was very very good at that times, but now I am not getting the quality corks as like before, rate of corks also too much now, I am very sorry to say like this, but in my experience , my Statment is very true to my knowledgeREAD MORE",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn't understand.. Wat is d advantage of buying dis frm flipkrtREAD MORE,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8518 entries, 0 to 8517
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Reviewer Name    8508 non-null   object 
 1   Review Title     8508 non-null   object 
 2   Place of Review  8468 non-null   object 
 3   Up Votes         8508 non-null   float64
 4   Down Votes       8508 non-null   float64
 5   Month            8053 non-null   object 
 6   Review text      8510 non-null   object 
 7   Ratings          8518 non-null   int64  
dtypes: float64(2), int64(1), object(5)
memory usage: 532.5+ KB


In [6]:
df.describe()

Unnamed: 0,Up Votes,Down Votes,Ratings
count,8508.0,8508.0,8518.0
mean,0.391396,0.121768,4.181028
std,11.613909,3.248022,1.2622
min,0.0,0.0,1.0
25%,0.0,0.0,4.0
50%,0.0,0.0,5.0
75%,0.0,0.0,5.0
max,889.0,219.0,5.0


In [7]:
df.isna().sum()

Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

### Fill  the null values 

In [8]:
df['Up Votes'].fillna(df['Up Votes'].mean(), inplace=True)
df['Down Votes'].fillna(df['Down Votes'].mean(), inplace=True)

In [9]:
df['Reviewer Name'].fillna(df['Reviewer Name'].mode()[0], inplace=True)
df['Review Title'].fillna(df['Review Title'].mode()[0], inplace=True)
df['Place of Review'].fillna(df['Place of Review'].mode()[0], inplace=True)
df['Month'].fillna(df['Month'].mode()[0], inplace=True)


In [10]:
# For the 'Review text' column, you can fill the null values with a string indicating no review was given
df['Review text'].fillna('No review text given', inplace=True)

In [11]:
df.isna().sum()

Reviewer Name      0
Review Title       0
Place of Review    0
Up Votes           0
Down Votes         0
Month              0
Review text        0
Ratings            0
dtype: int64

## Labeling the Data

In [12]:
# we have to create a label named as Sentiment with the help of rating column 
def label_data(rating):
    if rating >= 3:
        return 'positive'
    else:
        return 'negative'

df['Sentiment'] = df['Ratings'].apply(label_data)

In [13]:
df["Sentiment"].value_counts()

positive    7441
negative    1077
Name: Sentiment, dtype: int64

### Mapping of categorical values 

In [14]:
mapping = {'positive': 1, 'negative': 0}
df['Sentiment_num'] = df['Sentiment'].map(mapping)

In [15]:
df.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings,Sentiment,Sentiment_num
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now rising which is a bad sign. 800-850 was an affordable price, especially when we play everyday. So kindly help us out in terms of the price. Thank You.READ MORE",4,positive,1
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside cover was Yonex Ad inside was a cheapest.... Sad to hear this.READ MORE,1,negative,0
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in new box. It's not a original yonex product. Don't buy.flipkart platform is chosen to fraud the buyers.READ MORE,1,negative,0
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,Oct 2020,"Quite O. K. , but nowadays the quality of the corks like not as before 3 to 5 years back.. I am using MAVIS 350 for more than 15 years quality of corks was very very good at that times, but now I am not getting the quality corks as like before, rate of corks also too much now, I am very sorry to say like this, but in my experience , my Statment is very true to my knowledgeREAD MORE",3,positive,1
4,ASHIK P A,Over priced,"Certified Buyer, Bengaluru",147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn't understand.. Wat is d advantage of buying dis frm flipkrtREAD MORE,1,negative,0


### Splitting the data

In [16]:
X=df["Review text"]
y=df["Sentiment_num"]

In [17]:
X.head()

0    Nice product, good quality, but price is now r...
1    They didn't supplied Yonex Mavis 350. Outside ...
2    Worst product. Damaged shuttlecocks packed in ...
3    Quite O. K. , but nowadays  the quality of the...
4    Over pricedJust â?¹620 ..from retailer.I didn'...
Name: Review text, dtype: object

In [18]:
y.head()

0    1
1    0
2    0
3    1
4    0
Name: Sentiment_num, dtype: int64

In [19]:
# Splitting into train and test

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
X_train.shape, X_test.shape,y_train.shape,y_test.shape

((6814,), (1704,), (6814,), (1704,))

### Importing necessary libraries

In [21]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

###  Cleaning of the text data

In [22]:
# 1. Text Cleaning
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    
    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text) 
    
    # Substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # Removing prefixed 'b'
    text = re.sub(r'^b\s+', '', text)
    
    # Converting to Lowercase
    text = text.lower()
    
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

X_train= X_train.apply(clean_text)

# 2. Text Normalization
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text]
    text = ' '.join(text)
    return text

X_train = X_train.apply(lemmatize_text)

In [23]:
X_train.head()

6289                                     nice shuttleread
549     product good received stock 3 4 month befor ma...
4707    excellent service got one day even remote loca...
764                                  good high price read
6861                    2 damaged shuttle 6 satisfiedread
Name: Review text, dtype: object

In [24]:
X_test.head()

7497    Fast deliveryOriginal productReasonable priceR...
5257               working well in out door gameREAD MORE
2571    Its value for money and most importantly it's ...
1084                                    HorribleREAD MORE
856                                         GoogREAD MORE
Name: Review text, dtype: object

## Numerical Feature Extraction
### Bag of Words

In [25]:
vectorizer = CountVectorizer(preprocessor=clean_text, max_features=5000)
X_train_bow = vectorizer.fit_transform(X_train)

In [26]:
print("Total unique words:", len(vectorizer.vocabulary_))

print("Type of train features:", type(X_train_bow))

print("Shape of input data:", X_train_bow.shape)

Total unique words: 2782
Type of train features: <class 'scipy.sparse._csr.csr_matrix'>
Shape of input data: (6814, 2782)


In [27]:
X_train_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
from sys import getsizeof

print(type(X_train_bow))
print(getsizeof(X_train_bow), "Bytes")

<class 'scipy.sparse._csr.csr_matrix'>
48 Bytes


In [29]:
from sys import getsizeof

print(type(X_train_bow.toarray()))
print(getsizeof(X_train_bow.toarray()), "Bytes")

<class 'numpy.ndarray'>
151652512 Bytes


### Preprocess the X_test data 

In [30]:
X_test.head()

7497    Fast deliveryOriginal productReasonable priceR...
5257               working well in out door gameREAD MORE
2571    Its value for money and most importantly it's ...
1084                                    HorribleREAD MORE
856                                         GoogREAD MORE
Name: Review text, dtype: object

In [31]:
X_test = X_test.apply(clean_text)

X_test = X_test.apply(lemmatize_text)

In [32]:
X_test.head()

7497    fast deliveryoriginal productreasonable priceread
5257                           working well door gameread
2571                 value money importantly originalread
1084                                         horribleread
856                                              googread
Name: Review text, dtype: object

In [33]:
X_test_bow = vectorizer.transform(X_test)

In [34]:
X_test_bow

<1704x2782 sparse matrix of type '<class 'numpy.int64'>'
	with 5564 stored elements in Compressed Sparse Row format>

## Logistic Regression 

In [35]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_bow, y_train)

In [36]:
y_test_pred = classifier.predict(X_test_bow)

In [37]:
from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.9219483568075117
              precision    recall  f1-score   support

           0       0.79      0.45      0.58       199
           1       0.93      0.98      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.86      0.72      0.77      1704
weighted avg       0.91      0.92      0.91      1704



## Naive Bayes

In [38]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_bow, y_train)

In [39]:
y_test_pred = nb.predict(X_test_bow)

In [40]:
print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.92018779342723
              precision    recall  f1-score   support

           0       0.74      0.48      0.59       199
           1       0.93      0.98      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.84      0.73      0.77      1704
weighted avg       0.91      0.92      0.91      1704



## SVM

In [41]:
from sklearn import svm

svc=svm.SVC()
svc.fit(X_train_bow,y_train)

In [42]:
y_test_pred=svc.predict(X_test_bow)

In [43]:
y_test[:5]

7497    1
5257    1
2571    1
1084    1
856     1
Name: Sentiment_num, dtype: int64

In [44]:
y_test_pred[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [45]:
print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.909037558685446
              precision    recall  f1-score   support

           0       0.78      0.31      0.44       199
           1       0.92      0.99      0.95      1505

    accuracy                           0.91      1704
   macro avg       0.85      0.65      0.70      1704
weighted avg       0.90      0.91      0.89      1704



In [46]:
import joblib
from joblib import Memory

import os
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
import warnings
warnings.filterwarnings('ignore')

In [47]:
X_train.shape,y_train.shape

((6814,), (6814,))

In [48]:
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', MultinomialNB())
    ], memory=memory),
    'logistic_regression': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    'decision_tree': Pipeline([
        ('vectorization', CountVectorizer()),
        ('classifier', DecisionTreeClassifier())
    ], memory=memory),
    'svc': Pipeline([
    ('vectorization', CountVectorizer()),
    ('classifier', SVC(kernel='linear')),
    ],memory=memory)
}

# Define parameter grid for each algorithm
param_grids = {
    'naive_bayes': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__alpha' : [1, 10]
        }
    ],
    'logistic_regression': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
        }
    ],
    'decision_tree': [
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000],
            'classifier__max_depth': [None, 5, 10]
        }
    ],
    'svc':[
        {
            'vectorization': [CountVectorizer()],
            'vectorization__max_df': [0.5, 0.75, 1.0],
            'vectorization__ngram_range': [(1, 1), (1, 2), (2, 2)],
            'classifier__C': [0.1, 1, 10],
        }
    ]
    
}

# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    %time grid_search.fit(X_train, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test, y_test))

********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: total: 5.73 s
Wall time: 6.7 s
Score on Test Data:  0.92018779342723
********** logistic_regression **********
Fitting 5 folds for each of 36 candidates, totalling 180 fits
CPU times: total: 7min 26s
Wall time: 7min 28s
Score on Test Data:  0.9078638497652582
********** decision_tree **********
Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 23.3 s
Wall time: 23.9 s
Score on Test Data:  0.9125586854460094
********** svc **********
Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU times: total: 3min 27s
Wall time: 3min 30s
Score on Test Data:  0.9213615023474179


In [49]:
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    
    joblib.dump(model, f'best_models/{name}.pkl')
    model = joblib.load(f'best_models/{name}.pkl')
    
    %time y_test_pred = model.predict(X_test)
    print("Accuracy Score", metrics.accuracy_score(y_test, y_test_pred))
    
    print("Model Size:", os.path.getsize(f'best_models/{name}.pkl'), "Bytes")

********** naive_bayes **********
CPU times: total: 15.6 ms
Wall time: 25.1 ms
Accuracy Score 0.92018779342723
Model Size: 160918 Bytes
********** logistic_regression **********
CPU times: total: 15.6 ms
Wall time: 15.6 ms
Accuracy Score 0.9078638497652582
Model Size: 94217 Bytes
********** decision_tree **********
CPU times: total: 15.6 ms
Wall time: 15.5 ms
Accuracy Score 0.9125586854460094
Model Size: 83800 Bytes
********** svc **********
CPU times: total: 156 ms
Wall time: 156 ms
Accuracy Score 0.9213615023474179
Model Size: 180581 Bytes


### TF-IDF

In [50]:
tfidfconverter = TfidfVectorizer()
X_train_idf = tfidfconverter.fit_transform(X_train).toarray()

In [51]:
X_train_idf[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [52]:
X_test_idf=tfidfconverter.transform(X_test).toarray()

### Logistic Regression

In [53]:
classifier.fit(X_train_idf,y_train)

In [54]:
y_pred_idf=classifier.predict(X_test_idf)

In [55]:
print(accuracy_score(y_test, y_pred_idf))

print(classification_report(y_test, y_pred_idf))

0.9184272300469484
              precision    recall  f1-score   support

           0       0.81      0.40      0.53       199
           1       0.93      0.99      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.87      0.69      0.74      1704
weighted avg       0.91      0.92      0.91      1704



### SVM

In [56]:
svc.fit(X_train_idf,y_train)
y_pred_idf=svc.predict(X_test_idf)

print(accuracy_score(y_test, y_pred_idf))

print(classification_report(y_test, y_pred_idf))

0.9184272300469484
              precision    recall  f1-score   support

           0       0.77      0.43      0.55       199
           1       0.93      0.98      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.85      0.71      0.75      1704
weighted avg       0.91      0.92      0.91      1704



### Naive bayes

In [57]:
nb.fit(X_train_idf, y_train)

y_pred_idf=nb.predict(X_test_idf)

print(accuracy_score(y_test, y_pred_idf))

print(classification_report(y_test, y_pred_idf))

0.9019953051643192
              precision    recall  f1-score   support

           0       0.88      0.19      0.31       199
           1       0.90      1.00      0.95      1505

    accuracy                           0.90      1704
   macro avg       0.89      0.59      0.63      1704
weighted avg       0.90      0.90      0.87      1704



In [58]:
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization',  TfidfVectorizer()),
        ('classifier', MultinomialNB())
    ], memory=memory),
    
    'logistic_regression': Pipeline([
        ('vectorization',  TfidfVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    
    'svc': Pipeline([
    ('vectorization',  TfidfVectorizer()),
    ('classifier', SVC(kernel='linear')),
    ],memory=memory)
}

# Define parameter grid for each algorithm
param_grids = {
    'naive_bayes': [
        {
            'vectorization': [ TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__alpha' : [1, 10]
        }
    ],
    'logistic_regression': [
        {
            'vectorization': [ TfidfVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['elasticnet'], 
            'classifier__l1_ratio': [0.4, 0.5, 0.6],
            'classifier__solver': ['saga'],
            'classifier__class_weight': ['balanced']
        }
    ],
    'svc':[
        {
            'vectorization': [ TfidfVectorizer()],
            'vectorization__max_df': [0.5, 0.75, 1.0],
            'vectorization__ngram_range': [(1, 1), (1, 2), (2, 2)],
            'classifier__C': [0.1, 1, 10],
        }
    ]
    
}

# Perform GridSearchCV for each algorithm
best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    %time grid_search.fit(X_train, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test, y_test))

********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: total: 5.81 s
Wall time: 6.87 s
Score on Test Data:  0.9160798122065728
********** logistic_regression **********
Fitting 5 folds for each of 36 candidates, totalling 180 fits
CPU times: total: 5min 37s
Wall time: 5min 38s
Score on Test Data:  0.892018779342723
********** svc **********
Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU times: total: 3min 29s
Wall time: 3min 33s
Score on Test Data:  0.920774647887324


In [59]:
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    model_name=f"{name}_tfidf"
    joblib.dump(model, f'best_models/{model_name}.pkl')
    model = joblib.load(f'best_models/{model_name}.pkl')
    
    %time y_test_pred = model.predict(X_test)
    print("Accuracy Score", metrics.accuracy_score(y_test, y_test_pred))
    
    print("Model Size:", os.path.getsize(f'best_models/{model_name}.pkl'), "Bytes")

********** naive_bayes **********
CPU times: total: 15.6 ms
Wall time: 15.6 ms
Accuracy Score 0.9160798122065728
Model Size: 99578 Bytes
********** logistic_regression **********
CPU times: total: 15.6 ms
Wall time: 15.6 ms
Accuracy Score 0.892018779342723
Model Size: 139133 Bytes
********** svc **********
CPU times: total: 281 ms
Wall time: 281 ms
Accuracy Score 0.920774647887324
Model Size: 819657 Bytes


## Converting Text to Numerical vectors - Word2Vec Representation

Step 1 - Import Word2Vec module from gensim.models


Step 2 - Convert the sentences to the List of Words (i.e. List of Tokens)


Step 3 - Use Word2Vec to learn numerical vectors for each unique words. Word2Vec uses the list of tokens and generate 300Dimensional numerical vector for each unique word.


Step 4 - Convert the word vectors to document vectors.

In [60]:
import gensim 
print(gensim.__version__)

4.3.2


In [61]:
from gensim.models import Word2Vec

In [62]:
X_train.shape

(6814,)

In [63]:
X_train.head()

6289                                     nice shuttleread
549     product good received stock 3 4 month befor ma...
4707    excellent service got one day even remote loca...
764                                  good high price read
6861                    2 damaged shuttle 6 satisfiedread
Name: Review text, dtype: object

In [64]:
# train model
X_train_tokenised_sentences = X_train.apply(lambda sent : sent.split())
model = Word2Vec(list(X_train), vector_size=300, min_count=1)

In [65]:
# Checking the shape of vectors learned by the model

print(model.wv.__getitem__(model.wv.index_to_key).shape)

(48, 300)


In [68]:
def document_vec(doc, keyed_vectors):
    """Remove out-of-vocabulary words. Create document vectors by averaging word vectors."""
    vocab_tokens = [word for word in doc if word in keyed_vectors.index_to_key]
    if len(vocab_tokens) == 0:
        # Return a vector of zeros if no words are in the vocabulary
        return np.zeros(keyed_vectors.vector_size)
    else:
        return np.mean(keyed_vectors.__getitem__(vocab_tokens), axis=0)

In [69]:
from tqdm import tqdm
tqdm.pandas()

X_train_doc_vector= X_train_tokenised_sentences.progress_apply(lambda x : document_vec(x, model.wv))


100%|███████████████████████████████████████████████████████████████████████████| 6814/6814 [00:00<00:00, 11640.62it/s]


In [70]:
X_train_w2v = list(X_train_doc_vector)

In [71]:
from sklearn.preprocessing import MinMaxScaler

def scaling(X):
    scaler = MinMaxScaler()
    return(scaler.fit_transform(X))

In [72]:
X_train_w2v=scaling(X_train_w2v)

In [73]:
X_test_tokenised_sentences = X_test.apply(lambda sent : sent.split())

X_test.head()

7497    fast deliveryoriginal productreasonable priceread
5257                           working well door gameread
2571                 value money importantly originalread
1084                                         horribleread
856                                              googread
Name: Review text, dtype: object

In [75]:
X_test_doc_vector = X_test_tokenised_sentences.progress_apply(lambda x : document_vec(x, model.wv))

100%|███████████████████████████████████████████████████████████████████████████| 1704/1704 [00:00<00:00, 46071.05it/s]


In [76]:
X_test_w2v = list(X_test_doc_vector)

In [77]:
X_test_w2v=scaling(X_test_w2v)

### Logistic Regression

In [78]:
classifier.fit(X_train_w2v,y_train)

y_pred_w2v=classifier.predict(X_test_w2v)

print(accuracy_score(y_test, y_pred_w2v))

print(classification_report(y_test, y_pred_w2v))

0.8832159624413145
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       199
           1       0.88      1.00      0.94      1505

    accuracy                           0.88      1704
   macro avg       0.44      0.50      0.47      1704
weighted avg       0.78      0.88      0.83      1704



### Random Forest 

In [79]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier()

clf.fit(X_train_w2v,y_train)

y_pred_w2v=clf.predict(X_test_w2v)

print(accuracy_score(y_test, y_pred_w2v))

print(classification_report(y_test, y_pred_w2v))

0.8802816901408451
              precision    recall  f1-score   support

           0       0.31      0.02      0.04       199
           1       0.88      0.99      0.94      1505

    accuracy                           0.88      1704
   macro avg       0.60      0.51      0.49      1704
weighted avg       0.82      0.88      0.83      1704



## SVC

In [80]:
svc.fit(X_train_w2v,y_train)
y_pred_w2v=svc.predict(X_test_w2v)

print(accuracy_score(y_test, y_pred_w2v))

print(classification_report(y_test, y_pred_w2v))

0.8832159624413145
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       199
           1       0.88      1.00      0.94      1505

    accuracy                           0.88      1704
   macro avg       0.44      0.50      0.47      1704
weighted avg       0.78      0.88      0.83      1704



In [81]:
X_train.shape

(6814,)

In [82]:
y_train.shape

(6814,)

## Pipeline Creation

In [84]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
import numpy as np


class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, size=100, window=5, min_count=1, workers=4):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model = None

    def fit(self, X, y=None):
        tokenised_sentences = [sent.split() for sent in X]
        self.model = Word2Vec(tokenised_sentences, vector_size=self.size, window=self.window, min_count=self.min_count, workers=self.workers)
        return self

    def transform(self, X):
        tokenised_sentences = [sent.split() for sent in X]
        return np.array([self.document_vector(words) for words in tokenised_sentences])

    def document_vector(self, words):
        vocab_tokens = [word for word in words if word in self.model.wv.index_to_key]
        if len(vocab_tokens) == 0:
            return np.zeros(self.model.vector_size)
        else:
            return np.mean(self.model.wv.__getitem__(vocab_tokens), axis=0)


In [89]:
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    'logistic_regression': Pipeline([
        ('vectorization',  Word2VecVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    

    
    'svc': Pipeline([
        ('vectorization',  Word2VecVectorizer()),
        ('classifier', SVC())
    ], memory=memory)
}

param_grids = {
    'logistic_regression': [
        {
            'vectorization__size': [100, 200],
            'vectorization__window': [5, 10],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l2'], 
            'classifier__solver': ['lbfgs'],
        }
    ],

    
    'svc':[
        {
            'vectorization__size': [100,200],
            'vectorization__window': [5, 10],
            'classifier__C': [0.1, 1, 10],
        }
    ]
}

best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    %time grid_search.fit(X_train, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test, y_test))



********** logistic_regression **********
Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 1min 51s
Wall time: 1min 45s
Score on Test Data:  0.8832159624413145
********** svc **********
Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 9min 25s
Wall time: 11min 23s
Score on Test Data:  0.8832159624413145


In [90]:
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    model_name=f"{name}_w2v"
    joblib.dump(model, f'best_models/{model_name}.pkl')
    model = joblib.load(f'best_models/{model_name}.pkl')
    
    %time y_test_pred = model.predict(X_test)
    print("Accuracy Score", metrics.accuracy_score(y_test, y_test_pred))
    
    print("Model Size:", os.path.getsize(f'best_models/{model_name}.pkl'), "Bytes")

********** logistic_regression **********
CPU times: total: 250 ms
Wall time: 425 ms
Accuracy Score 0.8832159624413145
Model Size: 2341786 Bytes
********** svc **********
CPU times: total: 1.03 s
Wall time: 1.08 s
Accuracy Score 0.8832159624413145
Model Size: 3817702 Bytes


### GloVe Method 

In [91]:
import gensim.downloader as api

print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [92]:
wv=api.load("glove-twitter-50")

In [93]:
X_train.head()

6289                                     nice shuttleread
549     product good received stock 3 4 month befor ma...
4707    excellent service got one day even remote loca...
764                                  good high price read
6861                    2 damaged shuttle 6 satisfiedread
Name: Review text, dtype: object

In [95]:
X_train_pretrained_glove=X_train_tokenised_sentences.progress_apply(lambda x : document_vec(x, wv))

100%|██████████████████████████████████████████████████████████████████████████████| 6814/6814 [05:14<00:00, 21.70it/s]


In [96]:
X_train_glove = list(X_train_pretrained_glove)

In [97]:
X_train_glove=scaling(X_train_glove)

In [98]:
X_test_pretrained_glove = X_test_tokenised_sentences.progress_apply(lambda x : document_vec(x, wv))

100%|██████████████████████████████████████████████████████████████████████████████| 1704/1704 [01:21<00:00, 20.97it/s]


In [99]:
X_test_glove = list(X_test_pretrained_glove)

In [100]:
X_test_glove=scaling(X_test_glove)

### Logistic Regression

In [101]:
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_glove, y_train)

y_test_pred = classifier.predict(X_test_glove)

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.9043427230046949
              precision    recall  f1-score   support

           0       0.63      0.43      0.51       199
           1       0.93      0.97      0.95      1505

    accuracy                           0.90      1704
   macro avg       0.78      0.70      0.73      1704
weighted avg       0.89      0.90      0.90      1704



### Naive Bayes

In [102]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train_glove, y_train)

y_pred_glove = gnb.predict(X_test_glove)

print(accuracy_score(y_test, y_pred_glove))

print(classification_report(y_test, y_pred_glove))


0.585093896713615
              precision    recall  f1-score   support

           0       0.20      0.86      0.33       199
           1       0.97      0.55      0.70      1505

    accuracy                           0.59      1704
   macro avg       0.59      0.71      0.51      1704
weighted avg       0.88      0.59      0.66      1704



### SVM

In [103]:
from sklearn.svm import SVC

svc = SVC()

svc.fit(X_train_glove, y_train)

y_pred_glove = svc.predict(X_test_glove)

print(accuracy_score(y_test, y_pred_glove))

print(classification_report(y_test, y_pred_glove))


0.9131455399061033
              precision    recall  f1-score   support

           0       0.73      0.41      0.52       199
           1       0.93      0.98      0.95      1505

    accuracy                           0.91      1704
   macro avg       0.83      0.69      0.74      1704
weighted avg       0.90      0.91      0.90      1704



In [104]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

class GloVeVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, glove_file='glove.6B.100d.txt'):
        self.glove_file = glove_file
        self.word_vectors = self.load_glove_model()

    def load_glove_model(self):
        word2vec_output_file = self.glove_file + '.word2vec'
        glove2word2vec(self.glove_file, word2vec_output_file)
        return KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.document_vector(doc) for doc in X])

    def document_vector(self, doc):
        words = self.preprocess_text(doc)
        return np.mean([self.word_vectors[w] for w in words if w in self.word_vectors] or [np.zeros(self.word_vectors.vector_size)], axis=0)



In [105]:
cachedir = '.cache'
memory = Memory(location=cachedir, verbose=0)

pipelines = {
    
    'naive_bayes': Pipeline([
        ('vectorization',  Word2VecVectorizer()),
        ('classifier', MultinomialNB())
    ], memory=memory),
    
    'logistic_regression': Pipeline([
        ('vectorization',  Word2VecVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    
    'svc': Pipeline([
        ('vectorization',  Word2VecVectorizer()),
        ('classifier', SVC())
    ], memory=memory)
}

param_grids = {
     'naive_bayes': [
        {
            'vectorization': [ Word2VecVectorizer()],
            'vectorization__max_features' : [1000, 1500, 2000, 5000], 
            'classifier__alpha' : [1, 10]
        }
    ],
    
    'logistic_regression': [
        {
            'vectorization__size': [100, 200],
            'vectorization__window': [5, 10],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l2'], 
            'classifier__solver': ['lbfgs'],
        }
    ],

    
    'svc':[
        {
            'vectorization__size': [100,200],
            'vectorization__window': [5, 10],
            'classifier__C': [0.1, 1, 10],
        }
    ]
}

best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    %time grid_search.fit(X_train, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test, y_test))



********** naive_bayes **********
Fitting 5 folds for each of 8 candidates, totalling 40 fits
CPU times: total: 11.6 s
Wall time: 14.2 s
Score on Test Data:  0.9160798122065728
********** logistic_regression **********
Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 1min 44s
Wall time: 1min 29s
Score on Test Data:  0.8832159624413145
********** svc **********
Fitting 5 folds for each of 12 candidates, totalling 60 fits
CPU times: total: 8min 4s
Wall time: 9min 10s
Score on Test Data:  0.8832159624413145


In [106]:
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    model_name=f"{name}_glove"
    joblib.dump(model, f'best_models/{model_name}.pkl')
    model = joblib.load(f'best_models/{model_name}.pkl')
    
    %time y_test_pred = model.predict(X_test)
    print("Accuracy Score", metrics.accuracy_score(y_test, y_test_pred))
    
    print("Model Size:", os.path.getsize(f'best_models/{model_name}.pkl'), "Bytes")

********** naive_bayes **********
CPU times: total: 62.5 ms
Wall time: 358 ms
Accuracy Score 0.9160798122065728
Model Size: 99578 Bytes
********** logistic_regression **********
CPU times: total: 375 ms
Wall time: 429 ms
Accuracy Score 0.8832159624413145
Model Size: 2341786 Bytes
********** svc **********
CPU times: total: 1.19 s
Wall time: 1.42 s
Accuracy Score 0.8832159624413145
Model Size: 3817814 Bytes


##  BERT for Sentence Vectors

In [107]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

In [108]:
X_train_pretrained_bert = X_train.progress_apply(model.encode)

X_train.head()

100%|██████████████████████████████████████████████████████████████████████████████| 6814/6814 [04:02<00:00, 28.08it/s]


6289                                     nice shuttleread
549     product good received stock 3 4 month befor ma...
4707    excellent service got one day even remote loca...
764                                  good high price read
6861                    2 damaged shuttle 6 satisfiedread
Name: Review text, dtype: object

In [109]:
X_train_bert_pretrained = list(X_train_pretrained_bert)

In [110]:
X_test_pretrained_bert = X_test.progress_apply(model.encode)

100%|██████████████████████████████████████████████████████████████████████████████| 1704/1704 [01:01<00:00, 27.76it/s]


In [112]:
X_test_bert_pretrained = list(X_test_pretrained_bert)

### Logistic Regression 

In [113]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

classifier = LogisticRegression()
classifier.fit(X_train_bert_pretrained, y_train)

y_test_pred = classifier.predict(X_test_bert_pretrained)

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))

0.9196009389671361
              precision    recall  f1-score   support

           0       0.77      0.44      0.56       199
           1       0.93      0.98      0.96      1505

    accuracy                           0.92      1704
   macro avg       0.85      0.71      0.76      1704
weighted avg       0.91      0.92      0.91      1704



### SVM

In [114]:
from sklearn.svm import SVC

svc = SVC()

svc.fit(X_train_bert_pretrained, y_train)

y_test_pred = svc.predict(X_test_bert_pretrained)

print(accuracy_score(y_test, y_test_pred))

print(classification_report(y_test, y_test_pred))


0.9272300469483568
              precision    recall  f1-score   support

           0       0.82      0.49      0.61       199
           1       0.94      0.99      0.96      1505

    accuracy                           0.93      1704
   macro avg       0.88      0.74      0.78      1704
weighted avg       0.92      0.93      0.92      1704



### Naive Bayes 

In [115]:
gnb.fit(X_train_bert_pretrained, y_train)

y_pred=gnb.predict(X_test_bert_pretrained)

print(accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

0.7183098591549296
              precision    recall  f1-score   support

           0       0.26      0.77      0.39       199
           1       0.96      0.71      0.82      1505

    accuracy                           0.72      1704
   macro avg       0.61      0.74      0.60      1704
weighted avg       0.88      0.72      0.77      1704



In [116]:
from sklearn.base import BaseEstimator, TransformerMixin
from sentence_transformers import SentenceTransformer

class BertVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.model = SentenceTransformer(self.model_name)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([self.model.encode(doc) for doc in X])


In [119]:
pipelines = {
    'naive_bayes': Pipeline([
        ('vectorization',  BertVectorizer()),
        ('classifier', GaussianNB())  # MultinomialNB might not work with BERT embeddings
    ], memory=memory),
    'logistic_regression': Pipeline([
        ('vectorization',  BertVectorizer()),
        ('classifier', LogisticRegression())
    ], memory=memory),
    'svc': Pipeline([
        ('vectorization',  BertVectorizer()),
        ('classifier', SVC())
    ], memory=memory)
}

param_grids = {
     'naive_bayes': [
        {
            'vectorization__model_name': ['all-MiniLM-L6-v2'],
            'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]  # example parameter grid for GaussianNB
        }
    ],
    
    'logistic_regression': [
        {
            'vectorization__model_name': ['all-MiniLM-L6-v2'],
            'classifier__C': [0.1, 1, 10], 
            'classifier__penalty': ['l2'], 
            'classifier__solver': ['lbfgs'],
        }
    ],

    
    'svc':[
        {
            'vectorization__model_name': ['all-MiniLM-L6-v2'],
            'classifier__C': [0.1, 1, 10],
        }
    ]
}

best_models = {}

for algo in pipelines.keys():
    print("*"*10, algo, "*"*10)
    grid_search = GridSearchCV(estimator=pipelines[algo], 
                               param_grid=param_grids[algo], 
                               cv=5, 
                               scoring='accuracy', 
                               return_train_score=True,
                               verbose=1
                              )
    
    %time grid_search.fit(X_train, y_train)
    
    best_models[algo] = grid_search.best_estimator_
    
    print('Score on Test Data: ', grid_search.score(X_test, y_test))


********** naive_bayes **********
Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: total: 2h 8min 47s
Wall time: 1h 9min 42s
Score on Test Data:  0.7183098591549296
********** logistic_regression **********
Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: total: 2h 7min 50s
Wall time: 1h 11min 51s
Score on Test Data:  0.9242957746478874
********** svc **********
Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: total: 2h 10min 34s
Wall time: 1h 50min 37s
Score on Test Data:  0.9272300469483568


In [120]:
for name, model in best_models.items():
    print("*"*10, name, "*"*10)
    model_name=f"{name}_bert"
    joblib.dump(model, f'best_models/{model_name}.pkl')
    model = joblib.load(f'best_models/{model_name}.pkl')
    
    %time y_test_pred = model.predict(X_test)
    print("Accuracy Score", metrics.accuracy_score(y_test, y_test_pred))
    
    print("Model Size:", os.path.getsize(f'best_models/{model_name}.pkl'), "Bytes")

********** naive_bayes **********
CPU times: total: 1min 11s
Wall time: 35.9 s
Accuracy Score 0.7183098591549296
Model Size: 91408178 Bytes
********** logistic_regression **********
CPU times: total: 1min 12s
Wall time: 36.8 s
Accuracy Score 0.9242957746478874
Model Size: 91398986 Bytes
********** svc **********
CPU times: total: 1min 13s
Wall time: 37.3 s
Accuracy Score 0.9272300469483568
Model Size: 96650310 Bytes
