# Lesson 3
## Bag of Words

#### Import data and check for missings

In [1]:
import pandas as pd

df = pd.read_csv("amazon_cellphones_multiclass.csv")
df

Unnamed: 0,asin,reviewText,overall
0,B007D6J64K,Probably my favorite cover! Super sassy and ve...,5
1,B007D6J64K,This case protects the phone from damage.,5
2,B007D6J64K,Nice,4
3,B007D6J64K,"this was another of my favorite ones, thanks f...",5
4,B007D6J64K,Decent case but not a lot of protection.,5
...,...,...,...
29995,B0096QI0QK,it is so easy to put on your phone and it prot...,5
29996,B0096QI0QK,Much better quality than I expected for the pr...,5
29997,B0096QI0QK,This is one of the best screen protectors I ha...,4
29998,B0096QI0QK,This kit included a microfiber cloth and soft ...,5


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   asin        30000 non-null  object
 1   reviewText  29988 non-null  object
 2   overall     30000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 703.2+ KB


In [2]:
# missing values

df.isna().sum()

asin           0
reviewText    12
overall        0
dtype: int64

#### Create the binary target variable

In [4]:
# variable overall distribution

df.overall.value_counts()

5    17695
4     5366
3     3144
1     2121
2     1674
Name: overall, dtype: int64

In [2]:
# function to binarize overall

def binary(row):
    if row['overall'] > 3:
        val = 1
    elif row['overall'] < 3:
        val = 0
    else:
        val = -1
    return val

In [3]:
# adding the new variable to the dataset

df['bin_y'] = df.apply(binary, axis=1)
df

Unnamed: 0,asin,reviewText,overall,bin_y
0,B007D6J64K,Probably my favorite cover! Super sassy and ve...,5,1
1,B007D6J64K,This case protects the phone from damage.,5,1
2,B007D6J64K,Nice,4,1
3,B007D6J64K,"this was another of my favorite ones, thanks f...",5,1
4,B007D6J64K,Decent case but not a lot of protection.,5,1
...,...,...,...,...
29995,B0096QI0QK,it is so easy to put on your phone and it prot...,5,1
29996,B0096QI0QK,Much better quality than I expected for the pr...,5,1
29997,B0096QI0QK,This is one of the best screen protectors I ha...,4,1
29998,B0096QI0QK,This kit included a microfiber cloth and soft ...,5,1


#### Remove all NaN and split in explicative and dependent: X and y

In [4]:
df_not_na = df[~(df['reviewText'].isna()) & ~(df['bin_y']==-1)]

In [5]:
text_0 = df_not_na['reviewText']

In [9]:
type(text_0)

pandas.core.series.Series

In [10]:
text_0[:5]

0    Probably my favorite cover! Super sassy and ve...
1            This case protects the phone from damage.
2                                                 Nice
3    this was another of my favorite ones, thanks f...
4             Decent case but not a lot of protection.
Name: reviewText, dtype: object

In [11]:
text_0[5]

'This case is so cute the only problem I had with it due to the texture of the case it was hard to get in and out of my pockets'

In [6]:
y = df_not_na['bin_y'].tolist()

In [13]:
type(y)

list

In [14]:
y[:5]

[1, 1, 1, 1, 1]

#### Preprocessing: lowercase, remove punctuation, tokenize, lemmatization

In [15]:
from tqdm import tqdm

# Instantly make your loops show a smart progress meter - just wrap any iterable with tqdm(iterable), and you're done!

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

text = text_0.str.lower().str.replace('[^\w\s]',' ') # RegEx = regular expression

text = text.str.split()

# text = text.apply(lambda x: [lemmatizer.lemmatize(word) for sentence in x for word in sentence])
text = text.apply(lambda x: [lemmatizer.lemmatize(sent) for sent in x])

print(text[5])

  text = text_0.str.lower().str.replace('[^\w\s]',' ') # RegEx = regular expression


['this', 'case', 'is', 'so', 'cute', 'the', 'only', 'problem', 'i', 'had', 'with', 'it', 'due', 'to', 'the', 'texture', 'of', 'the', 'case', 'it', 'wa', 'hard', 'to', 'get', 'in', 'and', 'out', 'of', 'my', 'pocket']


##### Lambda functions:   https://www.w3schools.com/python/python_lambda.asp


In [None]:
def myfunc(n):
  return lambda a : a * n

mydoubler = myfunc(2)
mytripler = myfunc(3)

print(mydoubler(11))
print(mytripler(11))

Use lambda functions when an anonymous function is required for a short period of time, inside another function



##### List Comprehension: https://www.w3schools.com/python/python_lists_comprehension.asp

In [None]:
fruits = ["apple", "banana", "cherry", "kiwi", "mango"]

newlist = []
for x in fruits:
  if "a" in x:
    newlist.append(x)

print(newlist)

In [None]:
fruits = ["apple", "banana", "cherry", "kiwi", "mango"]

newlist = [x for x in fruits if "a" in x]

print(newlist)

#### Create ngrams

NLTK stopwords can be found at [this link](https://gist.github.com/sebleier/554280), downloaded, custiomized and imported as a list

We'll need a new library: gensim

In [None]:
conda install -c anaconda gensim

If you are not using Anaconda:

!pip install -U gensim

In [None]:
conda install python-Levenshtein

In [None]:
!pip install python-Levenshtein

In [8]:
from gensim.models.phrases import Phrases
from nltk.corpus import stopwords

stop = stopwords.words('english')
stop.extend(['good', 'many', 'love', 'excellent', 'would'])

bigram = Phrases(text, min_count=5, threshold=0.2, connector_words=stop)
print(bigram[text[5]])

['this', 'case', 'is', 'so', 'cute', 'the', 'only', 'problem', 'i', 'had', 'with', 'it', 'due', 'to', 'the', 'texture_of_the_case', 'it', 'wa_hard', 'to', 'get', 'in', 'and', 'out', 'of', 'my', 'pocket']


In [None]:
help(Phrases)

Threshold parameter:
<img src='img/phrases_threshold.PNG' width='400'>

In [9]:
bigrams = [bigram[item] for item in text]
ngrams = [bigram[item] for item in bigrams]
print(ngrams[5])

['this', 'case', 'is', 'so', 'cute', 'the', 'only', 'problem', 'i', 'had', 'with', 'it', 'due', 'to', 'the', 'texture_of_the_case', 'it', 'wa_hard', 'to', 'get', 'in', 'and', 'out', 'of', 'my', 'pocket']


#### Remove Stopwords

In [10]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
stop.extend(['good', 'bad', 'dont', 'many', 'love', 'excellent', 'would', 'perfect', 'even', 'great'])

print(ngrams[0])
train_sentences = []
for row in ngrams:
    train_sentences.append(' '.join([item for item in row if item not in stop]))
# train_sentences = [' '.join(item) for item in ngrams]
train_sentences[5]

['probably', 'my', 'favorite', 'cover', 'super', 'sassy', 'and', 'very', 'protective', 'i', 'am', 'very', 'abusive', 'of', 'my', 'phone', 'and', 'this', 'case', 'held_up_very_well', 'after', 'a', 'year', 'the', 'color', 'started', 'to', 'wear', 'a', 'bit', 'but', 'it', 'continued', 'to', 'protect_my_phone', 'very', 'well', 'i', 'would', 'buy', 'it', 'again']


'case cute problem due texture_of_the_case wa_hard get pocket'

#### Save data to file

In [11]:
df_not_na['reviewText'] = train_sentences
df_not_na.to_csv('amazon_cellphones_binary.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_not_na['reviewText'] = train_sentences


### Prepare BoW

Bag of Words:
<img src='img/bow.PNG' width='600'>

Term frequency - inverse document frequency:
<img src='img/tfidf.jpeg' width='400'>

In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=1000)
# vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=1000)

X = vectorizer.fit_transform(train_sentences)

feature_names = vectorizer.get_feature_names()

X = X.toarray()
X = np.array(X)
y = np.array(y)

#### Create train/test split

In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

#### Classify data

In [14]:
from tqdm import tqdm
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, recall_score

kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
model = tree.DecisionTreeClassifier(max_leaf_nodes=10, max_depth=5)
# model = LogisticRegression(class_weight=None)
# model = RandomForestClassifier()

cvscores = []
cvrecall = []

for train, test in tqdm(kfold.split(x_train, y_train)):
    model.fit(x_train[train],y_train[train])
    predicted = model.predict(x_train[test])
    scores = accuracy_score(predicted, y_train[test])
    recall = recall_score(predicted, y_train[test])
    cvrecall.append(recall)
    cvscores.append(scores * 100)

print("accuracy: ",cvscores)
print("recall: ",cvrecall)

3it [00:03,  1.19s/it]

accuracy:  [86.90932311621967, 86.41443167305236, 86.66773111927192]
recall:  [0.8769637836943939, 0.872245971719829, 0.8746495134421903]





In [15]:
from sklearn.metrics import classification_report

model.fit(x_train, y_train)
predicted = model.predict(x_test)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.64      0.12      0.20      1138
           1       0.87      0.99      0.93      6916

    accuracy                           0.87      8054
   macro avg       0.75      0.55      0.56      8054
weighted avg       0.84      0.87      0.82      8054



#### Cross validated grid search

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

#cross_validated_grid_search for Random Forest
model = RandomForestClassifier(class_weight='balanced')
param_grid = {'n_estimators': [10, 100],
               'criterion': ['gini', 'entropy'],
               'max_depth': [None, 5, 10],
               'min_samples_split': [2, 10, 20]}

#cross_validated_grid_search for SVC
# model = svm.SVC()
# param_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [1, 10, 100, 1000]}]

grid = GridSearchCV(estimator = model, param_grid = param_grid, cv=3, verbose=2, n_jobs=-1, scoring='f1_weighted')
# Fit the random search model
%time grid_result = grid.fit(x_train, y_train)

#print grid search results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 3 folds for each of 36 candidates, totalling 108 fits
Wall time: 8min 39s
Best: 0.856129 using {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
0.839784 (0.005370) with: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 10}
0.844289 (0.002538) with: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
0.837271 (0.002848) with: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'n_estimators': 10}
0.851750 (0.000699) with: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
0.835758 (0.000493) with: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 20, 'n_estimators': 10}
0.849854 (0.001796) with: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 20, 'n_estimators': 100}
0.805314 (0.024144) with: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 10}
0.825892 (0.008594) with: 

In [17]:
best_model = grid.best_estimator_
predicted = best_model.predict(x_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.55      0.40      0.46      1138
           1       0.91      0.94      0.92      6916

    accuracy                           0.87      8054
   macro avg       0.73      0.67      0.69      8054
weighted avg       0.85      0.87      0.86      8054



In [None]:
# the end