In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import re
# import mglearn as mglearn

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('train2electricboogaloo.csv')

train.Summary.fillna('', inplace=True)
train.Text.fillna('', inplace=True)
train.SumTxt.fillna('', inplace=True)

In [3]:
train_set, test_set = train_test_split(train, test_size = 0.2, random_state = 42, stratify = train['Score'])
train_text, train_score = train_set['SumTxt'], train_set['Score']
test_text, test_score = test_set['SumTxt'], test_set['Score']

In [4]:
text, score = train['SumTxt'], train['Score']

In [5]:
stemmer = SnowballStemmer("english")
lem = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [6]:
contractions_dict = { 
    "ain\'t": "am not",
    "aren\'t": "are not",
    "can\'t": "cannot",
    "can\'t\'ve": "cannot have",
    "\'cause": "because",
    "could\'ve": "could have",
    "couldn\'t": "could not",
    "couldn\'t\'ve": "could not have",
    "didn\'t": "did not",
    "doesn\'t": "does not",
    "don\'t": "do not",
    "hadn\'t": "had not",
    "hadn\'t\'ve": "had not have",
    "hasn\'t": "has not",
    "haven\'t": "have not",
    "he\'d": "he had",
    "he\'d\'ve": "he would have",
    "he\'ll": "he shall",
    "he\'ll\'ve": "he shall have",
    "he\'s": "he has",
    "how\'d": "how did",
    "how\'d\'y": "how do you",
    "how\'ll": "how will",
    "how\'s": "how has",
    "I\'d": "I had",
    "I\'d\'ve": "I would have",
    "I\'ll": "I will",
    "I\'ll\'ve": "I will have",
    "I\'m": "I am",
    "I\'ve": "I have",
    "isn\'t": "is not",
    "it\'d": "it would",
    "it\'d\'ve": "it would have",
    "it\'ll": "it will",
    "it\'ll\'ve": "it will have",
    "it\'s": "it is",
    "let\'s": "let us",
    "ma\'am": "madam",
    "mayn\'t": "may not",
    "might\'ve": "might have",
    "mightn\'t": "might not",
    "mightn\'t\'ve": "might not have",
    "must\'ve": "must have",
    "mustn\'t": "must not",
    "mustn\'t\'ve": "must not have",
    "needn\'t": "need not",
    "needn\'t\'ve": "need not have",
    "o\'clock": "of the clock",
    "oughtn\'t": "ought not",
    "oughtn\'t\'ve": "ought not have",
    "shan\'t": "shall not",
    "sha\'n\'t": "shall not",
    "shan\'t\'ve": "shall not have",
    "she\'d": "she had",
    "she\'d\'ve": "she would have",
    "she\'ll": "she will",
    "she\'ll\'ve": "she will have",
    "she\'s": "she is",
    "should\'ve": "should have",
    "shouldn\'t": "should not",
    "shouldn\'t\'ve": "should not have",
    "so\'ve": "so have",
    "so\'s": "so as",
    "that\'d": "that would",
    "that\'d\'ve": "that would have",
    "that\'s": "that is",
    "there\'d": "there had",
    "there\'d\'ve": "there would have",
    "there\'s": "there has",
    "they\'d": "they had",
    "they\'d\'ve": "they would have",
    "they\'ll": "they will",
    "they\'ll\'ve": "they will have",
    "they\'re": "they are",
    "they\'ve": "they have",
    "to\'ve": "to have",
    "wasn\'t": "was not",
    "we\'d": "we had",
    "we\'d\'ve": "we would have",
    "we\'ll": "we will",
    "we\'ll\'ve": "we will have",
    "we\'re": "we are",
    "we\'ve": "we have",
    "weren\'t": "were not",
    "what\'ll": "what will",
    "what\'ll\'ve": "what will have",
    "what\'re": "what are",
    "what\'s": "what is",
    "what\'ve": "what have",
    "when\'s": "when is",
    "when\'ve": "when have",
    "where\'d": "where did",
    "where\'s": "where has",
    "where\'ve": "where have",
    "who\'ll": "who will",
    "who\'ll\'ve": "who will have",
    "who\'s": "who is",
    "who\'ve": "who have",
    "why\'s": "why is",
    "why\'ve": "why have",
    "will\'ve": "will have",
    "won\'t": "will not",
    "won\'t\'ve": "will not have",
    "would\'ve": "would have",
    "wouldn\'t": "would not",
    "wouldn\'t\'ve": "would not have",
    "y\'all": "you all",
    "y\'all\'d": "you all would",
    "y\'all\'d\'ve": "you all would have",
    "y\'all\'re": "you all are",
    "y\'all\'ve": "you all have",
    "you\'d": "you would",
    "you\'d\'ve": "you would have",
    "you\'ll": "you will",
    "you\'ll\'ve": "you will have",
    "you\'re": "you are",
    "you\'ve": "you have"
}

In [7]:
def decontracted(phrase):
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [8]:
def expand_contractions(s, contractions_dict=contractions_dict):
    x = [contractions_dict[con] if con in contractions_dict else con for con in s.split()]
    x = ' '.join(x)
    return decontracted(x)

In [9]:
def remove_digits(mess):
    return ''.join([i for i in mess if not i.isdigit()])

In [10]:
def stm(mess):
    return [stemmer.stem(word) for word in mess]

In [11]:
def wnl(mess):
    return [lem.lemmatize(word) for word in mess]

In [12]:
def swr(mess):
    return ' '.join([word for word in mess.split() if word.lower() not in stop_words])

In [13]:
def punc(mess):
    return mess.translate(str.maketrans('', '', string.punctuation))

In [14]:
def check_char(mess):
    return [word for word in mess.split() if len(word) != 1]

In [15]:
def css(mess):
    mess = remove_digits(mess) 
    mess = expand_contractions(mess)
    mess = swr(mess)
    mess = punc(mess)
    mess = check_char(mess)
    return stm(mess)

In [16]:
def csl(mess):
    mess = remove_digits(mess) 
    mess = expand_contractions(mess)
    mess = swr(mess)
    mess = punc(mess)
    mess = check_char(mess)
    return wnl(mess)

In [17]:
def nss(mess):
    mess = remove_digits(mess) 
    mess = swr(mess)
    mess = punc(mess)
    mess = check_char(mess)
    return stm(mess)

In [18]:
def nsl(mess):
    mess = remove_digits(mess) 
    mess = swr(mess)
    mess = punc(mess)
    mess = check_char(mess)
    return wnl(mess)

In [19]:
def cns(mess):
    mess = remove_digits(mess) 
    mess = expand_contractions(mess)
    mess = punc(mess)
    mess = check_char(mess)
    return stm(mess)

In [20]:
def cnl(mess):
    mess = remove_digits(mess) 
    mess = expand_contractions(mess)
    mess = punc(mess)
    mess = check_char(mess)
    return wnl(mess)

In [21]:
def nns(mess):
    mess = remove_digits(mess) 
    mess = punc(mess)
    mess = check_char(mess)
    return stm(mess)

In [22]:
def nnl(mess):
    mess = remove_digits(mess) 
    mess = punc(mess)
    mess = check_char(mess)
    return wnl(mess)

In [23]:
l = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer = nsl)),
    ('log', LogisticRegression())
])

m = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer = nsl)),
    ('mnb', MultinomialNB())
])

s = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer = nsl)),
    ('svm', LinearSVC())
])

In [24]:
param_grid_l = {#'tfidf__analyzer': [css, csl, nss, nsl, nns, nnl, cns, cnl],
              'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
              'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
              'tfidf__use_idf': [True, False],
              'log__C': [1, 10, 100],
#               'log__fit_intercept': [True, False]
             }

param_grid_m = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
              'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
              'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
              'tfidf__use_idf': [True, False],
              'mnb__alpha': [0.01, 0.5, 1]
             }

param_grid_s = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
              'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
              'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
              'tfidf__use_idf': [True, False],
              'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
             }

In [25]:
grid_l = GridSearchCV(l, param_grid_l, n_jobs = 7)
grid_m = GridSearchCV(m, param_grid_m, n_jobs = 7)
grid_s = GridSearchCV(s, param_grid_s, n_jobs = 7)

In [31]:
grid_s.fit(train_text, train_score)



GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer=<function nsl at 0x7faa760efb90>,
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                            

In [32]:
print(grid_s.best_params_)

{'svm__C': 10, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}


In [33]:
means = grid_s.cv_results_['mean_test_score']
stds = grid_s.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_s.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.639 (+/-0.000) for {'svm__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'svm__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'svm__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.2, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'svm__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.2, 'tfidf__use_idf': False}
0.639 (+/-0.001) for {'svm__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'svm__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'svm__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.2, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'svm__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.2, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'svm__C': 10, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'svm__C': 10, 'tfidf__binary': Tr

In [34]:
pred_s = grid_s.predict(test_text)

In [35]:
print(classification_report(test_score, pred_s))

              precision    recall  f1-score   support

           1       0.37      0.01      0.03      7838
           2       0.00      0.00      0.00      4471
           3       0.00      0.00      0.00      6399
           4       0.27      0.00      0.00     12062
           5       0.64      1.00      0.78     54498

    accuracy                           0.64     85268
   macro avg       0.26      0.20      0.16     85268
weighted avg       0.48      0.64      0.50     85268



  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
grid_l.fit(train_text, train_score)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer=<function nsl at 0x7faa760efb90>,
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                            

In [37]:
print(grid_l.best_params_)

{'log__C': 10, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}


In [38]:
means = grid_l.cv_results_['mean_test_score']
stds = grid_l.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_l.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.640 (+/-0.001) for {'log__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.640 (+/-0.001) for {'log__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'log__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.2, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'log__C': 1, 'tfidf__binary': True, 'tfidf__min_df': 0.2, 'tfidf__use_idf': False}
0.640 (+/-0.001) for {'log__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.640 (+/-0.001) for {'log__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'log__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.2, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'log__C': 1, 'tfidf__binary': False, 'tfidf__min_df': 0.2, 'tfidf__use_idf': False}
0.640 (+/-0.001) for {'log__C': 10, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.640 (+/-0.001) for {'log__C': 10, 'tfidf__binary': Tr

In [39]:
pred_l = grid_l.predict(test_text)

In [40]:
print(classification_report(test_score, pred_l))

              precision    recall  f1-score   support

           1       0.39      0.07      0.11      7838
           2       0.00      0.00      0.00      4471
           3       0.20      0.00      0.00      6399
           4       0.30      0.00      0.01     12062
           5       0.64      0.99      0.78     54498

    accuracy                           0.64     85268
   macro avg       0.31      0.21      0.18     85268
weighted avg       0.51      0.64      0.51     85268



  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
grid_m.fit(train_text, train_score)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer=<function nsl at 0x7faa760efb90>,
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                            

In [27]:
print(grid_m.best_params_)

{'mnb__alpha': 0.01, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}


In [28]:
means = grid_m.cv_results_['mean_test_score']
stds = grid_m.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_m.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': True, 'tfidf__min_df': 0.2, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': True, 'tfidf__min_df': 0.2, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': False, 'tfidf__min_df': 0.2, 'tfidf__use_idf': True}
0.639 (+/-0.000) for {'mnb__alpha': 0.01, 'tfidf__binary': False, 'tfidf__min_df': 0.2, 'tfidf__use_idf': False}
0.639 (+/-0.000) for {'mnb__alpha': 0.5, 'tfidf__binary': True, 'tfidf__min_df': 0.1, 'tfidf__use_idf': 

In [29]:
pred_m = grid_m.predict(test_text)

In [30]:
print(classification_report(test_score, pred_m))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00      7838
           2       0.00      0.00      0.00      4471
           3       0.00      0.00      0.00      6399
           4       0.00      0.00      0.00     12062
           5       0.64      1.00      0.78     54498

    accuracy                           0.64     85268
   macro avg       0.13      0.20      0.16     85268
weighted avg       0.41      0.64      0.50     85268



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# tfidfnew = TfidfVectorizer(strip_accents='unicode', analyzer='word', min_df=0.1, max_df=0.8)

In [41]:
s2 = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', min_df=0.1, max_df=0.8)),
    ('svm', LinearSVC(loss = 'hinge', C = 10))
])

In [42]:
param_grid_s2 = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1,1), (1,2), (2,2)]
             }

In [43]:
grid_s2 = GridSearchCV(s2, param_grid_s2, n_jobs = 6)

In [44]:
grid_s2.fit(train_text, train_score)



GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [45]:
means = grid_s2.cv_results_['mean_test_score']
stds = grid_s2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_s2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.643 (+/-0.004) for {'tfidf__ngram_range': (1, 1)}
0.642 (+/-0.003) for {'tfidf__ngram_range': (1, 2)}
0.639 (+/-0.000) for {'tfidf__ngram_range': (2, 2)}


In [46]:
pred_s2 = grid_s2.predict(test_text)

In [47]:
print(classification_report(test_score, pred_s2))

              precision    recall  f1-score   support

           1       0.47      0.02      0.04      7838
           2       0.26      0.00      0.01      4471
           3       0.24      0.11      0.15      6399
           4       0.19      0.05      0.08     12062
           5       0.68      0.98      0.80     54498

    accuracy                           0.64     85268
   macro avg       0.37      0.23      0.22     85268
weighted avg       0.53      0.64      0.54     85268



In [48]:
print(grid_s2.best_params_)

{'tfidf__ngram_range': (1, 1)}


In [49]:
test_score.head()

235149    5
419454    4
233664    5
295342    5
332103    5
Name: Score, dtype: int64

In [50]:
train_score.head()

9823      4
247438    5
107574    4
37202     1
334756    5
Name: Score, dtype: int64

In [51]:
train.head()

Unnamed: 0,ProdID,NumProdReviews,UserId,NumUserReviews,HelpfulRatio,FoundHelpful,HelpfulVotes,Score,Time,Summary,Text,SumTxt
0,B0034EDLS2,249,AVF82BC7S0IO7,5,0.0,0,0,5,1332806400,Very Good,I received this product early from the seller!...,Very Good I received this product early from t...
1,B001I7HJE4,1,A1YUL9PCJR3JTY,312,1.0,1,1,5,1190160000,"Organic, Kosher, Tasty Assortment of Premium T...",***** Numi's Collection Assortment Melange inc...,"Organic, Kosher, Tasty Assortment of Premium T..."
2,B000LKTB90,44,A1BBPP1EC75JX4,1,0.0,0,0,5,1285977600,"excellent gluten-free spaghetti: great taste, ...","I was very careful not to overcook this pasta,...","excellent gluten-free spaghetti: great taste, ..."
3,B001HXJPS2,3,A5QSI9MNS8NMS,4,0.0,0,0,5,1338163200,Lindt is Lindt,Buying this multi-pack I was misled by the pic...,Lindt is Lindt Buying this multi-pack I was mi...
4,B006H34CUS,251,A20IBAIRSNBEAQ,1,0.0,0,0,5,1343606400,YUM!!!!!,These bars are so good! I loved them warmed up...,YUM!!!!! These bars are so good! I loved them ...


In [63]:
def new_score(score):
    if score == 4 or score == 3 or score == 2 or score == 1:
        return 0
    else:
        return 5

In [64]:
train['score2'] = train.apply(lambda x: new_score(x['Score']), axis = 1)

In [66]:
train.tail()

Unnamed: 0,ProdID,NumProdReviews,UserId,NumUserReviews,HelpfulRatio,FoundHelpful,HelpfulVotes,Score,Time,Summary,Text,SumTxt,score2
426335,B00028LDJ2,5,A28AJSK2CI1XAO,1,1.0,1,1,3,1203379200,i like it,"Like a lot of the gums by Lotte, the flavor do...","i like it Like a lot of the gums by Lotte, the...",0
426336,B000BZZKVS,59,ACHJKSYDTDKMU,2,0.75,3,4,5,1247702400,The Anti-Fatigue,This is a fantastic product. I'm relatively n...,The Anti-Fatigue This is a fantastic product. ...,5
426337,B000U0HJMC,1,A3A7T94SGEKY8A,3,1.0,2,2,5,1231286400,Always the right formula,I trust this brand--the flavors are blended ju...,Always the right formula I trust this brand--t...,5
426338,B000NCW0BC,2,A3B2BB1JFBNAX5,3,0.0,0,0,5,1344124800,Smoked Black Pepper,"This pepper is great! I was buying McCormick, ...",Smoked Black Pepper This pepper is great! I wa...,5
426339,B001D9JC0G,98,A3KZG8XNX5P4HR,5,0.857143,6,7,1,1219104000,Canidae Dog Food made my dogs extremely ill,I have relied on Canidae for my 4 dogs for ove...,Canidae Dog Food made my dogs extremely ill I ...,0


In [67]:
train_set2, test_set2 = train_test_split(train, test_size = 0.2, random_state = 42, stratify = train['score2'])
train_text2, train_score2 = train_set2['SumTxt'], train_set2['score2']
test_text2, test_score2 = test_set2['SumTxt'], test_set2['score2']

In [68]:
grid_s3 = GridSearchCV(s2, param_grid_s2, n_jobs = 6)

In [69]:
grid_s3.fit(train_text2, train_score2)



GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [70]:
print(grid_s3.best_params_)

{'tfidf__ngram_range': (1, 2)}


In [71]:
means = grid_s3.cv_results_['mean_test_score']
stds = grid_s3.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_s3.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.743 (+/-0.001) for {'tfidf__ngram_range': (1, 1)}
0.744 (+/-0.001) for {'tfidf__ngram_range': (1, 2)}
0.639 (+/-0.000) for {'tfidf__ngram_range': (2, 2)}


In [74]:
pred_s3 = grid_s3.predict(test_text2)

In [75]:
print(classification_report(test_score2, pred_s3))

              precision    recall  f1-score   support

           0       0.69      0.55      0.61     30770
           5       0.77      0.86      0.81     54498

    accuracy                           0.75     85268
   macro avg       0.73      0.70      0.71     85268
weighted avg       0.74      0.75      0.74     85268



In [78]:
train2 = train.loc[train['Score'] != 5]
train2.head()

Unnamed: 0,ProdID,NumProdReviews,UserId,NumUserReviews,HelpfulRatio,FoundHelpful,HelpfulVotes,Score,Time,Summary,Text,SumTxt,score2
7,B0001AVRQK,8,A3G38ANYQ3ZYR3,1,1.0,3,3,2,1276387200,Poor taste,I was really disappointed with the Sorghum we ...,Poor taste I was really disappointed with the ...,0
8,B005GV9RZC,2,A263U9SVO11V75,1,0.0,0,0,3,1349308800,Better than US Instant Coffee,A friend who has gone to Korea gave me a coupl...,Better than US Instant Coffee A friend who has...,0
9,B004FEN3GK,210,A10AKE9TAADHVV,1,0.0,0,0,4,1306454400,Hard not to like!,No need for plastic baggies or sloppy tin foil...,Hard not to like! No need for plastic baggies ...,0
12,B000ED7MR2,7,AT4JRHIZNALRS,1,0.75,3,4,2,1162166400,good company--ok product,I don't care for the flour coating on them and...,good company--ok product I don't care for the ...,0
18,B005K4Q37A,405,A11PM0C1979EZA,3,0.75,3,4,1,1345766400,Plastic taste,This is the first coffee I tried when I got my...,Plastic taste This is the first coffee I tried...,0


In [80]:
train_set3, test_set3 = train_test_split(train2, test_size = 0.2, random_state = 42, stratify = train2['Score'])
train_text3, train_score3 = train_set3['SumTxt'], train_set3['Score']
test_text3, test_score3 = test_set3['SumTxt'], test_set3['Score']

In [81]:
grid_s4 = GridSearchCV(s2, param_grid_s2, n_jobs = 6)

In [82]:
grid_s4.fit(train_text3, train_score3)



GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [83]:
print(grid_s4.best_params_)

{'tfidf__ngram_range': (1, 2)}


In [84]:
means = grid_s4.cv_results_['mean_test_score']
stds = grid_s4.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_s4.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.504 (+/-0.003) for {'tfidf__ngram_range': (1, 1)}
0.505 (+/-0.005) for {'tfidf__ngram_range': (1, 2)}
0.233 (+/-0.041) for {'tfidf__ngram_range': (2, 2)}


In [85]:
pred_s4 = grid_s4.predict(test_text3)

In [86]:
print(classification_report(test_score3, pred_s4))

              precision    recall  f1-score   support

           1       0.49      0.66      0.56      7839
           2       0.19      0.02      0.03      4470
           3       0.37      0.09      0.14      6399
           4       0.54      0.82      0.65     12062

    accuracy                           0.51     30770
   macro avg       0.40      0.40      0.35     30770
weighted avg       0.44      0.51      0.43     30770



In [87]:
m2 = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', min_df=0.1, max_df=0.8)),
    ('mnb', MultinomialNB(alpha = 0.01))
])

In [88]:
grid_m2 = GridSearchCV(m2, param_grid_s2, n_jobs = 6)

In [89]:
grid_m2.fit(train_text3, train_score3)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [90]:
print(grid_m2.best_params_)

{'tfidf__ngram_range': (1, 2)}


In [91]:
means = grid_m2.cv_results_['mean_test_score']
stds = grid_m2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_m2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.454 (+/-0.002) for {'tfidf__ngram_range': (1, 1)}
0.455 (+/-0.002) for {'tfidf__ngram_range': (1, 2)}
0.399 (+/-0.003) for {'tfidf__ngram_range': (2, 2)}


In [92]:
pred_m2 = grid_m2.predict(test_text3)

In [93]:
print(classification_report(test_score3, pred_m2))

              precision    recall  f1-score   support

           1       0.60      0.31      0.41      7839
           2       0.00      0.00      0.00      4470
           3       0.40      0.00      0.00      6399
           4       0.44      0.96      0.60     12062

    accuracy                           0.46     30770
   macro avg       0.36      0.32      0.25     30770
weighted avg       0.41      0.46      0.34     30770



  _warn_prf(average, modifier, msg_start, len(result))


In [94]:
grid_m3 = GridSearchCV(m2, param_grid_s2, n_jobs = 7)

In [96]:
grid_m3.fit(train_text2, train_score2)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [97]:
print(grid_m3.best_params_)

{'tfidf__ngram_range': (1, 2)}


In [98]:
means = grid_m3.cv_results_['mean_test_score']
stds = grid_m3.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_m3.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.659 (+/-0.001) for {'tfidf__ngram_range': (1, 1)}
0.664 (+/-0.001) for {'tfidf__ngram_range': (1, 2)}
0.639 (+/-0.000) for {'tfidf__ngram_range': (2, 2)}


In [99]:
pred_m3 = grid_m3.predict(test_text2)

In [100]:
print(classification_report(test_score2, pred_m3))

              precision    recall  f1-score   support

           0       0.86      0.08      0.15     30770
           5       0.66      0.99      0.79     54498

    accuracy                           0.66     85268
   macro avg       0.76      0.54      0.47     85268
weighted avg       0.73      0.66      0.56     85268



In [101]:
l2 = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', min_df=0.1, max_df=0.8)),
    ('log', LogisticRegression(C = 1))
])

In [102]:
grid_l2 = GridSearchCV(l2, param_grid_s2, n_jobs = 7)

In [103]:
grid_l2.fit(train_text2, train_score2)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [104]:
print(grid_l2.best_params_)

{'tfidf__ngram_range': (1, 2)}


In [105]:
means = grid_l2.cv_results_['mean_test_score']
stds = grid_l2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_l2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.744 (+/-0.002) for {'tfidf__ngram_range': (1, 1)}
0.745 (+/-0.002) for {'tfidf__ngram_range': (1, 2)}
0.643 (+/-0.001) for {'tfidf__ngram_range': (2, 2)}


In [106]:
pred_l2 = grid_l2.predict(test_text2)

In [107]:
print(classification_report(test_score2, pred_l2))

              precision    recall  f1-score   support

           0       0.69      0.55      0.61     30770
           5       0.77      0.86      0.81     54498

    accuracy                           0.75     85268
   macro avg       0.73      0.71      0.71     85268
weighted avg       0.74      0.75      0.74     85268



In [108]:
grid_l3 = GridSearchCV(l2, param_grid_s2, n_jobs = 7)

In [109]:
grid_l3.fit(train_text3, train_score3)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [110]:
print(grid_l3.best_params_)

{'tfidf__ngram_range': (1, 1)}


In [111]:
means = grid_l3.cv_results_['mean_test_score']
stds = grid_l3.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_l3.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.516 (+/-0.004) for {'tfidf__ngram_range': (1, 1)}
0.515 (+/-0.004) for {'tfidf__ngram_range': (1, 2)}
0.401 (+/-0.002) for {'tfidf__ngram_range': (2, 2)}


In [112]:
pred_l3 = grid_l3.predict(test_text3)

In [114]:
print(classification_report(test_score3, pred_l3))

              precision    recall  f1-score   support

           1       0.51      0.63      0.57      7839
           2       0.35      0.05      0.09      4470
           3       0.40      0.21      0.28      6399
           4       0.55      0.78      0.64     12062

    accuracy                           0.52     30770
   macro avg       0.45      0.42      0.39     30770
weighted avg       0.48      0.52      0.47     30770



In [115]:
param_grid_3 = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1,3), (1,2), (3,3)]
             }

In [116]:
grid_l4 = GridSearchCV(l2, param_grid_3, n_jobs = 7)

In [117]:
grid_l4.fit(train_text3, train_score3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                    

In [118]:
print(grid_l4.best_params_)

{'tfidf__ngram_range': (1, 3)}


In [119]:
means = grid_l4.cv_results_['mean_test_score']
stds = grid_l4.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_l4.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.515 (+/-0.004) for {'tfidf__ngram_range': (1, 3)}
0.515 (+/-0.004) for {'tfidf__ngram_range': (1, 2)}
nan (+/-nan) for {'tfidf__ngram_range': (3, 3)}


In [121]:
pred_l4 = grid_l4.predict(test_text3)

In [122]:
print(classification_report(test_score3, pred_l4))

              precision    recall  f1-score   support

           1       0.51      0.63      0.57      7839
           2       0.34      0.05      0.09      4470
           3       0.39      0.21      0.27      6399
           4       0.55      0.78      0.65     12062

    accuracy                           0.52     30770
   macro avg       0.45      0.42      0.39     30770
weighted avg       0.48      0.52      0.47     30770

