In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import re
# import mglearn as mglearn

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('train2electricboogaloo.csv')

train.Summary.fillna('', inplace=True)
train.Text.fillna('', inplace=True)
train.SumTxt.fillna('', inplace=True)

In [3]:
train_set, test_set = train_test_split(train, test_size = 0.2, random_state = 42, stratify = train['Score'])
train_text, train_score = train_set['Summary'], train_set['Score']
test_text, test_score = test_set['Summary'], test_set['Score']

In [4]:
s = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word')),
    ('svm', LinearSVC(loss = 'hinge', C = 10))
])

In [6]:
param_grid_s = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1, 3)]
             }

In [7]:
grid_s = GridSearchCV(s, param_grid_s, cv = 5, n_jobs = -1)

In [8]:
grid_s.fit(train_text, train_score)



GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [9]:
print(grid_s.best_params_)

{'tfidf__ngram_range': (1, 3)}


In [10]:
means = grid_s.cv_results_['mean_test_score']
stds = grid_s.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_s.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.712 (+/-0.002) for {'tfidf__ngram_range': (1, 1)}
0.763 (+/-0.002) for {'tfidf__ngram_range': (1, 2)}
0.768 (+/-0.002) for {'tfidf__ngram_range': (1, 3)}
0.745 (+/-0.003) for {'tfidf__ngram_range': (2, 2)}
0.728 (+/-0.002) for {'tfidf__ngram_range': (3, 3)}


In [11]:
pred_s = grid_s.predict(test_text)

In [12]:
print(classification_report(test_score, pred_s))

              precision    recall  f1-score   support

           1       0.70      0.71      0.70      7838
           2       0.55      0.42      0.48      4471
           3       0.57      0.50      0.53      6399
           4       0.53      0.42      0.47     12062
           5       0.85      0.92      0.89     54498

    accuracy                           0.77     85268
   macro avg       0.64      0.59      0.61     85268
weighted avg       0.76      0.77      0.76     85268



In [13]:
m = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word')),
    ('mnb', MultinomialNB(alpha = 0.01))
])

In [14]:
param_grid_m = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1, 3)]
             }

In [15]:
grid_m = GridSearchCV(m, param_grid_m, cv = 5, n_jobs = -1)

In [16]:
grid_m.fit(train_text, train_score)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [17]:
print(grid_m.best_params_)

{'tfidf__ngram_range': (1, 3)}


In [18]:
means = grid_m.cv_results_['mean_test_score']
stds = grid_m.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_m.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.769 (+/-0.001) for {'tfidf__ngram_range': (1, 3)}


In [19]:
pred_m = grid_m.predict(test_text)

In [20]:
print(classification_report(test_score, pred_m))

              precision    recall  f1-score   support

           1       0.70      0.70      0.70      7838
           2       0.55      0.41      0.47      4471
           3       0.58      0.51      0.54      6399
           4       0.57      0.38      0.46     12062
           5       0.84      0.94      0.89     54498

    accuracy                           0.78     85268
   macro avg       0.65      0.59      0.61     85268
weighted avg       0.76      0.78      0.76     85268



In [21]:
l = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word')),
    ('log', LogisticRegression(C = 1))
])

In [22]:
param_grid_l = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1, 3)]
             }

In [23]:
grid_l = GridSearchCV(l, param_grid_l, cv = 5, n_jobs = -1)

In [24]:
grid_l.fit(train_text, train_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [25]:
print(grid_l.best_params_)

{'tfidf__ngram_range': (1, 3)}


In [26]:
means = grid_l.cv_results_['mean_test_score']
stds = grid_l.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid_l.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.749 (+/-0.002) for {'tfidf__ngram_range': (1, 3)}


In [27]:
pred_l = grid_l.predict(test_text)

In [28]:
print(classification_report(test_score, pred_l))

              precision    recall  f1-score   support

           1       0.65      0.65      0.65      7838
           2       0.53      0.25      0.34      4471
           3       0.56      0.37      0.44      6399
           4       0.54      0.24      0.34     12062
           5       0.80      0.96      0.87     54498

    accuracy                           0.75     85268
   macro avg       0.62      0.49      0.53     85268
weighted avg       0.72      0.75      0.72     85268

