In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import re
# import mglearn as mglearn

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('train2electricboogaloo.csv')

train.Summary.fillna('', inplace=True)
train.Text.fillna('', inplace=True)
train.SumTxt.fillna('', inplace=True)

In [3]:
train_text, train_score = train['Summary'], train['Score']

In [4]:
test = pd.read_csv('test.csv')
test_score = pd.read_csv('labels.csv')

In [5]:
test_text, test_score = test['Summary'], test_score['Score']

In [11]:
test.Summary.fillna('', inplace=True)
test.Text.fillna('', inplace=True)

In [6]:
s = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word')),
    ('svm', LinearSVC(loss = 'hinge', C = 10))
])

In [7]:
param_grid_s = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1, 3)]
             }

In [8]:
grid_s = GridSearchCV(s, param_grid_s, cv = 5, n_jobs = -1)

In [9]:
grid_s.fit(train_text, train_score)



GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [12]:
pred_s = grid_s.predict(test_text)

In [13]:
print(classification_report(test_score, pred_s))

              precision    recall  f1-score   support

           1       0.70      0.73      0.72     13075
           2       0.53      0.45      0.49      7416
           3       0.59      0.51      0.55     10647
           4       0.56      0.42      0.48     20346
           5       0.86      0.93      0.89     90630

    accuracy                           0.78    142114
   macro avg       0.65      0.61      0.63    142114
weighted avg       0.76      0.78      0.77    142114



In [14]:
m = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word')),
    ('mnb', MultinomialNB(alpha = 0.01))
])

In [15]:
param_grid_m = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1, 3)]
             }

In [16]:
grid_m = GridSearchCV(m, param_grid_m, cv = 5, n_jobs = -1)

In [17]:
grid_m.fit(train_text, train_score)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [18]:
pred_m = grid_m.predict(test_text)

In [19]:
print(classification_report(test_score, pred_m))

              precision    recall  f1-score   support

           1       0.72      0.71      0.71     13075
           2       0.56      0.42      0.48      7416
           3       0.58      0.51      0.55     10647
           4       0.59      0.39      0.47     20346
           5       0.84      0.94      0.89     90630

    accuracy                           0.78    142114
   macro avg       0.66      0.60      0.62    142114
weighted avg       0.76      0.78      0.77    142114



In [20]:
l = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word')),
    ('log', LogisticRegression(C = 1))
])

In [21]:
param_grid_l = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1, 3)]
             }

In [22]:
grid_l = GridSearchCV(l, param_grid_l, cv = 5, n_jobs = -1)

In [23]:
grid_l.fit(train_text, train_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [24]:
pred_l = grid_l.predict(test_text)

In [25]:
print(classification_report(test_score, pred_l))

              precision    recall  f1-score   support

           1       0.65      0.68      0.66     13075
           2       0.55      0.23      0.33      7416
           3       0.54      0.38      0.45     10647
           4       0.53      0.30      0.38     20346
           5       0.81      0.95      0.87     90630

    accuracy                           0.75    142114
   macro avg       0.61      0.51      0.54    142114
weighted avg       0.72      0.75      0.72    142114

