In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import re
# import mglearn as mglearn

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
train = pd.read_csv('train2electricboogaloo.csv')

# Replace missing text with empty strings so the string-cleaning helpers never
# receive NaN. The result is assigned back instead of calling
# `train.<col>.fillna('', inplace=True)`: that form fills a column object obtained
# through attribute access (chained assignment), which pandas warns about and
# which does not update `train` when Copy-on-Write is enabled.
train = train.fillna({'Summary': '', 'Text': '', 'SumTxt': ''})

In [3]:
# Hold out 20% of the rows, stratified on 'Score', with a fixed seed so the
# split is repeatable.
train_set, test_set = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train['Score'],
)

# Model input is the 'SumTxt' column; the target is 'Score'.
train_text = train_set['SumTxt']
train_score = train_set['Score']
test_text = test_set['SumTxt']
test_score = test_set['Score']

In [4]:
# Shared NLP helpers used by the text-cleaning functions defined in this notebook:
# `stop_words` by swr(), `lem` by wnl(), `stemmer` by stm().
stop_words = set(stopwords.words("english"))
lem = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

In [5]:
# Lookup table used by expand_contractions(): whole whitespace-delimited token -> expansion.
# Keys are matched exactly (case-sensitive, no surrounding punctuation).
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had",
    "he'd've": "he would have",
    "he'll": "he shall",
    "he'll've": "he shall have",
    "he's": "he has",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has",
    "I'd": "I had",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there has",
    "they'd": "they had",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [6]:
# (pattern, replacement) pairs applied in order by decontracted().
_CONTRACTION_RULES = (
    # Irregular negations must run before the generic "n't" rule, which would
    # otherwise produce "wo not", "ca not", "sha not" and "ai not". The captured
    # first letter keeps the original capitalisation.
    (r"\b([Ww])on't\b", r"\1ill not"),
    (r"\b([Cc])an't\b", r"\1annot"),
    (r"\b([Ss])han't\b", r"\1hall not"),
    (r"\b([Aa])in't\b", r"\1m not"),
    # Generic suffix rules.
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)


def decontracted(phrase):
    """Expand English contractions in `phrase` using regex suffix rules.

    Irregular forms ("won't", "can't", "shan't", "ain't") are rewritten first,
    then the generic suffixes ("n't", "'re", "'s", "'d", "'ll", "'t", "'ve",
    "'m") are replaced wherever they occur.

    Parameters
    ----------
    phrase : str
        Text to expand.

    Returns
    -------
    str
        The text with the matched contractions expanded.

    Notes
    -----
    The "'s" rule cannot tell a possessive from "is", so "dog's" becomes
    "dog is"; likewise every "'d" becomes " would".
    """
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [7]:
def expand_contractions(s, contractions_dict=contractions_dict):
    """Expand contractions in `s`.

    Each whitespace-separated token that is a key of `contractions_dict` is
    replaced by its mapped expansion; the tokens are re-joined with single
    spaces and the result is passed through `decontracted` for the remaining
    suffix-based expansions.
    """
    expanded_tokens = (contractions_dict.get(token, token) for token in s.split())
    return decontracted(' '.join(expanded_tokens))

In [8]:
def remove_digits(mess):
    """Return `mess` without the characters for which `str.isdigit()` is true."""
    kept = []
    for ch in mess:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)

In [9]:
def stm(mess):
    """Stem every item of `mess` with the module-level `stemmer`; return a list."""
    return list(map(stemmer.stem, mess))

In [10]:
def wnl(mess):
    """Lemmatize every item of `mess` with the module-level `lem`; return a list."""
    return list(map(lem.lemmatize, mess))

In [11]:
def swr(mess):
    """Drop stop words from `mess` and return the remaining tokens joined by spaces.

    Tokens are whitespace-delimited; a token is dropped when its lower-cased
    form is in the module-level `stop_words` set.
    """
    kept = []
    for token in mess.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [12]:
def punc(mess):
    """Return `mess` with every character of `string.punctuation` deleted."""
    # Mapping a character to None in a translation table deletes it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return mess.translate(strip_table)

In [13]:
def check_char(mess):
    """Split `mess` on whitespace and return the tokens that are not exactly one character long."""
    kept = []
    for token in mess.split():
        if len(token) == 1:
            continue
        kept.append(token)
    return kept

In [14]:
def css(mess):
    """Clean `mess` and return a list of stemmed tokens.

    Steps, in order: strip digits, expand contractions, drop stop words,
    strip punctuation, drop one-character tokens, stem.
    """
    text = swr(expand_contractions(remove_digits(mess)))
    tokens = check_char(punc(text))
    return stm(tokens)

In [15]:
def csl(mess):
    """Clean `mess` and return a list of lemmatized tokens.

    Steps, in order: strip digits, expand contractions, drop stop words,
    strip punctuation, drop one-character tokens, lemmatize.
    """
    text = swr(expand_contractions(remove_digits(mess)))
    tokens = check_char(punc(text))
    return wnl(tokens)

In [16]:
def nss(mess):
    """Clean `mess` (no contraction expansion) and return a list of stemmed tokens.

    Steps, in order: strip digits, drop stop words, strip punctuation,
    drop one-character tokens, stem.
    """
    text = swr(remove_digits(mess))
    tokens = check_char(punc(text))
    return stm(tokens)

In [17]:
def nsl(mess):
    """Clean `mess` (no contraction expansion) and return a list of lemmatized tokens.

    Steps, in order: strip digits, drop stop words, strip punctuation,
    drop one-character tokens, lemmatize.
    """
    text = swr(remove_digits(mess))
    tokens = check_char(punc(text))
    return wnl(tokens)

In [18]:
def cns(mess):
    """Clean `mess` (stop words kept) and return the stemmed tokens as one string.

    Steps, in order: strip digits, expand contractions, strip punctuation,
    drop one-character tokens, stem, join with single spaces.
    """
    text = punc(expand_contractions(remove_digits(mess)))
    stems = stm(check_char(text))
    return ' '.join(stems)

In [19]:
def cnl(mess):
    """Clean `mess` (stop words kept) and return a list of lemmatized tokens.

    Steps, in order: strip digits, expand contractions, strip punctuation,
    drop one-character tokens, lemmatize.

    Note that, unlike `cns`, the result is a list of tokens, not a joined string.
    """
    text = punc(expand_contractions(remove_digits(mess)))
    tokens = check_char(text)
    return wnl(tokens)

In [20]:
def nns(mess):
    """Minimal cleaning of `mess`; return a list of stemmed tokens.

    Steps, in order: strip digits, strip punctuation, drop one-character
    tokens, stem. Contractions are not expanded and stop words are kept.
    """
    tokens = check_char(punc(remove_digits(mess)))
    return stm(tokens)

In [21]:
def nnl(mess):
    """Minimal cleaning of `mess`; return a list of lemmatized tokens.

    Steps, in order: strip digits, strip punctuation, drop one-character
    tokens, lemmatize. Contractions are not expanded and stop words are kept.
    """
    tokens = check_char(punc(remove_digits(mess)))
    return wnl(tokens)

In [22]:
# The experiments in the following commented-out cells are done; their recorded results are kept for reference.

In [23]:
# s = Pipeline([
#     ('tfidf', TfidfVectorizer(analyzer = nsl)),
#     ('svm', LinearSVC())
# ])

In [24]:
# param_grid_s = {#'tfidf__analyzer': [css, csl],
# #               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
# #               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
# #               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
# #               'svm__loss': ['hinge']
#              }

In [25]:
# grid_s = GridSearchCV(s, param_grid_s, n_jobs = 7, cv = 5)

#best_params = {'svm__C': 10, 'tfidf__binary': False, 'tfidf__min_df': 0.1, 'tfidf__use_idf': True}

# #              precision    recall  f1-score   support

#            1       0.37      0.01      0.03      7838
#            2       0.00      0.00      0.00      4471
#            3       0.00      0.00      0.00      6399
#            4       0.27      0.00      0.00     12062
#            5       0.64      1.00      0.78     54498

#     accuracy                           0.64     85268
#    macro avg       0.26      0.20      0.16     85268
# weighted avg       0.48      0.64      0.50     85268

In [26]:
# s2 = Pipeline([
#     ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', min_df=0.1, max_df=0.8)),
#     ('svm', LinearSVC(loss = 'hinge', C = 10))
# ])

# param_grid_s2 = {#'tfidf__analyzer': [css, csl],
# #               'tfidf__max_df': [0.8, 0.9, 1],
# #               'tfidf__min_df': [0.1, 0.2],
# #               'tfidf__max_features': [50000, None],
# #               'tfidf__binary': [True, False],
# #               'tfidf__norm': ['l1', 'l2', None],
# #               'tfidf__use_idf': [True, False],
# #               'svm__C': [1, 10, 100],
# #               'svm__loss': ['hinge']
#                 'tfidf__ngram_range': [(1,1), (1,2), (2,2)]
#              }

# grid_s2 = GridSearchCV(s2, param_grid_s2, n_jobs = 6)

# best params = {'tfidf__ngram_range': (1, 1)}

#               precision    recall  f1-score   support

#            1       0.47      0.02      0.04      7838
#            2       0.26      0.00      0.01      4471
#            3       0.24      0.11      0.15      6399
#            4       0.19      0.05      0.08     12062
#            5       0.68      0.98      0.80     54498

#     accuracy                           0.64     85268
#    macro avg       0.37      0.23      0.22     85268
# weighted avg       0.53      0.64      0.54     85268

In [27]:
# TF-IDF features over `cns`-preprocessed text, fed to a hinge-loss linear SVM.
# NOTE(review): scikit-learn documents `preprocessor` as overriding the built-in
# strip_accents/lowercase stage, so `strip_accents` probably has no effect here — confirm.
s = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            strip_accents='unicode',
            analyzer='word',
            min_df=0.1,
            max_df=0.8,
            preprocessor=cns,
        ),
    ),
    ('svm', LinearSVC(loss='hinge', C=10)),
])

In [28]:
# Grid-search space: only the TF-IDF n-gram range is active.
# Candidates that are currently disabled, kept for reference:
#   'tfidf__analyzer': [css, csl],
#   'tfidf__max_df': [0.8, 0.9, 1],
#   'tfidf__min_df': [0.1, 0.2],
#   'tfidf__max_features': [50000, None],
#   'tfidf__binary': [True, False],
#   'tfidf__norm': ['l1', 'l2', None],
#   'tfidf__use_idf': [True, False],
#   'svm__C': [1, 10, 100],
#   'svm__loss': ['hinge']
# NOTE(review): the recorded CV summary for this grid shows nan for (3, 3);
# presumably min_df=0.1 leaves no trigram in the vocabulary — confirm.
param_grid_s = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (3, 3)],
}

In [29]:
# 5-fold grid search over `param_grid_s`; n_jobs=-1 runs on all available cores.
grid_s = GridSearchCV(
    estimator=s,
    param_grid=param_grid_s,
    cv=5,
    n_jobs=-1,
)

In [30]:
# Run the grid search on the training split.
grid_s.fit(X=train_text, y=train_score)



GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.8,
                                                        max_features=None,
                                                        min_df=0.1,
                                                        ngram_range=(1, 1),
                                                       

In [31]:
# Show the parameter combination the grid search selected.
print(grid_s.best_params_)

{'tfidf__ngram_range': (1, 2)}


In [32]:
# One line per grid candidate: mean CV score, twice the std across folds, and the parameters.
means, stds = (grid_s.cv_results_[column] for column in ('mean_test_score', 'std_test_score'))
for mean, std, params in zip(means, stds, grid_s.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

0.644 (+/-0.004) for {'tfidf__ngram_range': (1, 1)}
0.645 (+/-0.002) for {'tfidf__ngram_range': (1, 2)}
0.644 (+/-0.004) for {'tfidf__ngram_range': (1, 3)}
0.635 (+/-0.002) for {'tfidf__ngram_range': (2, 2)}
nan (+/-nan) for {'tfidf__ngram_range': (3, 3)}


In [34]:
# Predict scores for the held-out split with the fitted grid search.
pred_s = grid_s.predict(X=test_text)

In [35]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_true=test_score, y_pred=pred_s))

              precision    recall  f1-score   support

           1       0.26      0.11      0.15      7838
           2       0.22      0.01      0.02      4471
           3       0.28      0.05      0.09      6399
           4       0.19      0.04      0.06     12062
           5       0.68      0.98      0.80     54498

    accuracy                           0.65     85268
   macro avg       0.33      0.24      0.23     85268
weighted avg       0.52      0.65      0.54     85268



In [None]:
def cnl_text(mess):
    """Run `cnl` on `mess` and return the result as one space-joined string.

    `cnl` returns a list of lemmas, but with analyzer='word' the vectorizer
    tokenizes whatever `preprocessor` returns, so the hook must yield a string
    (as `cns` does).
    """
    lemmas = cnl(mess)
    # Pass a string through untouched in case `cnl` already returns one.
    return lemmas if isinstance(lemmas, str) else ' '.join(lemmas)


# Same pipeline as `s`, with lemmatization (`cnl`) instead of stemming.
# NOTE(review): scikit-learn documents `preprocessor` as overriding the built-in
# strip_accents/lowercase stage, so `strip_accents` probably has no effect here — confirm.
s2 = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', min_df=0.1, max_df=0.8, preprocessor=cnl_text)),
    ('svm', LinearSVC(loss = 'hinge', C = 10))
])

In [None]:
# Grid-search space: only the TF-IDF n-gram range is active.
# Candidates that are currently disabled, kept for reference:
#   'tfidf__analyzer': [css, csl],
#   'tfidf__max_df': [0.8, 0.9, 1],
#   'tfidf__min_df': [0.1, 0.2],
#   'tfidf__max_features': [50000, None],
#   'tfidf__binary': [True, False],
#   'tfidf__norm': ['l1', 'l2', None],
#   'tfidf__use_idf': [True, False],
#   'svm__C': [1, 10, 100],
#   'svm__loss': ['hinge']
# NOTE(review): the same grid recorded nan for (3, 3) when run on `s`;
# expect the same here unless min_df is lowered — confirm.
param_grid_s2 = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (3, 3)],
}

In [None]:
# 5-fold grid search over `param_grid_s2`; n_jobs=-1 runs on all available cores.
grid_s2 = GridSearchCV(
    estimator=s2,
    param_grid=param_grid_s2,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split.
grid_s2.fit(X=train_text, y=train_score)

In [None]:
# Show the parameter combination the grid search selected.
print(grid_s2.best_params_)

In [None]:
# One line per grid candidate: mean CV score, twice the std across folds, and the parameters.
means, stds = (grid_s2.cv_results_[column] for column in ('mean_test_score', 'std_test_score'))
for mean, std, params in zip(means, stds, grid_s2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

In [None]:
# Predict scores for the held-out split with the fitted grid search.
pred_s2 = grid_s2.predict(X=test_text)

In [None]:
# Per-class precision/recall/F1 on the held-out split.
print(classification_report(y_true=test_score, y_pred=pred_s2))