In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import re
# import mglearn as mglearn

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Load the training data from the CSV file.
train = pd.read_csv('train2electricboogaloo.csv')

# Replace missing values in the three text columns with empty strings so the
# string-based cleaning helpers never receive NaN.
# A single DataFrame-level fillna is used instead of
# `train.<col>.fillna('', inplace=True)`: the latter is a chained in-place call
# on a column accessor, which does not update `train` under pandas
# Copy-on-Write (and emits a FutureWarning on recent pandas 2.x).
train = train.fillna({'Summary': '', 'Text': '', 'SumTxt': ''})

In [None]:
# Hold out 20% of the rows for testing; stratifying on 'Score' keeps the label
# distribution the same in both parts, and the fixed seed makes the split
# repeatable.
train_set, test_set = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train['Score'],
)

# Model input is the 'SumTxt' column; the target is 'Score'.
train_text = train_set['SumTxt']
train_score = train_set['Score']
test_text = test_set['SumTxt']
test_score = test_set['Score']

In [None]:
# Shared NLTK resources used by the text-cleaning helpers defined later.
stop_words = set(stopwords.words("english"))   # set -> O(1) membership tests
lem = WordNetLemmatizer()
stemmer = SnowballStemmer(language="english")

In [None]:
# Lookup table: contraction -> expanded form, consumed by expand_contractions.
#
# Ambiguous contractions are resolved the same way everywhere so that parallel
# entries agree with each other and with the regex fallback in `decontracted`
# ('s -> " is", 'll -> " will", 'd -> " would"):
#   * <pronoun>'ll  -> "... will"   (was "he shall" for "he'll" only)
#   * <word>'s      -> "... is"     (was "has" for he/how/there/where only)
#   * <pronoun>'d   -> "... would"  (was "had" for some pronouns, while the
#                                    matching "...'d've" entries used "would")
# Interrogative "how'd"/"where'd" keep "did", and "let's"/"so's" keep their
# idiomatic expansions.
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

In [None]:
def decontracted(phrase):
    """Expand contraction suffixes in ``phrase`` using regex substitutions.

    Each rule replaces an apostrophe suffix with a space-prefixed word
    (e.g. "'re" -> " are").  The rules are applied to the whole string, one
    after another, in the order listed.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        ``phrase`` with every matching suffix expanded.
    """
    # Order matters: "n't" has to be consumed before the bare "'t" rule,
    # otherwise "don't" would become "don not".
    suffix_rules = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [None]:
def expand_contractions(s, contractions_dict=contractions_dict):
    """Expand English contractions in ``s``.

    The text is split on whitespace and every token is looked up in
    ``contractions_dict``; anything not found there is left for the regex
    fallback in ``decontracted``.

    A token is matched against the table exactly first.  If that fails, it is
    also matched when it differs from a key only by letter case ("Can't") or
    carries leading/trailing punctuation ("won't," / "(can't)").  Without this,
    such tokens skip the table and the regex fallback turns them into
    "Ca not" / "wo not,".  Surrounding punctuation is kept around the
    expansion.

    Parameters
    ----------
    s : str
        Text to process.
    contractions_dict : dict
        Mapping of contraction -> expansion.

    Returns
    -------
    str
        The tokens re-joined with single spaces after both expansion passes.
    """
    # The apostrophe belongs to the contraction itself ("'cause"), so it is
    # never treated as strippable edge punctuation.
    edge_punct = string.punctuation.replace("'", "")

    expanded = []
    for token in s.split():
        # Fast path: exact match.
        if token in contractions_dict:
            expanded.append(contractions_dict[token])
            continue

        core = token.strip(edge_punct)
        lead = len(token) - len(token.lstrip(edge_punct))

        replacement = None
        if core:
            # Keys in the table are lower-case or capitalised ("I'd"), so try
            # both spellings in addition to the token as written.
            for candidate in (core, core.lower(), core.capitalize()):
                if candidate in contractions_dict:
                    replacement = contractions_dict[candidate]
                    break

        if replacement is None:
            expanded.append(token)
        else:
            expanded.append(token[:lead] + replacement + token[lead + len(core):])

    return decontracted(' '.join(expanded))

In [None]:
def remove_digits(mess):
    """Return ``mess`` with every character for which ``str.isdigit`` is true removed."""
    return ''.join(filter(lambda ch: not ch.isdigit(), mess))

In [None]:
def stm(mess):
    """Stem every item of the iterable ``mess`` with the module-level ``stemmer``.

    Returns a list with one stem per input item, in the same order.
    """
    return list(map(stemmer.stem, mess))

In [None]:
def wnl(mess):
    """Lemmatize every item of the iterable ``mess`` with the module-level ``lem``.

    Returns a list with one lemma per input item, in the same order.
    """
    return list(map(lem.lemmatize, mess))

In [None]:
def swr(mess):
    """Remove stop words from ``mess``.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in ``stop_words``.  Surviving tokens keep their original case and
    are re-joined with single spaces.
    """
    kept = []
    for token in mess.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
def punc(mess):
    """Return ``mess`` with every character in ``string.punctuation`` deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return mess.translate(deletion_table)

In [None]:
def check_char(mess):
    """Split ``mess`` on whitespace and return the tokens that are not exactly one character long."""
    tokens = []
    for token in mess.split():
        if len(token) == 1:
            continue
        tokens.append(token)
    return tokens

In [None]:
def css(mess):
    """Clean ``mess`` and return a list of stemmed tokens.

    Steps, in order: remove digits, expand contractions, remove stop words,
    strip punctuation, drop one-character tokens, then stem what is left.
    """
    for step in (remove_digits, expand_contractions, swr, punc, check_char):
        mess = step(mess)
    return stm(mess)

In [None]:
def csl(mess):
    """Clean ``mess`` and return a list of lemmatized tokens.

    Steps, in order: remove digits, expand contractions, remove stop words,
    strip punctuation, drop one-character tokens, then lemmatize what is left.
    """
    for step in (remove_digits, expand_contractions, swr, punc, check_char):
        mess = step(mess)
    return wnl(mess)

In [None]:
def nss(mess):
    """Clean ``mess`` (no contraction expansion) and return a list of stemmed tokens.

    Steps, in order: remove digits, remove stop words, strip punctuation,
    drop one-character tokens, then stem what is left.
    """
    for step in (remove_digits, swr, punc, check_char):
        mess = step(mess)
    return stm(mess)

In [None]:
def nsl(mess):
    """Clean ``mess`` (no contraction expansion) and return a list of lemmatized tokens.

    Steps, in order: remove digits, remove stop words, strip punctuation,
    drop one-character tokens, then lemmatize what is left.
    """
    for step in (remove_digits, swr, punc, check_char):
        mess = step(mess)
    return wnl(mess)

In [None]:
def cns(mess):
    """Clean ``mess`` (stop words kept) and return the stemmed text as one string.

    Steps, in order: remove digits, expand contractions, strip punctuation,
    drop one-character tokens, stem, then join the stems with single spaces.
    """
    for step in (remove_digits, expand_contractions, punc, check_char):
        mess = step(mess)
    stems = stm(mess)
    return ' '.join(stems)

In [None]:
def cnl(mess):
    """Clean ``mess`` (stop words kept) and return the lemmatized text as one string.

    Steps, in order: remove digits, expand contractions, strip punctuation,
    drop one-character tokens, lemmatize, then join the lemmas with single
    spaces.
    """
    for step in (remove_digits, expand_contractions, punc, check_char):
        mess = step(mess)
    lemmas = wnl(mess)
    return ' '.join(lemmas)

In [None]:
def nns(mess):
    """Minimal cleaning of ``mess``, returning a list of stemmed tokens.

    Steps, in order: remove digits, strip punctuation, drop one-character
    tokens, then stem what is left.  Contractions and stop words are not
    touched.
    """
    for step in (remove_digits, punc, check_char):
        mess = step(mess)
    return stm(mess)

In [None]:
def nnl(mess):
    """Minimal cleaning of ``mess``, returning a list of lemmatized tokens.

    Steps, in order: remove digits, strip punctuation, drop one-character
    tokens, then lemmatize what is left.  Contractions and stop words are not
    touched.
    """
    for step in (remove_digits, punc, check_char):
        mess = step(mess)
    return wnl(mess)

In [None]:
# Passing `preprocessor=` to TfidfVectorizer REPLACES its built-in
# preprocessing stage, so `strip_accents` and the default `lowercase=True`
# are silently skipped.  With `cnl` alone as the preprocessor, accents were
# never stripped and "Good"/"good" became separate features.
# To keep the intended normalisation, sklearn's own default preprocessor
# (lower-casing + unicode accent stripping) is built once and run before `cnl`.
_default_preprocess = TfidfVectorizer(strip_accents='unicode').build_preprocessor()


def _normalize_then_cnl(doc):
    """Lower-case and accent-strip ``doc`` with sklearn's default stage, then apply ``cnl``."""
    return cnl(_default_preprocess(doc))


# TF-IDF features (terms kept when present in 10%-80% of documents) feeding a
# linear SVM with hinge loss.
s = Pipeline([
    ('tfidf', TfidfVectorizer(
        strip_accents='unicode',  # inert while `preprocessor` is set; applied inside _normalize_then_cnl
        analyzer='word',
        min_df=0.1,
        max_df=0.8,
        preprocessor=_normalize_then_cnl,
    )),
    ('svm', LinearSVC(loss='hinge', C=10)),
])

In [None]:
# Search space for the grid search: only the n-gram range is active.
# Disabled options, kept for reference:
#   'tfidf__analyzer': [css, csl],
#   'tfidf__max_df': [0.8, 0.9, 1],
#   'tfidf__min_df': [0.1, 0.2],
#   'tfidf__max_features': [50000, None],
#   'tfidf__binary': [True, False],
#   'tfidf__norm': ['l1', 'l2', None],
#   'tfidf__use_idf': [True, False],
#   'svm__C': [1, 10, 100],
#   'svm__loss': ['hinge']
param_grid_s = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (3, 3)],
}

In [None]:
# Exhaustive search over `param_grid_s` with 5-fold cross-validation;
# n_jobs=-1 lets scikit-learn use all available processors.
grid_s = GridSearchCV(
    estimator=s,
    param_grid=param_grid_s,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training split (fits the pipeline once per
# parameter combination and CV fold).
grid_s.fit(train_text, train_score)

In [None]:
# Parameter combination with the best mean cross-validation score.
print(grid_s.best_params_)

In [None]:
# Print mean CV score and a +/- 2*std band for every parameter combination.
cv_results = grid_s.cv_results_
means, stds = cv_results['mean_test_score'], cv_results['std_test_score']
for idx, params in enumerate(cv_results['params']):
    mean, std = means[idx], stds[idx]
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

In [None]:
# Predict on the held-out split with the fitted grid search.
pred_s = grid_s.predict(test_text)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(test_score, pred_s))