In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import re
# import mglearn as mglearn

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from bs4 import BeautifulSoup
from string import digits

In [2]:
train = pd.read_csv('train.csv')
train = train.loc[train['Score'] != 5]
train["SumTxt"] = train["Summary"] + ' ' + train["Text"]

train.Summary.fillna('', inplace=True)
train.Text.fillna('', inplace=True)
train.SumTxt.fillna('', inplace=True)

In [3]:
test = pd.read_csv('test.csv')
test_labels = pd.read_csv('labels.csv')
test['Score'] = test_labels['Score']
test = test.loc[test['Score'] != 5]
test["SumTxt"] = test["Summary"] + ' ' + test["Text"]

test.Summary.fillna('', inplace=True)
test.Text.fillna('', inplace=True)
test.SumTxt.fillna('', inplace=True)

In [4]:
stop_words = set(stopwords.words("english"))

In [5]:
remove_digits = str.maketrans(digits, ' '*len(digits)) 

In [6]:
remove_punc = str.maketrans(string.punctuation, ' '*len(string.punctuation))

In [7]:
def decontracted(phrase):
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [8]:
def check_char(mess):
    return ' '.join([word for word in mess.split() if len(word) != 1])

def swr(mess):
    return ' '.join([word for word in mess.split() if word not in stop_words])

In [9]:
def text_process(rev):
    rev = BeautifulSoup(rev).get_text(' ')
    rev = decontracted(rev)
    rev = rev.translate(remove_digits)
    rev = rev.translate(remove_punc)
    rev = rev.lower()
    rev = swr(rev)
    return check_char(rev)

In [10]:
train['SumTxt'] = train.apply(lambda x: text_process(x['SumTxt']), axis = 1)

In [11]:
test['SumTxt'] = test.apply(lambda x: text_process(x['SumTxt']), axis = 1)

In [12]:
s = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_df = 0.6)),
    ('mnb', MultinomialNB(alpha = 0.01))
])

In [13]:
param_grid_s = {#'tfidf__analyzer': [css, csl],
#               'tfidf__max_df': [0.8, 0.9, 1],
#               'tfidf__min_df': [0.1, 0.2],
#               'tfidf__max_features': [50000, None],
#               'tfidf__binary': [True, False],
#               'tfidf__norm': ['l1', 'l2', None],
#               'tfidf__use_idf': [True, False],
#               'svm__C': [1, 10, 100],
#               'svm__loss': ['hinge']
                'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)]
             }

In [14]:
grid_s = GridSearchCV(s, param_grid_s, cv = 5, n_jobs = -1)

In [15]:
grid_s.fit(train['SumTxt'], train['Score'])

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.6,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [16]:
pred_s = grid_s.predict(test['SumTxt'])

In [17]:
print(classification_report(test['Score'], pred_s))

              precision    recall  f1-score   support

           1       0.78      0.80      0.79     13075
           2       0.74      0.46      0.57      7416
           3       0.67      0.53      0.59     10647
           4       0.74      0.91      0.82     20346

    accuracy                           0.74     51484
   macro avg       0.73      0.68      0.69     51484
weighted avg       0.74      0.74      0.73     51484



In [18]:
s2 = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_df = 0.5)),
    ('mnb', MultinomialNB(alpha = 0.01))
])

In [19]:
grid_s2 = GridSearchCV(s2, param_grid_s, cv = 5, n_jobs = -1)

In [20]:
grid_s2.fit(train['SumTxt'], train['Score'])

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [21]:
pred_s2 = grid_s2.predict(test['SumTxt'])

In [22]:
print(classification_report(test['Score'], pred_s2))

              precision    recall  f1-score   support

           1       0.78      0.80      0.79     13075
           2       0.74      0.46      0.57      7416
           3       0.67      0.53      0.59     10647
           4       0.74      0.91      0.82     20346

    accuracy                           0.74     51484
   macro avg       0.73      0.68      0.69     51484
weighted avg       0.74      0.74      0.73     51484



In [25]:
grid_s.best_params_

{'tfidf__ngram_range': (1, 3)}

In [26]:
grid_s2.best_params_

{'tfidf__ngram_range': (1, 3)}

In [27]:
s3 = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', max_df = 0.5)),
    ('mnb', MultinomialNB(alpha = 0.1))
])

In [28]:
grid_s3 = GridSearchCV(s3, param_grid_s, cv = 5, n_jobs = -1)

In [29]:
grid_s3.fit(train['SumTxt'], train['Score'])

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=0.5,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [30]:
grid_s3.best_params_

{'tfidf__ngram_range': (1, 2)}

In [31]:
pred_s3 = grid_s3.predict(test['SumTxt'])

In [32]:
print(classification_report(test['Score'], pred_s3))

              precision    recall  f1-score   support

           1       0.79      0.82      0.80     13075
           2       0.95      0.37      0.54      7416
           3       0.82      0.44      0.57     10647
           4       0.67      0.96      0.79     20346

    accuracy                           0.73     51484
   macro avg       0.81      0.65      0.68     51484
weighted avg       0.77      0.73      0.71     51484

