In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import string
import nltk
import sklearn
import mglearn as mglearn



In [2]:
train = pd.read_csv('train2electricboogaloo.csv')

train.Summary.fillna('', inplace=True)
train.Text.fillna('', inplace=True)
train.SumTxt.fillna('', inplace=True)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [4]:
train_set, test_set = train_test_split(train, test_size = 0.2, random_state = 42, stratify = train['Score'])
train_text, train_score = train_set['SumTxt'], train_set['Score']
test_text, test_score = test_set['SumTxt'], test_set['Score']

In [5]:
text, score = train['SumTxt'], train['Score']

In [6]:
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [7]:
stemmer = SnowballStemmer("english")

In [8]:
def text_process_stm(mess): #ss, swr
    result = ''.join([i for i in mess if not i.isdigit()])
    nopunc = result.translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words("english"))
    nopunc2 = [word for word in nopunc.split() if word.lower() not in stop_words]
    nopunc2 = [word.lower() for word in nopunc2]
    nopunc2 = [word for word in nopunc2 if len(word) != 1]
    return [stemmer.stem(word) for word in nopunc2]

In [9]:
# from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# vect = CountVectorizer(analyzer = text_process_stm).fit(train['SumTxt'])

In [11]:
# feature_names = vect.get_feature_names()

In [12]:
# print("First 2000 features:\n{}".format(feature_names[:20000]))

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tfidfvect = TfidfVectorizer(analyzer = text_process_stm)

In [15]:
svmvecttrain = tfidfvect.fit_transform(train_text)
svmvecttest = tfidfvect.transform(test_text)

In [16]:
from sklearn.svm import LinearSVC

In [17]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'loss': ['hinge']}

In [18]:
grid = GridSearchCV(LinearSVC(), param_grid, n_jobs = 6)

In [19]:
grid.fit(svmvecttrain, train_score)



GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=6,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'loss': ['hinge']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [20]:
print(grid.best_params_)

{'C': 10, 'loss': 'hinge'}


In [21]:
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, grid.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))

0.639 (+/-0.000) for {'C': 0.001, 'loss': 'hinge'}
0.677 (+/-0.000) for {'C': 0.01, 'loss': 'hinge'}
0.727 (+/-0.002) for {'C': 0.1, 'loss': 'hinge'}
0.755 (+/-0.001) for {'C': 1, 'loss': 'hinge'}
0.768 (+/-0.003) for {'C': 10, 'loss': 'hinge'}
0.758 (+/-0.002) for {'C': 100, 'loss': 'hinge'}
0.713 (+/-0.055) for {'C': 1000, 'loss': 'hinge'}


In [22]:
pred = grid.predict(svmvecttest)

In [23]:
print(classification_report(test_score, pred))

              precision    recall  f1-score   support

           1       0.70      0.75      0.72      7838
           2       0.54      0.32      0.40      4471
           3       0.56      0.42      0.48      6399
           4       0.59      0.32      0.41     12062
           5       0.83      0.95      0.88     54498

    accuracy                           0.77     85268
   macro avg       0.64      0.55      0.58     85268
weighted avg       0.75      0.77      0.75     85268



In [29]:
print(pred[:10])
print(pred[1])
print(pred[0])
print(pred[3])
print(pred[5])

[5 4 5 5 5 4 5 5 5 5]
4
5
5
4
