In [19]:
import glob
import re
import numpy as np
import pandas as pd
from scipy import sparse
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import nltk

# Parse Reviews (Tokenizers)

In [5]:
def removeAndExpand(text): 
    text.lower()
    text = re.sub(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)", "", text)
    text = re.sub(r"(<br\s*/><br\s*/>)|(\-)|(\/)", '', text)
    text = re.sub(r"[^A-Za-z0-9(),!?@&$\'\`\"\_\n]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"cannot", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = text.replace('&', ' and')
    text = text.replace('@', ' at')
    text = text.replace('$', ' dollar')
    return text.split()

def stopWordRemover(textArr): 
    stopWords = set(stopwords.words('english'))
    return [s.lower() for s in textArr if s not in stopWords]

lemma = lambda u: list(map(lambda v: WordNetLemmatizer().lemmatize(PorterStemmer().stem(v)), u))

bigramParser = lambda u: re.sub("[^a-zA-Z]", " ", u.lower()).split()

# Parse Passages

In [6]:
xPositive = []
xNegative = []
xSeq = []
yContinuous = []
yDiscrete = []

for s in ['pos', 'neg']:
    files = glob.glob('./comp-551-imbd-sentiment-classification/train/'+ s +'/*.txt')
    files.sort(key=lambda x: int(x.split('/')[-1].split('_')[0]))
    for name in files:
        yContinuous.append(int(name.split('_')[1].replace('.txt', '')))
        xSeq.append(int(name.split('/')[-1].split('_')[0]))
        if s == 'pos':
            yDiscrete.append(1)
        else:
            yDiscrete.append(0)
        with open(name) as f:
            text = f.read()
            text.lower()
            text = re.sub(r"[^A-Za-z]", " ", text)
            if s == 'pos':
                xPositive.append(text)
            else:
                xNegative.append(text)

# Utility Function

In [7]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [8]:
df = pd.DataFrame([[x, yd, yc] for x, yd, yc in zip(xPositive + xNegative, yDiscrete, yContinuous)], columns=['text', 'y discrete', 'y continuous'])
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['y discrete'], train_size=0.8, test_size=0.2, random_state=223)

# Experiment With Logistic Regression

In [13]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

## Count

In [22]:
commentLrgAPipeline = Pipeline([
    ('tfidf', CountVectorizer()),
    ('lrg', LogisticRegression())
])

commentLrgAPipeline.fit(X_train, y_train)
y_pred = commentLrgAPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8826439738769049


### Cross Validation

In [27]:
commentLrgCVAPipeline = Pipeline([
    ('tfidf', CountVectorizer()),
    ('lrg', LogisticRegression())
])

gridSearchA = GridSearchCV(commentLrgCVAPipeline, {
    'tfidf__ngram_range': [(1, 1), (1, 2)], 
    'tfidf__max_df': [0.5, 0.83, 1], 
    'lrg__C': [0.5, 1, 66, 6666]
}, cv=2, verbose=10, n_jobs=-1)

gridSearchA.fit(X_train, y_train)

Fitting 2 folds for each of 24 candidates, totalling 48 fits
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ........
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ........
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8684, total=   7.3s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8705, total=   7.7s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .......
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .......
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8688, total=  11.3s
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8692, total=  11.1s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) .......
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) .......
[

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   37.2s


[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ..........
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8859, total=  29.5s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ..........
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5715, total=   6.4s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ..........
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5489, total=   6.1s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ..........
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.8833, total=  33.2s
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.883, total=  33.4s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ..........


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.1min


[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ..........
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.7021, total=  18.8s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6935, total=  18.5s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ..........
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ..........
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.869, total=   7.3s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8663, total=   7.3s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8658, total=   8.7s
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8673, total=   8.6s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) .........
[CV] lrg__C=1, tfidf__max_df=0.83

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.9min


[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ............
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8848, total=  32.0s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ............
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5697, total=   7.2s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ............
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5493, total=   6.7s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ............
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.8835, total=  37.3s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.8838, total=  38.0s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.7, total=  16.5s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf_

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.4min


[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .........
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8606, total=   8.2s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ........
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8567, total=   8.1s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ........
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8595, total=   9.2s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) ........
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8574, total=   9.2s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) ........
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8821, total=  32.6s
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8814, total=  32.0s
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ...........
[CV] lrg__C=66, tfidf__max

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.6min


[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.8823, total=  45.9s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8397, total=   8.8s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .......
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8435, total=   9.7s
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6086, total=  21.7s
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6689, total=  21.9s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .......
[CV] lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ......
[CV] lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ......
[CV]  lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8449, total=  10.6s
[CV]  lrg__C=6666, tfidf

[Parallel(n_jobs=-1)]: Done  46 out of  48 | elapsed:  4.8min remaining:   12.6s


[CV]  lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6614, total=  15.3s
[CV]  lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6078, total=  14.9s


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.0min finished


AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [28]:
report(gridSearchA.cv_results_, n_top=20)

Model with rank: 1
Mean validation score: 0.885 (std: 0.001)
Parameters: {'lrg__C': 0.5, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model with rank: 2
Mean validation score: 0.884 (std: 0.000)
Parameters: {'lrg__C': 1, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model with rank: 3
Mean validation score: 0.884 (std: 0.000)
Parameters: {'lrg__C': 1, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2)}

Model with rank: 4
Mean validation score: 0.883 (std: 0.000)
Parameters: {'lrg__C': 0.5, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2)}

Model with rank: 5
Mean validation score: 0.882 (std: 0.000)
Parameters: {'lrg__C': 66, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2)}

Model with rank: 6
Mean validation score: 0.882 (std: 0.000)
Parameters: {'lrg__C': 66, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model with rank: 7
Mean validation score: 0.875 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model wi

In [31]:
commentLrgAFPipeline = Pipeline([
    ('tfidf', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
    ('lrg', LogisticRegression(C=0.5))
])

commentLrgAFPipeline.fit(X_train, y_train)
y_pred = commentLrgAFPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8986593059936909


## TF-IDF

In [8]:
commentLrgBPipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lrg', LogisticRegression())
])

commentLrgBPipeline.fit(X_train, y_train)
y_pred = commentLrgBPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8933569879755568


### Cross Validation

In [32]:
commentLrgCVBPipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lrg', LogisticRegression())
])

gridSearch = GridSearchCV(commentLrgCVBPipeline, {
    'tfidf__sublinear_tf': [True, False], 
    'tfidf__ngram_range': [(1, 1), (1, 2)], 
    'tfidf__norm': ['l1', 'l2'], 
    'tfidf__max_df': [0.5, 0.83, 1], 
    'lrg__C': [0.5, 1, 66, 6666]
}, cv=2, verbose=10, n_jobs=-1)

gridSearch.fit(X_train, y_train)
report(gridSearch.cv_results_, n_top=20)

Fitting 2 folds for each of 96 candidates, totalling 192 fits
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8332, total=   6.5s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8242, total=   6.6s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, t

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   20.0s


[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8726, total=   6.5s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8715, total=   6.5s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8673, total=   6.4s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8481, total=  24.1s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfi

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   55.9s


[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.837, total=  24.3s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8389, total=  24.4s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8748, total=  32.4s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8737, total=  32.2s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfi

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.9min


[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.7874, total=   8.5s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.7811, total=   8.4s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8746, total=   7.4s
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8723, total=   7.4s
[CV] lrg__C=0.5, tfidf__max_df=0

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.1min


[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8391, total=  24.6s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8455, total=  25.2s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8072, total=  24.9s
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8102, total=  25.3s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=0

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.6min


[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.5569, total=   7.1s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.5519, total=   6.9s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.5478, total=   6.3s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.5601, total=   6.4s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1,

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.2min


[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6826, total=  20.0s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6894, total=  20.5s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.6985, total=  21.3s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.7067, total=  21.8s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1,

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  5.0min


[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8795, total=   7.0s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8755, total=   6.9s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.848, total=  28.3s
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8523, total=  28.5s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  6.3min


[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8246, total=   8.2s
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8137, total=   8.0s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.7964, total=   8.1s
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.7898, total=   8.0s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  7.9min


[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8737, total=  31.9s
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8753, total=  32.1s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.5569, total=   6.2s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf

[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  8.6min


[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6827, total=  19.2s
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6895, total=  19.5s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.7067, total=  19.3s
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.6984, total=  19.3s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__nor

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 10.1min


[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8725, total=  27.1s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8631, total=  26.4s
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8668, total=  26.7s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8958, total=  28.8s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 11.1min


[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8682, total=  30.3s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8705, total=  30.6s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8553, total=  30.6s
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8588, total=  30.9s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=66, tfidf__max_df=0.83, tfid

[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 13.3min


[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.6979, total=  21.8s
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6989, total=  22.8s
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6944, total=  22.7s
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.7059, total=  21.8s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), 

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 14.8min


[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8923, total=  30.3s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8928, total=  31.1s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8968, total=  38.4s
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.898, total=  38.9s
[CV] lrg__C=6666, tfidf__max_df=0

[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 17.3min


[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8966, total=  28.7s
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8962, total=  29.3s
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.56, total=   5.7s
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__nor

[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 18.5min finished


Model with rank: 1
Mean validation score: 0.898 (std: 0.000)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}

Model with rank: 2
Mean validation score: 0.898 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': False}

Model with rank: 3
Mean validation score: 0.897 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}

Model with rank: 4
Mean validation score: 0.897 (std: 0.001)
Parameters: {'lrg__C': 66, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}

Model with rank: 5
Mean validation score: 0.896 (std: 0.000)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': False}

Model with rank: 6
Mean validati

In [7]:
commentLrgBFPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True)),
    ('lrg', LogisticRegression(C=6666))
])

commentLrgBFPipeline.fit(X_train, y_train)
y_pred = commentLrgBFPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.9131807419100236


## Remove Stopword

In [25]:
commentLrgCPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True, tokenizer = lambda x: stopWordRemover(x.split()))),
    ('lrg', LogisticRegression(C=6666))
])

commentLrgCPipeline.fit(X_train, y_train)
y_pred = commentLrgCPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.9028052153299091


## Lemma and Stemma

In [26]:
commentLrgDPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True, tokenizer = lambda x: lemma(x.split()))),
    ('lrg', LogisticRegression(C=6666))
])

commentLrgDPipeline.fit(X_train, y_train)
y_pred = commentLrgDPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.9134387351778656


# Linear SVM

In [11]:
from sklearn import svm

## Count

In [35]:
commentSvmAPipeline = Pipeline([
    ('tfidf', CountVectorizer()),
    ('lrg', svm.LinearSVC())
])

commentSvmAPipeline.fit(X_train, y_train)
y_pred = commentSvmAPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8684419713831479


In [36]:
commentSvmCVAPipeline = Pipeline([
    ('tfidf', CountVectorizer()),
    ('lrg', svm.LinearSVC())
])

gridSearchAS = GridSearchCV(commentSvmCVAPipeline, {
    'tfidf__ngram_range': [(1, 1), (1, 2)], 
    'tfidf__max_df': [0.5, 0.83, 1], 
    'lrg__C': [0.5, 1, 66, 6666]
}, cv=2, verbose=10, n_jobs=-1)

gridSearchAS.fit(X_train, y_train)

Fitting 2 folds for each of 24 candidates, totalling 48 fits
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ........
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ........
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ........
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8496, total=   8.6s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8539, total=   8.6s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .......
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .......
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8543, total=  12.6s
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8484, total=  12.4s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) .......
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) .......
[

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   50.0s


[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8809, total=  39.3s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ..........
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ..........
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5667, total=   9.0s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5474, total=   8.9s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ..........
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ..........
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.88, total=  48.0s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ..........
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.879, total=  48.8s


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.4min


[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) ..........
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6558, total=  22.0s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6834, total=  22.5s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ..........
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) ..........
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8521, total=  10.6s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8468, total=  10.5s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.853, total=  12.1s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) .........
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8478, total=  12.1s
[CV] lrg__C=1, tfidf__max_df=0.83

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.5min


[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8809, total=  46.3s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ............
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ............
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5561, total=   7.2s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ............
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), score=0.5467, total=   7.2s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2) ............
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.8799, total=  46.7s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.8789, total=  47.6s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .........
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8498, total=   8.9s
[CV] lrg__C=66, tfidf__max_df=0.5, 

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  3.1min


[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .........
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6263, total=  21.9s
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.6754, total=  22.5s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ........
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ........
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8505, total=   9.7s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) ........
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8465, total=  10.3s
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2) ........
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8788, total=  23.7s
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 1) ...........
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), score=0.8811, total=  24.8s
[CV] lrg__C=66, tfidf__max_df=1,

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.0min


[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), score=0.879, total=  28.1s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1) .......
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8499, total=   7.1s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .......
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), score=0.8445, total=   6.9s
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2) .......
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.5377, total=  26.7s
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.5705, total=  27.3s
[CV] lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ......
[CV] lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1) ......
[CV]  lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), score=0.8465, total=   8.7s
[CV]  lrg__C=6666, tfidf_

[Parallel(n_jobs=-1)]: Done  46 out of  48 | elapsed:  5.3min remaining:   13.9s


[CV]  lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.5323, total=  27.9s
[CV]  lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 2), score=0.5651, total=  28.5s


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  5.5min finished


Model with rank: 1
Mean validation score: 0.898 (std: 0.000)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}

Model with rank: 2
Mean validation score: 0.898 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': False}

Model with rank: 3
Mean validation score: 0.897 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}



In [41]:
report(gridSearchAS.cv_results_, n_top=20)

Model with rank: 1
Mean validation score: 0.880 (std: 0.001)
Parameters: {'lrg__C': 0.5, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model with rank: 1
Mean validation score: 0.880 (std: 0.001)
Parameters: {'lrg__C': 66, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model with rank: 1
Mean validation score: 0.880 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model with rank: 4
Mean validation score: 0.880 (std: 0.001)
Parameters: {'lrg__C': 1, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

Model with rank: 5
Mean validation score: 0.879 (std: 0.001)
Parameters: {'lrg__C': 0.5, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2)}

Model with rank: 6
Mean validation score: 0.879 (std: 0.001)
Parameters: {'lrg__C': 1, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2)}

Model with rank: 6
Mean validation score: 0.879 (std: 0.000)
Parameters: {'lrg__C': 66, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2)}

Model wi

In [42]:
commentLrgAFPipeline = Pipeline([
    ('tfidf', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
    ('lrg', svm.LinearSVC(C=0.5))
])

commentLrgAFPipeline.fit(X_train, y_train)
y_pred = commentLrgAFPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8915140775743255


## TF-IDF

In [37]:
commentSvmBPipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lrg', svm.LinearSVC())
])

commentSvmBPipeline.fit(X_train, y_train)
y_pred = commentSvmBPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8939123537576839


In [38]:
commentSvmCVBPipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lrg', svm.LinearSVC())
])

gridSearch = GridSearchCV(commentSvmCVBPipeline, {
    'tfidf__sublinear_tf': [True, False], 
    'tfidf__ngram_range': [(1, 1), (1, 2)], 
    'tfidf__norm': ['l1', 'l2'], 
    'tfidf__max_df': [0.5, 0.83, 1], 
    'lrg__C': [0.5, 1, 66, 6666]
}, cv=2, verbose=10, n_jobs=-1)

gridSearch.fit(X_train, y_train)
report(gridSearch.cv_results_)

Fitting 2 folds for each of 96 candidates, totalling 192 fits
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8474, total=   8.0s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8418, total=   8.2s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, t

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   23.8s


[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8817, total=   7.6s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8875, total=   7.5s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8808, total=   7.5s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8546, total=  32.2s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfi

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.1min


[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8421, total=  31.8s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8402, total=  31.4s
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8931, total=  22.9s
[CV]  lrg__C=0.5, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8941, total=  23.2s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfi

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min


[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8359, total=   7.9s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8275, total=   7.7s
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8192, total=   7.5s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8888, total=   6.4s
[CV] lrg__C=0.5, tfidf__max_df=0

[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.0min


[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8418, total=  22.2s
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8485, total=  22.6s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.815, total=  22.3s
[CV] lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8158, total=  22.7s
[CV] lrg__C=0.5, tfidf__max_df=0.

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.3min


[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.5539, total=   5.8s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.5578, total=   5.8s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.5536, total=   5.8s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.56, total=   5.8s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.5478, total=   5.7s
[CV] lrg__C=0.5, tfidf__max_d

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min


[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6842, total=  19.8s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6894, total=  20.0s
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.6979, total=  21.3s
[CV]  lrg__C=0.5, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.7066, total=  21.5s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1,

[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  4.7min


[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8782, total=   6.8s
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8842, total=   6.9s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8526, total=  25.7s
[CV]  lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8562, total=  26.0s
[CV] lrg__C=1, tfidf__max_df=0.5, tfidf__ngram_range=(1,

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.9min


[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8542, total=   7.1s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8466, total=   6.8s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8454, total=   6.6s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8379, total=   6.6s
[CV] lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_

[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  7.4min


[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8948, total=  28.8s
[CV]  lrg__C=1, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.895, total=  29.3s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.5595, total=   5.5s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=

[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  8.1min


[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6871, total=  18.9s
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.6905, total=  19.2s
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.7062, total=  17.8s
[CV]  lrg__C=1, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.6979, total=  17.6s
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 1), tfidf__nor

[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 10.3min


[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8861, total= 1.1min
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8908, total= 1.1min
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8882, total= 1.1min
[CV] lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8991, total= 1.0min
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram

[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 12.1min


[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8899, total= 1.2min
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.8903, total= 1.2min
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8856, total= 1.2min
[CV] lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8859, total= 1.2min
[CV] lrg__C=66, tfidf__max_df=0.83, tfid

[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 15.4min


[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.7103, total=  31.4s
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.7011, total=  30.8s
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.711, total=  31.1s
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV] lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.6645, total=  30.5s
[CV]  lrg__C=66, tfidf__max_df=1, tfidf__ngram_range=(1, 2), tfid

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 18.0min


[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True 
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8931, total= 1.3min
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l1, tfidf__sublinear_tf=False, score=0.8918, total= 1.3min
[CV] lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False 
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8973, total=  24.5s
[CV]  lrg__C=6666, tfidf__max_df=0.5, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8994, total=  26.0s
[CV] lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=6666, tfidf__max_df

[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 21.0min


[CV]  lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=True, score=0.8971, total=  29.7s
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True 
[CV]  lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8974, total=  30.8s
[CV]  lrg__C=6666, tfidf__max_df=0.83, tfidf__ngram_range=(1, 2), tfidf__norm=l2, tfidf__sublinear_tf=False, score=0.8977, total=  29.8s
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV] lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=False 
[CV]  lrg__C=6666, tfidf__max_df=1, tfidf__ngram_range=(1, 1), tfidf__norm=l1, tfidf__sublinear_tf=True, score=0.5253, total=   6.4s
[CV] lrg__C=6666, tfidf__max_df=1, tfid

[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 22.5min finished


Model with rank: 1
Mean validation score: 0.899 (std: 0.001)
Parameters: {'lrg__C': 66, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}

Model with rank: 2
Mean validation score: 0.899 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.83, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}

Model with rank: 3
Mean validation score: 0.898 (std: 0.001)
Parameters: {'lrg__C': 66, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}

Model with rank: 3
Mean validation score: 0.898 (std: 0.001)
Parameters: {'lrg__C': 6666, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}



In [43]:
commentSvmBFPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True)),
    ('lrg', svm.LinearSVC(C=66))
])

commentSvmBFPipeline.fit(X_train, y_train)
y_pred = commentSvmBFPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.9135071090047393


## Remove Stopword

In [12]:
commentSvmCPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True, tokenizer = lambda x: stopWordRemover(x.split()))),
    ('lrg', svm.LinearSVC())
])

commentSvmCPipeline.fit(X_train, y_train)
y_pred = commentSvmCPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.9018525817895151


## Lemma and Stemma

In [51]:
commentSvmDPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True, tokenizer = lambda x: lemma(x.split()))),
    ('lrg', svm.LinearSVC())
])

commentSvmDPipeline.fit(X_train, y_train)
y_pred = commentSvmDPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.9147592738752961


# NBLogisticRegression

In [14]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted

In [15]:
class NBWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1, dual=False, fit_intercept=True, penalty='l2', alpha=1):
        self.classifier = None
        self.X_ = None
        self.y_ = None
        self.r = None
        self.alpha = alpha
        self.C = C
        self.dual = dual
        self.fit_intercept = fit_intercept
        self.penalty = penalty

    def fit(self, X, y):
        X, y = check_X_y(X, y, accept_sparse=True)
        p = self.alpha + X[y == 1].sum(0)
        q = self.alpha + X[y == 0].sum(0)
        p_vn = p / (np.linalg.norm(p))
        q_vn = q / (np.linalg.norm(q))
        self.r = sparse.csr_matrix(np.log(p_vn / q_vn))
        X2 = X.multiply(self.r)
        self.classifier = LogisticRegression(C=self.C, dual=self.dual, fit_intercept=self.fit_intercept, penalty=self.penalty).fit(X2, y)
        self.X_ = X2
        self.y_ = y
        return self

    def predict(self, X):
        check_is_fitted(self, ['X_', 'y_'])
        return self.classifier.predict(X.multiply(self.r))

In [17]:
unionRPipeline = FeatureUnion([
    ('bigram', CountVectorizer()),
    ('unigram', CountVectorizer())
])

commentNBRLrgPipeline = Pipeline([
    ('union', unionRPipeline),
    ('lrg', NBWrapper(C=66, alpha=1))
])

In [18]:
commentNBRLrgPipeline.fit(X_train, y_train)
y_pred = commentNBRLrgPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.875544986127626


In [30]:
unionPipeline = FeatureUnion([
    ('bigram', CountVectorizer(ngram_range=(2, 3), tokenizer=bigramParser, max_df=0.95)),
    ('unigram', CountVectorizer(ngram_range=(1, 1), max_df=0.8))
])

commentNBLrgPipeline = Pipeline([
    ('union', unionPipeline),
    ('lrg', NBWrapper(C=66, alpha=1))
])

## For Cross validation, goto NBLrgCV.py

In [21]:
commentNBLrgPipeline.fit(X_train, y_train)
y_pred = commentNBLrgPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.9174709474098878


# Final submission

In [31]:
import csv
outFiles = glob.glob('./comp-551-imbd-sentiment-classification/test/*.txt')
outFiles.sort(key=lambda x: int(x.split('/')[-1].split('.txt')[0]))
xOut = []
for outFile in outFiles:
    with open(outFile) as f:
        xOut.append(f.read())

outDf = pd.DataFrame(xOut, columns=['text'])

print(">>> Start")
commentNBLrgPipeline.fit(df['text'], df['y discrete'])
print(">>> Predict")
print(">>> Logistic regression")
y_pred = commentNBLrgPipeline.predict(outDf['text'])
print(">>> FOut")
print(">>> File Out")
with open('out_final.csv', 'w') as csvfile:
    fieldnames = ['Id', 'Category']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for c in range(len(y_pred)):
        writer.writerow({'Id': c, 'Category': y_pred[c]})
print(">>> Finish")

>>> Start
>>> Predict
>>> Logistic regression
>>> FOut
>>> File Out
>>> Finish


# Extra: Remove word by POS tag

In [21]:
def labelRemover(textArr, labels): 
    return [s.lower() for s in textArr if nltk.pos_tag(s)[0][1] not in labels]

## Remove Noun

In [22]:
commentLrgPOSAPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True, tokenizer = lambda x: labelRemover(x.split(), ['NN', 'NNP', 'NNS', 'NNPS']))),
    ('lrg', LogisticRegression())
])

commentLrgPOSAPipeline.fit(X_train, y_train)
y_pred = commentLrgPOSAPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8231118122658252


## Remove Adjective

In [23]:
commentLrgPOSNPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True, tokenizer = lambda x: labelRemover(x.split(), ['JJ', 'JJR', 'JJS']))),
    ('lrg', LogisticRegression())
])

commentLrgPOSNPipeline.fit(X_train, y_train)
y_pred = commentLrgPOSNPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8908125122958882


## Remove Verb

In [24]:
commentLrgPOSVPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True, tokenizer = lambda x: labelRemover(x.split(), ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']))),
    ('lrg', LogisticRegression())
])

commentLrgPOSVPipeline.fit(X_train, y_train)
y_pred = commentLrgPOSVPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8928430209569


## None Removed

In [27]:
commentLrgPOSOPipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.83, ngram_range=(1, 2), norm='l2', sublinear_tf=True)),
    ('lrg', LogisticRegression())
])

commentLrgPOSOPipeline.fit(X_train, y_train)
y_pred = commentLrgPOSOPipeline.predict(X_test)
print(metrics.f1_score(y_test, y_pred))

0.8953925252125766
