In [75]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from nltk.stem.snowball import FrenchStemmer

from pprint import pprint
from time import time
import warnings
import re

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0.1,Unnamed: 0,author,content,sentiment
0,26898,richardepryor,"@treasaint salad stuff, some chillis, whatever...",happiness
1,27635,reese,"@sunnyjamiel sunny, I'm a workin' on it. It's ...",neutral
2,3036,mutedriposte,@jolynnchew so early??,surprise
3,5604,sakizzie_1102,"So now, I have conjunctivitis in my left eye. ...",sadness
4,36111,poptrash,"Out and about in Deal, Kent. More sunshine req...",love


In [4]:
X_train, X_val, y_train, y_val = train_test_split(train.iloc[:,2], train.iloc[:,-1])

In [21]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('lr', LogisticRegression())])

parameters = {'vect__min_df': (5, 7, 10),
              'vect__ngram_range': ((1, 1), (1, 2)),
              'tfidf__use_idf': (True, False),
              'tfidf__norm': ('l1', 'l2'),
              'lr__C': np.logspace(-3, 1.5, 10),
              'lr__penalty': ('l1', 'l2')}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 480 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 17.9min finished


Best score: 0.339
Best parameters set:
	lr__C: 1.0
	lr__penalty: 'l1'
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__min_df: 10
	vect__ngram_range: (1, 1)


In [22]:
accuracy_score(grid_search.predict(X_train), y_train)

0.3925777777777778

In [23]:
accuracy_score(grid_search.predict(X_val), y_val)

0.3688

In [24]:
X_test = pd.read_csv('test.csv')
X_test.head()

Unnamed: 0,id,author,content
0,32823,valicast,Good Morning
1,16298,btb103,I just put my computer up on craigslist. I've ...
2,28505,anavil,in ten minutes shopping demi lovato-back aro...
3,6689,ritwik1st,From twitterberry moved to ubertwitter - suffe...
4,26893,TightFreebies,@thriftymom TEAR*


In [25]:
prediction = grid_search.predict(X_test.iloc[:,2])

In [26]:
csv = pd.DataFrame()
csv["id"] = X_test["id"]
csv["sentiment"] = pd.DataFrame(prediction)
csv.to_csv('14_42.csv', index=None)

**\***

In [117]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('lr', LogisticRegression(class_weight='balanced'))])

parameters = {'vect__min_df': (3, 5, 7),
              'vect__ngram_range': ((1, 1), (1, 2)),
              'tfidf__use_idf': (True, False),
              'tfidf__norm': ('l1', 'l2'),
              'lr__C': np.logspace(-2, 1.5, 15),
              'lr__penalty': ('l1', 'l2')}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 3 folds for each of 720 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 20.5min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed: 29.0min finished


Best score: 0.318
Best parameters set:
	lr__C: 0.05623413251903491
	lr__penalty: 'l2'
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__min_df: 3
	vect__ngram_range: (1, 2)


In [118]:
accuracy_score(grid_search.predict(X_train), y_train)

0.4072

In [119]:
accuracy_score(grid_search.predict(X_val), y_val)

0.3117333333333333

In [52]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

parameters = {
    'vect__max_df': (0.75, 0.85, 0.95),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': np.logspace(-2, 2, 10),
}

grid_search = GridSearchCV(pipeline, parameters, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02]),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.75, 0.85, 0.95),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed: 60.4min finished


done in 3628.198s

Best score: 0.341
Best parameters set:
	clf__C: 0.21544346900318834
	tfidf__norm: 'l2'
	tfidf__use_idf: False
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)


In [53]:
accuracy_score(grid_search.predict(X_train), y_train)

0.8076888888888889

In [54]:
accuracy_score(grid_search.predict(X_val), y_val)

0.37

In [55]:
prediction = grid_search.predict(X_test.iloc[:,2])

In [56]:
csv = pd.DataFrame()
csv["id"] = X_test["id"]
csv["sentiment"] = pd.DataFrame(prediction)
csv.to_csv('21_42.csv', index=None)

In [68]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

parameters = {
    'vect__max_df': (0.75, 0.85),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__norm': ('l1', 'l2'),
    'clf__penalty': ('l1', 'l2'),
    'clf__dual': (False,),
    'clf__C': np.logspace(-3, 2, 15),
}

grid_search = GridSearchCV(pipeline, parameters, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': array([1.00000000e-03, 2.27584593e-03, 5.17947468e-03, 1.17876863e-02,
       2.68269580e-02, 6.10540230e-02, 1.38949549e-01, 3.16227766e-01,
       7.19685673e-01, 1.63789371e+00, 3.72759372e+00, 8.48342898e+00,
       1.93069773e+01, 4.39397056e+01, 1.00000000e+02]),
 'clf__dual': (False,),
 'clf__penalty': ('l1', 'l2'),
 'tfidf__norm': ('l1', 'l2'),
 'vect__max_df': (0.75, 0.85),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 240 candidates, totalling 720 fits


[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed: 50.2min finished


done in 3017.724s

Best score: 0.341
Best parameters set:
	clf__C: 0.31622776601683794
	clf__dual: False
	clf__penalty: 'l1'
	tfidf__norm: 'l2'
	vect__max_df: 0.75
	vect__ngram_range: (1, 2)


In [69]:
accuracy_score(grid_search.predict(X_train), y_train)

0.38804444444444447

In [70]:
accuracy_score(grid_search.predict(X_val), y_val)

0.36973333333333336

In [71]:
prediction = grid_search.predict(X_test.iloc[:,2])

In [72]:
csv = pd.DataFrame()
csv["id"] = X_test["id"]
csv["sentiment"] = pd.DataFrame(prediction)
csv.to_csv('23_19.csv', index=None)

In [95]:
train_mod = pd.read_csv('train.csv')
train_mod['content'] = train_mod['content'].str.strip().str.lower().str.replace(r'([a-zA-Z])\1+', r'\1')
train_mod['content'] = train_mod['content'].str.strip().str.lower().str.replace(r'[@,&]', r'')
train_mod.head()

Unnamed: 0.1,Unnamed: 0,author,content,sentiment
0,26898,richardepryor,treasaint salad stuf some chilis whatever my h...,happiness
1,27635,reese,sunyjamiel suny i'm a workin' on it. it's very...,neutral
2,3036,mutedriposte,jolynchew so early??,surprise
3,5604,sakizzie_1102,so now i have conjunctivitis in my left eye. i...,sadness
4,36111,poptrash,out and about in deal kent. more sunshine requ...,love


In [110]:
X_train_mod, X_val_mod, y_train_mod, y_val_mod = train_test_split(train_mod.iloc[:,2], train_mod.iloc[:,-1])

In [111]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

parameters = {
    'vect__max_df': (0.75, 0.85),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True,),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': np.logspace(-2, 2, 10),
}

grid_search = GridSearchCV(pipeline, parameters, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train_mod, y_train_mod)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02]),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True,),
 'vect__max_df': (0.75, 0.85),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 19.5min finished


done in 1168.866s

Best score: 0.346
Best parameters set:
	clf__C: 0.0774263682681127
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__ngram_range: (1, 1)


In [112]:
accuracy_score(grid_search.predict(X_train_mod), y_train_mod)

0.5400444444444444

In [113]:
accuracy_score(grid_search.predict(X_val_mod), y_val_mod)

0.35546666666666665

In [102]:
X_test_mod = pd.read_csv('test.csv')
X_test_mod['content'] = X_test_mod['content'].str.strip().str.lower().str.replace(r'([a-zA-Z])\1+', r'\1')
X_test_mod['content'] = X_test_mod['content'].str.strip().str.lower().str.replace(r'[@,&]', r'')
X_test_mod.head()

Unnamed: 0,id,author,content
0,32823,valicast,god morning
1,16298,btb103,i just put my computer up on craigslist. i've ...
2,28505,anavil,in ten minutes shoping demi lovato-back arou...
3,6689,ritwik1st,from twiterbery moved to ubertwiter - sufered ...
4,26893,TightFreebies,thriftymom tear*


In [103]:
prediction = grid_search.predict(X_test_mod.iloc[:,2])

In [104]:
csv = pd.DataFrame()
csv["id"] = X_test["id"]
csv["sentiment"] = pd.DataFrame(prediction)
csv.to_csv('00_56.csv', index=None)

In [114]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])

parameters = {
    'vect__max_df': (0.75, 0.85),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True,),
    'tfidf__norm': ('l1', 'l2'),
    'clf__C': np.logspace(-2, 2, 20),
}

grid_search = GridSearchCV(pipeline, parameters, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train_mod, y_train_mod)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': array([1.00000000e-02, 1.62377674e-02, 2.63665090e-02, 4.28133240e-02,
       6.95192796e-02, 1.12883789e-01, 1.83298071e-01, 2.97635144e-01,
       4.83293024e-01, 7.84759970e-01, 1.27427499e+00, 2.06913808e+00,
       3.35981829e+00, 5.45559478e+00, 8.85866790e+00, 1.43844989e+01,
       2.33572147e+01, 3.79269019e+01, 6.15848211e+01, 1.00000000e+02]),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True,),
 'vect__max_df': (0.75, 0.85),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 160 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed: 38.8min finished


done in 2328.618s

Best score: 0.346
Best parameters set:
	clf__C: 0.11288378916846889
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.75
	vect__ngram_range: (1, 1)


In [115]:
accuracy_score(grid_search.predict(X_train_mod), y_train_mod)

0.5921777777777778

In [116]:
accuracy_score(grid_search.predict(X_val_mod), y_val_mod)

0.35306666666666664

In [118]:
X_test_mod = pd.read_csv('test.csv')
X_test_mod['content'] = X_test_mod['content'].str.strip().str.lower().str.replace(r'([a-zA-Z])\1+', r'\1')
X_test_mod['content'] = X_test_mod['content'].str.strip().str.lower().str.replace(r'[@,&,*,#,(,)]', r'')
X_test_mod.head()

Unnamed: 0,id,author,content
0,32823,valicast,god morning
1,16298,btb103,i just put my computer up on craigslist. i've ...
2,28505,anavil,in ten minutes shoping demi lovato-back arou...
3,6689,ritwik1st,from twiterbery moved to ubertwiter - sufered ...
4,26893,TightFreebies,thriftymom tear


In [119]:
prediction = grid_search.predict(X_test_mod.iloc[:,2])

In [121]:
csv = pd.DataFrame()
csv["id"] = X_test["id"]
csv["sentiment"] = pd.DataFrame(prediction)
csv.to_csv('02_06.csv', index=None)