# Logistic Regression Exercise

In [169]:
import numpy as np
import pandas as pd

In [170]:
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [171]:
import sqlite3
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [172]:
from sklearn.preprocessing import StandardScaler

In [173]:
data = pd.read_pickle("data")
data.head()

Unnamed: 0,Score,Time,CleanedSummary,CleanedText
0,1,944092800,entertainingl funny,beetlejuic well written movi everyth excel act...
1,1,944438400,modern day fairy tale,twist rumplestiskin captur film star michael k...
2,0,948240000,clamshell edition edited version,alway enjoy movi funni entertain didnt hesit p...
3,1,951523200,bettlejuice bettlejuice bettlejuice,happen say name three time michael keaten star...
4,1,961718400,great product,realli good idea final product outstand use de...


### Logistic regression for BOW model

In [174]:
bow_vect = CountVectorizer()
bow = bow_vect.fit_transform(data['CleanedText'].values)
bow.shape

(122110, 42264)

In [175]:
X=bow[:,:20000]
y=data.Score

In [176]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### building the model using L2 regularization

In [177]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [178]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.821281358338


#### L1 regularization

In [179]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

844


##### Top 50 features

In [187]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(bow_vect.get_feature_names()[i])

Top 50 features
aw
disgust
horribl
disappoint
cancel
ined
flavorless
bland
dissapoint
garbag
horrid
disapoint
bewar
gross
elsewher
drinkabl
inferior
edibl
bare
decept
concept
donat
descript
burnt
embarrass
broke
bad
excit
hope
defect
chalki
crap
fail
china
dissappoint
deceiv
idea
dissatisfi
contact
discard
funki
dull
gag
broken
googl
away
earth
cardboard
guess
advertis


In [188]:
y_pred = clf.predict(X_test)

In [190]:
 from sklearn.metrics import accuracy_score

In [194]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

##### Accuracy and confusion matrix

In [197]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

81.5221248601
[[13409  3064]
 [ 3705 16455]]


6.3821128470403545

##### i.e 844 features out of 20000 selected are important features according to l1 norm

In [21]:
clf = LogisticRegression(C=0.01, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

175


In [22]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

14


##### therefore as C decreases the model begins to uderfit

In [23]:
clf = LogisticRegression(C=10, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

10392


##### by increasing C the model begins to overfit

### Logistic regression for TFIDF model

In [275]:
tf_idf_vect=TfidfVectorizer()
tf_idf=tf_idf_vect.fit_transform(data['CleanedText'].values)
tf_idf.shape

(122110, 42264)

In [276]:
X=bow[:,:20000]
y=data.Score

In [277]:
y.shape

(122110,)

In [278]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### building the model using L2 regularization

In [279]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [280]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.820653509131


##### L1 regularization

In [281]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

835


##### Top 50 features

In [282]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(tf_idf_vect.get_feature_names()[i])

Top 50 features
aw
horribl
disgust
ined
disappoint
cancel
flavorless
bland
bewar
dissapoint
decept
gross
inferior
garbag
donat
disapoint
horrid
defect
deceiv
dissappoint
elsewher
descript
concept
hope
bare
earth
edibl
broke
bad
excit
fals
contact
drinkabl
burnt
embarrass
fda
cardboard
broken
away
fail
crap
idea
discard
compart
guess
gritti
china
gag
flat
danger


##### accuracy confusion matrix and log loss

In [283]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

81.5002866268
[[13534  3156]
 [ 3621 16322]]


6.3896536721154265

In [284]:
clf = LogisticRegression(C=0.01, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

176


In [285]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

15


##### by decreasing C model begins to uderfit.

In [286]:
clf = LogisticRegression(C=1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

3718


In [287]:
clf = LogisticRegression(C=10, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

10347


##### By increasing C the model begins to overfit

### W2V model - 100D

In [288]:
avg_w2v_100 = pd.read_pickle("avg_w2v_vec_100")

In [289]:
avg_w2v_100.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.000778,-0.000347,0.000871,-0.000326,0.000397,0.000361,8.1e-05,0.001098,0.000355,8e-06,...,-0.000193,0.000537,0.000594,0.000397,-0.000584,-0.001169,0.000315,-0.00014,4.9e-05,1
1,0.000567,0.000178,0.000499,0.000169,-0.00026,0.000105,-0.000142,0.000246,-0.000276,-0.000637,...,-0.000212,0.00026,0.00058,0.00021,0.000176,0.000737,0.000288,0.000127,0.000436,1
2,-0.000123,-0.000488,9.4e-05,-0.001271,-0.00124,0.000217,0.001422,-0.000463,-0.000512,0.001132,...,0.000842,0.000572,0.000398,7.2e-05,0.000744,0.000469,-9.2e-05,-0.000362,-6.5e-05,0
3,0.00078,-0.000327,0.000449,-0.000747,-0.00013,-0.000132,0.000667,-2.5e-05,-4.2e-05,0.000242,...,-0.000111,0.000417,3e-06,-0.00019,-0.000268,0.000645,4.9e-05,-0.000225,-0.000452,1
4,0.000114,0.001019,-0.000837,6e-05,-0.000303,-0.000609,-0.000855,0.000182,4.4e-05,0.000346,...,-0.000147,-0.000636,0.000251,0.00105,0.000173,-0.000852,-0.000106,0.000595,6.5e-05,1


In [290]:
avg_w2v_100.shape

(122109, 101)

In [299]:
X=avg_w2v_100.iloc[:,:100].values
y=avg_w2v_100.iloc[:,100].values

##### Column standardizing

In [300]:
s = StandardScaler()
X=s.fit_transform(X)

In [301]:
y.shape

(122109,)

In [302]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### Building model using L2 regularization

In [303]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [304]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.707531460705


##### Finding important features using L1 regularization

In [305]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

100


In [306]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(avg_w2v_100.get_feature_names()[i])

Top 50 features
abandon
aaaahhhhhh
aboard
abc
aarrgh
abpv
abid
aauc
abcess
abit
abdi
abod
aamazon
abbott
abalon
aad
abrad
abottl
abnoxi
abotu
abound
abita
aasanfood
aaa
abbrevi
aarti
abba
aap
aaah
abb
abel
abouy
aborb
abouut
abbazabba
ablaz
aaaaaaarrrrrggghhh
aadult
aback
abraham
abe
abd
aaaarrrrghh
aachen
abnorm
abosolut
abouit
aardvark
abov
aaaaaah


In [307]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

70.7886331996
[[11361  4990]
 [ 5711 14571]]


10.089373316486858

##### 99 out of 100 features are important

In [308]:
clf = LogisticRegression(C=0.01, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

94


In [309]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

55


In [310]:
clf = LogisticRegression(C=0.0001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

0


##### as C decreases, the model begins to underfit

In [311]:
clf = LogisticRegression(C=10, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

100


##### As C increases, the model begins to overfit

### W2V 200D

In [262]:
avg_w2v_200 = pd.read_pickle("avg_w2v_vec_200")

In [263]:
X=avg_w2v_200.iloc[:,:200].values
y=avg_w2v_200.iloc[:,200].values

##### Column Standardizing

In [264]:
s = StandardScaler()
X=s.fit_transform(X)

In [265]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### Building model using L2 regularization

In [266]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [267]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.756094231977


##### L1 regularization

In [268]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

196


In [269]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(avg_w2v_200.get_feature_names()[i])

Top 50 features
abandon
absens
accompli
accidentley
aboard
abdi
aarrgh
absout
absoulut
absolutley
abroadway
abod
absolutey
acciugh
abuelita
accod
abstract
acceptalbl
absorb
abc
absolutelt
acceler
accent
abcess
accordng
aad
acccompani
abound
acadami
absinthett
abit
abund
aamazon
abswer
abysm
abpv
abid
abottl
abalon
accompain
abbrevi
abbott
aback
abut
abnoxi
abotu
abraham
acceptal
absolout
abb


In [270]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

75.6258018726
[[12567  4227]
 [ 4702 15137]]


8.4186524209667315

##### As C decreases the model begins to underfit

In [271]:
clf = LogisticRegression(C=0.01, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

173


In [272]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

93


In [273]:
clf = LogisticRegression(C=0.0001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

0


In [274]:
clf = LogisticRegression(C=10, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

200


##### As C increases the model begins to overfit

### AVG W2V 300D 

In [250]:
avg_w2v_300 = pd.read_pickle("avg_w2v_vec_300")

In [251]:
X=avg_w2v_300.iloc[:,:300].values
y=avg_w2v_300.iloc[:,300].values

##### Column standardizing

In [252]:
s = StandardScaler()
X=s.fit_transform(X)

In [253]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### Building model using L2 regularization

In [254]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [255]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.780307373134


##### L1 regularization

In [256]:
clf = LogisticRegression(C=0.01, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

262


In [257]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(avg_w2v_300.get_feature_names()[i])

Top 50 features
abandon
accompli
accidentley
absens
abroadway
absorb
absolutey
aboard
absoulut
abdi
acceptalbl
acceler
accual
acadami
acdept
aarrgh
abstract
acrospir
aad
achiva
acetaminophen
absolutelt
abod
ackward
absout
aamazon
abuelita
absinthett
acciugh
accod
accoutr
acini
acidosi
acoust
absolutley
abcess
actii
aceton
abe
abit
acccompani
acknowledg
abbrevi
acehardwareoutlet
aadult
abnoxi
abbott
aback
acount
accompain


In [258]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

77.9270057052
[[12976  3932]
 [ 4154 15571]]


7.623832813500746

In [259]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

125


In [260]:
clf = LogisticRegression(C=0.0001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

0


##### As C decreases, model begins to underfit

In [261]:
clf = LogisticRegression(C=10, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

300


##### As C increases the model begins to overfit

### TFIDF Weighted W2V 100D

In [234]:
tfidf_avg_w2v_100 = pd.read_pickle("tfidf_avg_vec_100")

In [235]:
tfidf_avg_w2v_100.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.001005,-0.000993,-0.000176,-1.6e-05,-0.000637,0.001346,0.000667,0.00151,-0.000133,-0.000779,...,0.001775,-2.5e-05,-0.001377,0.00055,-0.000228,-0.001434,-0.000971,0.000378,-0.001934,1
1,0.000447,4.7e-05,-5.1e-05,2e-06,0.000638,0.000663,-0.001048,-0.000657,0.000902,-0.000835,...,-0.000874,-7.6e-05,-0.000138,0.000227,0.00057,-1.4e-05,-0.000653,0.000828,-0.000387,1
2,0.002199,0.000842,-0.001726,-8.4e-05,-0.000796,0.000935,0.000353,5.8e-05,0.001372,-0.000609,...,-0.000939,0.000222,0.000223,-0.000879,-0.001259,-0.001296,0.000505,-0.000455,0.001263,0
3,0.000479,-0.000181,0.000898,0.000729,-0.001107,0.000479,-0.000218,-0.000634,-0.000136,-0.000369,...,-0.000357,0.000199,0.000379,0.00125,-0.000416,0.000148,0.000612,-1.2e-05,0.000545,1
4,0.001807,0.000264,-0.001362,-0.001132,-0.001522,1.2e-05,0.001637,-0.000874,0.000608,0.000235,...,0.001261,0.000399,-0.000921,0.001763,-0.000563,-0.001746,0.001239,0.001081,0.000124,1


In [236]:
X=tfidf_avg_w2v_100.iloc[:,:100].values
y=tfidf_avg_w2v_100.iloc[:,100].values

##### Column standardizing

In [237]:
s = StandardScaler()
X=s.fit_transform(X)

In [238]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### Building the model using L2 regularization 

In [239]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [240]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.648868506538


##### L1 regularization

In [241]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

98


In [242]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(tfidf_avg_w2v_100.get_feature_names()[i])

Top 50 features
aap
abou
abhor
ab
abotu
abbott
abbazabba
aahhh
abject
aaaarrrrghh
aasanfood
abdomin
abernook
aaah
abid
aaaaallll
abbay
abra
abdomen
abiet
aad
aappubl
aborb
abit
aboutif
aaaahhhhhh
aberr
abalon
abbot
abat
aagh
aachen
abe
abdi
aaaaaahhhhh
abl
abc
aaaaaaaaagghh
abraham
abnorm
aaaaa
aaaallll
aamazon
abba
abbi
aaaaaaaaaaaaaa
abomin
abrad
aback
abolut


In [243]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

64.9086888871
[[ 9843  5404]
 [ 7451 13935]]


12.120272114132865

In [244]:
clf = LogisticRegression(C=0.01, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

95


In [245]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

46


In [246]:
clf = LogisticRegression(C=0.0001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

0


##### As C decreases model begins to underfit

In [247]:
clf = LogisticRegression(C=1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

99


In [249]:
clf = LogisticRegression(C=100, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

100


##### As C increases the model begins to overfit

### TFIDF Weighted W2V 200D

In [214]:
tfidf_avg_w2v_200 = pd.read_pickle("tfidf_avg_vec_200")

In [222]:
X=tfidf_avg_w2v_200.iloc[:,:200].values
y=tfidf_avg_w2v_200.iloc[:,200].values

##### Column Standardizing

In [223]:
s = StandardScaler()
X=s.fit_transform(X)

In [224]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### Building model using L2 regularization

In [225]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [226]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.702399475882


##### L1 regularization

In [227]:
clf = LogisticRegression(C=0.01, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

180


##### Top 50 features

In [232]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(tfidf_avg_w2v_200.get_feature_names()[i])

Top 50 features
accentu
academia
accent
abnoxi
absolout
aborio
abhor
aaaaaah
abroad
aaaaallll
aaaallll
abour
accient
abbey
aachen
abdi
abrubt
acciugh
abrotanum
acadami
absolut
access
absolutley
abdomen
abcstor
abit
abv
abat
absentmind
accept
aargh
accordng
abpv
absurt
accidentley
acclaim
abovi
aah
aamazon
accompain
aaaaaaaaagghh
abou
abil
aboard
aaah
acccompani
abita
abuelita
abbrevi
abottl


##### Accuracy confusion matrix and log loss

In [233]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

70.2290284716
[[11170  5006]
 [ 5900 14557]]


10.282658067527477

In [228]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

199


In [229]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

88


In [230]:
clf = LogisticRegression(C=0.0001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

0


##### As C decreases the model begins to underfit

In [231]:
clf = LogisticRegression(C=1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

200


##### As C increases the model begins to overfit

### TFIDF Weighted W2V 300D

In [198]:
tfidf_avg_w2v_300 = pd.read_pickle("tfidf_avg_vec_300")

In [206]:
X=tfidf_avg_w2v_300.iloc[:,:300].values
y=tfidf_avg_w2v_300.iloc[:,300].values

##### Column Standardizing

In [207]:
s = StandardScaler()
X=s.fit_transform(X)

In [208]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

##### Building the model using L2 regularization

In [209]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [210]:
print(model.best_estimator_)
print(model.score(X_test, y_test))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.708732563536


##### L1 regularization

In [211]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

298


##### 298 out of 300 are important features

##### Top 50 features

In [212]:
idx=(clf.coef_).argsort()[:1,:50]
print('Top 50 features')
for i in idx[0]:
    print(tfidf_avg_w2v_300.get_feature_names()[i])

Top 50 features
aaaarrrrghh
aaaaaaaaaaaaaa
accross
accident
actii
absolutley
aaa
acerb
acclim
achiot
accomplish
acrid
acoupl
acc
abililti
aaaaaaarrrrrggghhh
aafco
acidophilus
abscess
accuraci
acoust
acic
accostum
abus
abe
acdept
accompany
account
achill
abit
ace
abrad
accidentley
acai
accutec
abberlin
accompain
achiev
abstin
abosult
acl
absoluet
acetaia
achiva
absent
acetaminophen
absurd
aboout
abnoxi
abscond


##### Accuracy confusion matrix and log loss

In [213]:
y_pred=clf.predict(X_test)
acc=accuracy_score(y_test,y_pred,normalize=True)*float(100)
print(acc)
print(confusion_matrix(y_test,y_pred).T)
log_loss(y_test, y_pred).T

70.8787159119
[[11342  4925]
 [ 5743 14623]]


10.058260548383938

In [157]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

104


In [158]:
clf = LogisticRegression(C=0.0001, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

0


##### As C decreases the model begins to underfit

In [159]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

298


In [160]:
clf = LogisticRegression(C=1, penalty = 'l1')
clf.fit(X_train, y_train)
w = clf.coef_    ## weight vector
print(np.count_nonzero(w))

300


##### As C increases, the model begins to overfit