In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPool1D, Conv1D, MaxPool1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

Using TensorFlow backend.


In [2]:
nltk.download("stopwords")
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\len\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# load data
train = pd.read_csv('E://kaggle_train/dataset/train.csv')
test = pd.read_csv('E://kaggle_train/dataset/test.csv')

train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
def multiclass_logloss(label, predicted, eps = 1e-15):
    # Convert 'label' to a binary array if it's not already
    if len(label.shape) == 1:
        actual = np.zeros((label.shape[0], predicted.shape[1]))
        for i, val in enumerate(label):
            actual[i, val] = 1
        label = actual
        
    clip = np.clip(predicted, eps, 1-eps)
    rows = label.shape[0]
    vsota = np.sum(label * np.log(clip))
    return -1.0 / rows * vsota

In [5]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)

In [6]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.text.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [7]:
print (xtrain.shape)
print (xvalid.shape)

(17621,)
(1958,)


In [8]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

In [9]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))



logloss: 0.626 


In [10]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

In [11]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.528 


In [12]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.578 


In [13]:
# Fitting a simple Naive Bayes on Counts
clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.485 


In [14]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [15]:
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.733 


In [16]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.781 


In [17]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.772 


In [18]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.774 


In [19]:
# Fitting a simple xgboost on tf-idf svd features
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.814 


In [20]:
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

In [21]:
# Initialize SVD
svd = TruncatedSVD()
    
# Initialize the standard scaler 
scl = preprocessing.StandardScaler()

# We will use logistic regression here..
lr_model = LogisticRegression()

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])

In [22]:
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10], 
              'lr__penalty': ['l1', 'l2']}

In [23]:
# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)  # we can use the full data here but im only using xtrain
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:   33.6s remaining:   33.6s
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:   40.6s remaining:   24.3s
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:   52.7s remaining:   17.5s
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:   57.5s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   59.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   59.5s finished


Best score: -0.740
Best parameters set:
	lr__C: 10
	lr__penalty: 'l1'
	svd__n_components: 180


In [24]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1240s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.2s finished


Best score: -0.492
Best parameters set:
	nb__alpha: 0.1


In [28]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('E://kaggle_train/dataset/glove.840B.300d.txt','rb')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))



0it [00:00, ?it/s]

977it [00:00, 9673.28it/s]

1744it [00:00, 8939.05it/s]

2648it [00:00, 8969.09it/s]

3552it [00:00, 8963.50it/s]

4589it [00:00, 9343.69it/s]

5589it [00:00, 9504.18it/s]

6599it [00:00, 9675.41it/s]

7701it [00:00, 10015.65it/s]

8807it [00:00, 10278.91it/s]

9816it [00:01, 10221.49it/s]

10857it [00:01, 10246.97it/s]

11866it [00:01, 10047.03it/s]

12861it [00:01, 9957.58it/s] 

13850it [00:01, 9937.20it/s]

14887it [00:01, 10033.99it/s]

15927it [00:01, 10141.05it/s]

16981it [00:01, 10227.67it/s]

18040it [00:01, 10333.74it/s]

19129it [00:01, 10464.31it/s]

20176it [00:02, 10311.40it/s]

21208it [00:02, 10161.64it/s]

22226it [00:02, 9702.03it/s] 

23202it [00:02, 8681.87it/s]

24155it [00:02, 8920.02it/s]

25224it [00:02, 9361.60it/s]

26195it [00:02, 9435.88it/s]

27152it [00:02, 8994.78it/s]

28100it [00:02, 9108.72it/s]

29064it [00:03, 9261.85it/s]

30186it [00:03, 9748.09it/s]

31203it [00:03, 9870.94it/s]

32245it [00:03, 10000.61it/s]

33320it [00:03

263064it [00:28, 7448.52it/s]

263872it [00:28, 7627.35it/s]

264723it [00:28, 7850.52it/s]

265514it [00:28, 7488.37it/s]

266400it [00:28, 7832.27it/s]

267193it [00:28, 7504.25it/s]

267953it [00:28, 7422.37it/s]

268896it [00:28, 7908.83it/s]

269805it [00:29, 8229.64it/s]

270641it [00:29, 6863.21it/s]

271375it [00:29, 6748.63it/s]

272282it [00:29, 7292.27it/s]

273273it [00:29, 7919.88it/s]

274106it [00:29, 7969.39it/s]

275084it [00:29, 8438.04it/s]

275956it [00:29, 8026.58it/s]

276798it [00:29, 8140.69it/s]

277738it [00:30, 8458.68it/s]

278599it [00:30, 8164.83it/s]

279510it [00:30, 8427.12it/s]

280364it [00:30, 8216.47it/s]

281457it [00:30, 8856.09it/s]

282364it [00:30, 8764.07it/s]

283255it [00:30, 8337.58it/s]

284226it [00:30, 8706.76it/s]

285162it [00:30, 8867.68it/s]

286250it [00:31, 9388.62it/s]

287396it [00:31, 9901.18it/s]

288405it [00:31, 9453.32it/s]

289499it [00:31, 9855.09it/s]

290502it [00:31, 9705.60it/s]

291586it [00:31, 9992.48it/s]

292596it

537178it [00:55, 8958.38it/s]

538084it [00:55, 8580.03it/s]

539006it [00:56, 8737.59it/s]

539961it [00:56, 8966.41it/s]

540948it [00:56, 9193.87it/s]

542081it [00:56, 9719.98it/s]

543180it [00:56, 10041.46it/s]

544196it [00:56, 9958.21it/s] 

545233it [00:56, 10048.97it/s]

546244it [00:56, 9718.81it/s] 

547330it [00:56, 10035.16it/s]

548400it [00:56, 10196.54it/s]

549426it [00:57, 9558.80it/s] 

550616it [00:57, 10132.42it/s]

551746it [00:57, 10427.61it/s]

552887it [00:57, 10704.09it/s]

553970it [00:57, 8221.77it/s] 

555097it [00:57, 8926.56it/s]

556075it [00:57, 8701.64it/s]

557006it [00:57, 8305.54it/s]

557912it [00:58, 8518.34it/s]

558810it [00:58, 8626.85it/s]

559697it [00:58, 8284.06it/s]

560627it [00:58, 8541.15it/s]

561497it [00:58, 8512.56it/s]

562380it [00:58, 8580.29it/s]

563315it [00:58, 8797.56it/s]

564266it [00:58, 8974.35it/s]

565200it [00:58, 9054.60it/s]

566217it [00:58, 9362.65it/s]

567159it [00:59, 8901.17it/s]

568166it [00:59, 9222.30it/s

802679it [01:23, 10238.89it/s]

803735it [01:23, 10333.15it/s]

804812it [01:23, 10430.05it/s]

805944it [01:23, 10651.84it/s]

807022it [01:23, 10689.96it/s]

808093it [01:23, 10600.69it/s]

809215it [01:23, 10779.18it/s]

810328it [01:24, 10850.26it/s]

811463it [01:24, 10995.49it/s]

812609it [01:24, 11098.50it/s]

813721it [01:24, 10941.05it/s]

814817it [01:24, 10445.93it/s]

815894it [01:24, 10510.22it/s]

817038it [01:24, 10742.55it/s]

818141it [01:24, 10795.40it/s]

819282it [01:24, 10972.71it/s]

820383it [01:25, 10790.11it/s]

821469it [01:25, 10778.80it/s]

822607it [01:25, 10920.84it/s]

823701it [01:25, 10893.95it/s]

824873it [01:25, 11097.66it/s]

825985it [01:25, 10506.33it/s]

827117it [01:25, 10707.41it/s]

828195it [01:25, 9533.53it/s] 

829177it [01:25, 9316.59it/s]

830130it [01:25, 9136.80it/s]

831183it [01:26, 9488.73it/s]

832174it [01:26, 9611.30it/s]

833197it [01:26, 9760.89it/s]

834184it [01:26, 9793.37it/s]

835220it [01:26, 9928.12it/s]

836351it [01:26

1076433it [01:50, 10103.88it/s]

1077535it [01:50, 10333.17it/s]

1078588it [01:50, 10360.77it/s]

1079725it [01:50, 10159.16it/s]

1080851it [01:50, 10437.03it/s]

1081945it [01:50, 10582.99it/s]

1083008it [01:50, 10055.73it/s]

1084187it [01:51, 10491.88it/s]

1085247it [01:51, 9378.76it/s] 

1086215it [01:51, 8495.07it/s]

1087127it [01:51, 8409.45it/s]

1088200it [01:51, 8970.36it/s]

1089350it [01:51, 9580.14it/s]

1090392it [01:51, 9817.53it/s]

1091554it [01:51, 10269.39it/s]

1092671it [01:51, 10523.94it/s]

1093800it [01:52, 10712.05it/s]

1094884it [01:52, 9762.59it/s] 

1095886it [01:52, 9188.80it/s]

1096914it [01:52, 9464.82it/s]

1097880it [01:52, 9275.67it/s]

1098822it [01:52, 8747.39it/s]

1100063it [01:52, 9574.91it/s]

1101155it [01:52, 9942.30it/s]

1102187it [01:52, 10023.39it/s]

1103238it [01:53, 10164.57it/s]

1104320it [01:53, 10323.07it/s]

1105453it [01:53, 10605.84it/s]

1106523it [01:53, 9206.56it/s] 

1107484it [01:53, 7457.28it/s]

1108693it [01:53, 8408

1344294it [02:19, 8963.25it/s]

1345253it [02:19, 9039.10it/s]

1346201it [02:19, 9114.12it/s]

1347222it [02:19, 9391.38it/s]

1348478it [02:19, 10160.35it/s]

1349628it [02:19, 10499.45it/s]

1350706it [02:19, 10520.10it/s]

1351933it [02:19, 10960.87it/s]

1353105it [02:20, 11146.18it/s]

1354270it [02:20, 10731.00it/s]

1355402it [02:20, 10869.76it/s]

1356577it [02:20, 11119.66it/s]

1357804it [02:20, 11409.54it/s]

1358990it [02:20, 11541.04it/s]

1360150it [02:20, 11355.01it/s]

1361360it [02:20, 11568.69it/s]

1362554it [02:20, 11643.48it/s]

1363722it [02:21, 11550.68it/s]

1364880it [02:21, 11524.95it/s]

1366051it [02:21, 11579.84it/s]

1367211it [02:21, 11482.67it/s]

1368465it [02:21, 11780.65it/s]

1369692it [02:21, 11888.66it/s]

1370904it [02:21, 11921.83it/s]

1372098it [02:21, 11513.24it/s]

1373262it [02:21, 11516.70it/s]

1374417it [02:21, 11458.07it/s]

1375574it [02:22, 11491.41it/s]

1376725it [02:22, 10908.59it/s]

1377909it [02:22, 11172.25it/s]

1379158it [02:

1629155it [02:46, 11632.38it/s]

1630323it [02:46, 11577.36it/s]

1631484it [02:46, 11483.98it/s]

1632674it [02:47, 11605.70it/s]

1633837it [02:47, 11543.82it/s]

1634993it [02:47, 11479.85it/s]

1636176it [02:47, 11548.78it/s]

1637366it [02:47, 11651.95it/s]

1638548it [02:47, 11667.21it/s]

1639716it [02:47, 11532.76it/s]

1640914it [02:47, 11629.41it/s]

1642125it [02:47, 11769.53it/s]

1643303it [02:47, 11667.74it/s]

1644471it [02:48, 11636.52it/s]

1645636it [02:48, 11502.64it/s]

1646832it [02:48, 11636.13it/s]

1648075it [02:48, 11829.57it/s]

1649260it [02:48, 11765.19it/s]

1650459it [02:48, 11796.81it/s]

1651727it [02:48, 12048.57it/s]

1653010it [02:48, 12237.71it/s]

1654236it [02:48, 12063.66it/s]

1655445it [02:48, 11893.43it/s]

1656637it [02:49, 11830.51it/s]

1657822it [02:49, 11661.63it/s]

1659050it [02:49, 11840.50it/s]

1660241it [02:49, 11825.94it/s]

1661425it [02:49, 11724.71it/s]

1662599it [02:49, 11522.08it/s]

1663753it [02:49, 10490.07it/s]

1664852it 

1852240it [03:25, 4463.91it/s]

1852953it [03:25, 3431.05it/s]

1853519it [03:25, 2624.32it/s]

1853966it [03:25, 2465.11it/s]

1854344it [03:26, 2369.34it/s]

1854674it [03:26, 2564.19it/s]

1855000it [03:26, 2443.87it/s]

1855295it [03:26, 2098.20it/s]

1855628it [03:26, 2350.11it/s]

1856170it [03:26, 2831.18it/s]

1857390it [03:26, 3675.36it/s]

1858588it [03:26, 4635.00it/s]

1859685it [03:27, 5606.26it/s]

1860610it [03:27, 6344.48it/s]

1861790it [03:27, 7366.16it/s]

1862802it [03:27, 8001.93it/s]

1863975it [03:27, 8845.31it/s]

1865183it [03:27, 9595.02it/s]

1866347it [03:27, 10102.50it/s]

1867546it [03:27, 10603.25it/s]

1868681it [03:27, 10259.44it/s]

1869762it [03:27, 9957.95it/s] 

1870923it [03:28, 10374.11it/s]

1872119it [03:28, 10774.69it/s]

1873369it [03:28, 11240.11it/s]

1874644it [03:28, 11622.28it/s]

1875827it [03:28, 11513.26it/s]

1877026it [03:28, 11618.39it/s]

1878318it [03:28, 11980.47it/s]

1879527it [03:28, 11941.92it/s]

1880729it [03:28, 11720.23it

2135703it [03:52, 10572.41it/s]

2136798it [03:52, 10682.92it/s]

2137996it [03:52, 11011.12it/s]

2139105it [03:52, 10840.51it/s]

2140413it [03:52, 11397.60it/s]

2141565it [03:52, 10914.08it/s]

2142925it [03:52, 11571.83it/s]

2144110it [03:53, 11653.90it/s]

2145290it [03:53, 10798.19it/s]

2146392it [03:53, 10257.09it/s]

2147610it [03:53, 10738.57it/s]

2148812it [03:53, 11093.36it/s]

2149939it [03:53, 10887.35it/s]

2151041it [03:53, 10735.21it/s]

2152125it [03:53, 10247.36it/s]

2153334it [03:53, 10709.82it/s]

2154533it [03:54, 11064.22it/s]

2155709it [03:54, 11231.88it/s]

2156842it [03:54, 10779.03it/s]

2157931it [03:54, 10622.24it/s]

2159002it [03:54, 10163.38it/s]

2160147it [03:54, 10517.94it/s]

2161209it [03:54, 10243.14it/s]

2162342it [03:54, 10517.29it/s]

2163402it [03:54, 10356.56it/s]

2164619it [03:54, 10812.30it/s]

2165710it [03:55, 10377.38it/s]

2166759it [03:55, 10349.28it/s]

2168081it [03:55, 11042.72it/s]

2169204it [03:55, 10686.37it/s]

2170362it 

Found 2196016 word vectors.


In [36]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower().encode('utf-8').decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [37]:
# create sentence vectors using the above function for training and validation set
nltk.download('punkt')

xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\len\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!






  0%|                                                | 0/17621 [00:00<?, ?it/s]





  0%|                                      | 1/17621 [00:00<1:08:25,  4.29it/s]





  0%|                                        | 8/17621 [00:00<49:08,  5.97it/s]





  0%|▏                                      | 85/17621 [00:00<34:22,  8.50it/s]





  2%|▌                                     | 283/17621 [00:00<23:49, 12.13it/s]





  3%|█                                     | 495/17621 [00:00<16:31, 17.28it/s]





  4%|█▌                                    | 753/17621 [00:00<11:25, 24.62it/s]





  6%|██▏                                   | 996/17621 [00:00<07:54, 35.01it/s]





  7%|██▌                                  | 1199/17621 [00:00<05:30, 49.63it/s]





  8%|███                                  | 1440/17621 [00:01<03:50, 70.29it/s]

In [38]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [39]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 1.088 


In [40]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 1.088 


In [41]:
# scale the data before any neural net:
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [42]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [43]:
# create a simple 3 layer sequential neural net
model = Sequential()

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [44]:
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=5, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc))

Instructions for updating:
Use tf.cast instead.
Train on 17621 samples, validate on 1958 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16e17748>

In [45]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
max_len = 70

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [46]:
# create an embedding matrix for the words we have in the dataset
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector







  0%|                                                | 0/25943 [00:00<?, ?it/s]





  0%|▏                                    | 118/25943 [00:00<00:31, 825.17it/s]





 47%|███████████████▉                  | 12136/25943 [00:00<00:11, 1175.29it/s]





 79%|██████████████████████████▋       | 20399/25943 [00:00<00:03, 1668.41it/s]





100%|█████████████████████████████████| 25943/25943 [00:00<00:00, 60899.08it/s]

In [47]:
# A simple LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [48]:
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, verbose=1, validation_data=(xvalid_pad, yvalid_enc))

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100


Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100


Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100


Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100


Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100


Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100


Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100


Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0xf02b32b0>

In [49]:
# A simple LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100




<keras.callbacks.History at 0x106e5af98>

In [50]:
# A simple bidirectional LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100


Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


<keras.callbacks.History at 0x1c03cb6a0>

In [51]:
# GRU with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


<keras.callbacks.History at 0x1c3a05c88>

In [52]:
# this is the main ensembling class. how to use it is in the next cell!
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import os
import sys
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s %(message)s",
    datefmt="%H:%M:%S", stream=sys.stdout)
logger = logging.getLogger(__name__)


class Ensembler(object):
    def __init__(self, model_dict, num_folds=3, task_type='classification', optimize=roc_auc_score,
                 lower_is_better=False, save_path=None):
        """
        Ensembler init function
        :param model_dict: model dictionary, see README for its format
        :param num_folds: the number of folds for ensembling
        :param task_type: classification or regression
        :param optimize: the function to optimize for, e.g. AUC, logloss, etc. Must have two arguments y_test and y_pred
        :param lower_is_better: is lower value of optimization function better or higher
        :param save_path: path to which model pickles will be dumped to along with generated predictions, or None
        """

        self.model_dict = model_dict
        self.levels = len(self.model_dict)
        self.num_folds = num_folds
        self.task_type = task_type
        self.optimize = optimize
        self.lower_is_better = lower_is_better
        self.save_path = save_path

        self.training_data = None
        self.test_data = None
        self.y = None
        self.lbl_enc = None
        self.y_enc = None
        self.train_prediction_dict = None
        self.test_prediction_dict = None
        self.num_classes = None

    def fit(self, training_data, y, lentrain):
        """
        :param training_data: training data in tabular format
        :param y: binary, multi-class or regression
        :return: chain of models to be used in prediction
        """

        self.training_data = training_data
        self.y = y

        if self.task_type == 'classification':
            self.num_classes = len(np.unique(self.y))
            logger.info("Found %d classes", self.num_classes)
            self.lbl_enc = LabelEncoder()
            self.y_enc = self.lbl_enc.fit_transform(self.y)
            kf = StratifiedKFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, self.num_classes)
        else:
            self.num_classes = -1
            self.y_enc = self.y
            kf = KFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, 1)

        self.train_prediction_dict = {}
        for level in range(self.levels):
            self.train_prediction_dict[level] = np.zeros((train_prediction_shape[0],
                                                          train_prediction_shape[1] * len(self.model_dict[level])))

        for level in range(self.levels):

            if level == 0:
                temp_train = self.training_data
            else:
                temp_train = self.train_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):
                validation_scores = []
                foldnum = 1
                for train_index, valid_index in kf.split(self.train_prediction_dict[0], self.y_enc):
                    logger.info("Training Level %d Fold # %d. Model # %d", level, foldnum, model_num)

                    if level != 0:
                        l_training_data = temp_train[train_index]
                        l_validation_data = temp_train[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])
                    else:
                        l0_training_data = temp_train[0][model_num]
                        if type(l0_training_data) == list:
                            l_training_data = [x[train_index] for x in l0_training_data]
                            l_validation_data = [x[valid_index] for x in l0_training_data]
                        else:
                            l_training_data = l0_training_data[train_index]
                            l_validation_data = l0_training_data[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])

                    logger.info("Predicting Level %d. Fold # %d. Model # %d", level, foldnum, model_num)

                    if self.task_type == 'classification':
                        temp_train_predictions = model.predict_proba(l_validation_data)
                        self.train_prediction_dict[level][valid_index,
                        (model_num * self.num_classes):(model_num * self.num_classes) +
                                                       self.num_classes] = temp_train_predictions

                    else:
                        temp_train_predictions = model.predict(l_validation_data)
                        self.train_prediction_dict[level][valid_index, model_num] = temp_train_predictions
                    validation_score = self.optimize(self.y_enc[valid_index], temp_train_predictions)
                    validation_scores.append(validation_score)
                    logger.info("Level %d. Fold # %d. Model # %d. Validation Score = %f", level, foldnum, model_num,
                                validation_score)
                    foldnum += 1
                avg_score = np.mean(validation_scores)
                std_score = np.std(validation_scores)
                logger.info("Level %d. Model # %d. Mean Score = %f. Std Dev = %f", level, model_num,
                            avg_score, std_score)

            logger.info("Saving predictions for level # %d", level)
            train_predictions_df = pd.DataFrame(self.train_prediction_dict[level])
            train_predictions_df.to_csv(os.path.join(self.save_path, "train_predictions_level_" + str(level) + ".csv"),
                                        index=False, header=None)

        return self.train_prediction_dict

    def predict(self, test_data, lentest):
        self.test_data = test_data
        if self.task_type == 'classification':
            test_prediction_shape = (lentest, self.num_classes)
        else:
            test_prediction_shape = (lentest, 1)

        self.test_prediction_dict = {}
        for level in range(self.levels):
            self.test_prediction_dict[level] = np.zeros((test_prediction_shape[0],
                                                         test_prediction_shape[1] * len(self.model_dict[level])))
        self.test_data = test_data
        for level in range(self.levels):
            if level == 0:
                temp_train = self.training_data
                temp_test = self.test_data
            else:
                temp_train = self.train_prediction_dict[level - 1]
                temp_test = self.test_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):

                logger.info("Training Fulldata Level %d. Model # %d", level, model_num)
                if level == 0:
                    model.fit(temp_train[0][model_num], self.y_enc)
                else:
                    model.fit(temp_train, self.y_enc)

                logger.info("Predicting Test Level %d. Model # %d", level, model_num)

                if self.task_type == 'classification':
                    if level == 0:
                        temp_test_predictions = model.predict_proba(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict_proba(temp_test)
                    self.test_prediction_dict[level][:, (model_num * self.num_classes): (model_num * self.num_classes) +
                                                                                        self.num_classes] = temp_test_predictions

                else:
                    if level == 0:
                        temp_test_predictions = model.predict(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict(temp_test)
                    self.test_prediction_dict[level][:, model_num] = temp_test_predictions

            test_predictions_df = pd.DataFrame(self.test_prediction_dict[level])
            test_predictions_df.to_csv(os.path.join(self.save_path, "test_predictions_level_" + str(level) + ".csv"),
                                       index=False, header=None)

        return self.test_prediction_dict

In [53]:
# specify the data to be used for every level of ensembling:
train_data_dict = {0: [xtrain_tfv, xtrain_ctv, xtrain_tfv, xtrain_ctv], 1: [xtrain_glove]}
test_data_dict = {0: [xvalid_tfv, xvalid_ctv, xvalid_tfv, xvalid_ctv], 1: [xvalid_glove]}

model_dict = {0: [LogisticRegression(), LogisticRegression(), MultinomialNB(alpha=0.1), MultinomialNB()],

              1: [xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7)]}

ens = Ensembler(model_dict=model_dict, num_folds=3, task_type='classification',
                optimize=multiclass_logloss, lower_is_better=True, save_path='')

ens.fit(train_data_dict, ytrain, lentrain=xtrain_glove.shape[0])
preds = ens.predict(test_data_dict, lentest=xvalid_glove.shape[0])

[16:21:04] INFO Found 3 classes
[16:21:04] INFO Training Level 0 Fold # 1. Model # 0




[16:21:08] INFO Predicting Level 0. Fold # 1. Model # 0
[16:21:08] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.679328
[16:21:08] INFO Training Level 0 Fold # 2. Model # 0
[16:21:11] INFO Predicting Level 0. Fold # 2. Model # 0
[16:21:11] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.670841
[16:21:11] INFO Training Level 0 Fold # 3. Model # 0
[16:21:13] INFO Predicting Level 0. Fold # 3. Model # 0
[16:21:13] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.672830
[16:21:13] INFO Level 0. Model # 0. Mean Score = 0.674333. Std Dev = 0.003624
[16:21:13] INFO Training Level 0 Fold # 1. Model # 1




[16:21:22] INFO Predicting Level 0. Fold # 1. Model # 1
[16:21:22] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.574757
[16:21:22] INFO Training Level 0 Fold # 2. Model # 1
[16:21:31] INFO Predicting Level 0. Fold # 2. Model # 1
[16:21:31] INFO Level 0. Fold # 2. Model # 1. Validation Score = 0.561418
[16:21:31] INFO Training Level 0 Fold # 3. Model # 1
[16:21:38] INFO Predicting Level 0. Fold # 3. Model # 1
[16:21:38] INFO Level 0. Fold # 3. Model # 1. Validation Score = 0.565262
[16:21:38] INFO Level 0. Model # 1. Mean Score = 0.567146. Std Dev = 0.005607
[16:21:38] INFO Training Level 0 Fold # 1. Model # 2
[16:21:38] INFO Predicting Level 0. Fold # 1. Model # 2
[16:21:38] INFO Level 0. Fold # 1. Model # 2. Validation Score = 0.463231
[16:21:38] INFO Training Level 0 Fold # 2. Model # 2
[16:21:38] INFO Predicting Level 0. Fold # 2. Model # 2
[16:21:38] INFO Level 0. Fold # 2. Model # 2. Validation Score = 0.456515
[16:21:38] INFO Training Level 0 Fold # 3. Model # 2
[16:21:



[16:22:23] INFO Predicting Test Level 0. Model # 0
[16:22:23] INFO Training Fulldata Level 0. Model # 1
[16:22:29] INFO Predicting Test Level 0. Model # 1
[16:22:29] INFO Training Fulldata Level 0. Model # 2
[16:22:29] INFO Predicting Test Level 0. Model # 2
[16:22:29] INFO Training Fulldata Level 0. Model # 3
[16:22:29] INFO Predicting Test Level 0. Model # 3
[16:22:29] INFO Training Fulldata Level 1. Model # 0
[16:22:49] INFO Predicting Test Level 1. Model # 0


In [54]:
# check error:
multiclass_logloss(yvalid, preds[1])

0.42372574824662645