## approaching (almost) any NLP problems 

https://www.kaggle.com/code/abhishek/approaching-almost-any-nlp-problem-on-kaggle/notebook

In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow" # jax or torch

In [2]:
import pandas as pd 
import numpy as np 

import xgboost as xgb 
from tqdm import  tqdm_gui, tqdm_pandas, tqdm_notebook, tqdm

from sklearn.svm import SVC
from sklearn import  preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import  LogisticRegression
from sklearn.model_selection import  train_test_split
from sklearn.naive_bayes import  MultinomialNB

from keras.models import  Sequential
from keras.layers import LSTM, GRU
from keras.layers import Dense, Activation, Dropout
from keras.layers import  Embedding
from keras.layers import BatchNormalization
from keras.utils import  to_categorical
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import  sequence
from keras.callbacks import  EarlyStopping

from tensorflow.keras.preprocessing.text import Tokenizer

from nltk import  word_tokenize
from nltk.corpus import stopwords

stop_words = stopwords.words("english")

In [3]:
# loading datasets

train = pd.read_csv("./input/spooky/train.csv")
test = pd.read_csv("./input/spooky/test.csv")
sample = pd.read_csv("./input/spooky/sample_submission.csv")
print(train.shape, test.shape, sample.shape)

(19579, 3) (8392, 2) (8392, 4)


In [4]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
test.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [6]:
sample.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.403494,0.287808,0.308698
1,id24541,0.403494,0.287808,0.308698
2,id00134,0.403494,0.287808,0.308698
3,id27757,0.403494,0.287808,0.308698
4,id04081,0.403494,0.287808,0.308698


In [7]:
train.author.unique()

array(['EAP', 'HPL', 'MWS'], dtype=object)

In [8]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2
    
    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [9]:
# labelEncoder

lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)
y

array([0, 1, 0, ..., 0, 0, 1])

In [10]:
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.text.values,
    y,
    stratify=y,
    random_state = 42,
    test_size=0.1,
    shuffle=True
)

In [11]:
xtrain.shape

(17621,)

In [12]:
xvalid.shape

(1958,)

In [13]:
xtrain

array(['Her hair was the brightest living gold, and despite the poverty of her clothing, seemed to set a crown of distinction on her head.',
       '"No," he said, "oh, no a member of my family my niece, and a most accomplished woman."',
       'The magistrate appeared at first perfectly incredulous, but as I continued he became more attentive and interested; I saw him sometimes shudder with horror; at others a lively surprise, unmingled with disbelief, was painted on his countenance.',
       ...,
       'The medical testimony spoke confidently of the virtuous character of the deceased.',
       'When we arrived, after a little rest, he led me over the house and pointed out to me the rooms which my mother had inhabited.',
       'Some were destroyed; the major part escaped by quick and well ordered movements; and danger made them careful.'],
      dtype=object)

In [14]:
xvalid

array(['Thomas" turning to me "is decidedly the best hand at a cork leg; but if you should ever want an arm, my dear fellow, you must really let me recommend you to Bishop."',
       'I struggled to reason off the nervousness which had dominion over me.',
       'My name, indeed, has been so long and so constantly before the public eye, that I am not only willing to admit the naturalness of the interest which it has everywhere excited, but ready to satisfy the extreme curiosity which it has inspired.',
       ...,
       'Such is human nature, that beauty and deformity are often closely linked.',
       'He had sought this office with eagerness, under the idea of turning his whole forces to the suppression of the privileged orders of our community.',
       'Especially was it unwise to rave of the living things that might haunt such a place; of creatures half of the jungle and half of the impiously aged city fabulous creatures which even a Pliny might describe with scepticism; things t

## building basic models

In [15]:
# text embedding

tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents="unicode",
    analyzer="word",
    token_pattern=r"\w{1,}",
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words="english"
)

In [16]:
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)
xtrain_tfv.shape, xvalid_tfv.shape

((17621, 15102), (1958, 15102))

In [17]:
xtrain_tfv

<17621x15102 sparse matrix of type '<class 'numpy.float64'>'
	with 198521 stored elements in Compressed Sparse Row format>

In [18]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.570


In [19]:
# count as word embedding

ctv = CountVectorizer(
    analyzer="word",
    token_pattern=r"\w{1,}",
    ngram_range=(1,3),
    stop_words="english"
)

In [20]:
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)
xtrain_ctv.shape, xvalid_ctv.shape

((17621, 400266), (1958, 400266))

In [21]:
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.527


In [22]:
# use Naive Bayes model

clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.578


In [23]:
# use Naive Bayes model

clf = MultinomialNB()
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.485


In [24]:
# SVM needs dimensionality reduction and standardization of the data

svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)
print(xtrain_tfv.shape, xvalid_tfv.shape)
print(xtrain_svd.shape, xvalid_svd.shape)

scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)

xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

(17621, 15102) (1958, 15102)
(17621, 120) (1958, 120)


In [25]:
clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.733


### xgboost model

In [26]:
# use xgboost --- embedding #1

clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1
)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_tfv.tocsc())

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.783


In [27]:
# use xgboost --- embedding #2

clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1
)
clf.fit(xtrain_ctv.tocsc(), ytrain)
predictions = clf.predict_proba(xvalid_ctv.tocsc())

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.774


In [28]:
# use xgboost --- embedding with dimensionality reduction

clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1
)
clf.fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.773


In [29]:
# use xgboost --- embedding with dimensionality reduction and standadization

clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1
)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 0.773


### grid search with logistic regression

In [30]:
mll_scorer = metrics.make_scorer(
    multiclass_logloss,
    greater_is_better=False,
    response_method="predict_proba" # needs_proba=True
)

In [31]:
# pipeline construction

svd = TruncatedSVD()

scl = preprocessing.StandardScaler()

lr_model = LogisticRegression()

clf = pipeline.Pipeline([
    ("svd", svd),
    ("scl", scl),
    ("lr", lr_model)
])

In [32]:
param_grid = {
    "svd__n_components": [120, 180],
    "lr__C": [0.1, 1.0, 10], 
    "lr__penalty": ["l1", "l2"]
}

In [33]:
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=mll_scorer,
    verbose=10,
    n_jobs=-1,
    refit=True,
    cv=2
)

model.fit(xtrain_tfv, ytrain)
print("best score: %.3f"%model.best_score_)
print("best parameter set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r"%(param_name, best_parameters[param_name]))

Fitting 2 folds for each of 12 candidates, totalling 24 fits
[CV 1/2; 1/12] START lr__C=0.1, lr__penalty=l1, svd__n_components=120...........
[CV 2/2; 1/12] START lr__C=0.1, lr__penalty=l1, svd__n_components=120...........
[CV 1/2; 2/12] START lr__C=0.1, lr__penalty=l1, svd__n_components=180...........
[CV 1/2; 4/12] START lr__C=0.1, lr__penalty=l2, svd__n_components=180...........
[CV 2/2; 2/12] START lr__C=0.1, lr__penalty=l1, svd__n_components=180...........
[CV 1/2; 5/12] START lr__C=1.0, lr__penalty=l1, svd__n_components=120...........
[CV 2/2; 3/12] START lr__C=0.1, lr__penalty=l2, svd__n_components=120...........
[CV 1/2; 3/12] START lr__C=0.1, lr__penalty=l2, svd__n_components=120...........
[CV 1/2; 6/12] START lr__C=1.0, lr__penalty=l1, svd__n_components=180...........
[CV 2/2; 6/12] START lr__C=1.0, lr__penalty=l1, svd__n_components=180...........
[CV 2/2; 4/12] START lr__C=0.1, lr__penalty=l2, svd__n_components=180...........
[CV 2/2; 5/12] START lr__C=1.0, lr__penalty=l1, 

12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ychu/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ychu/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ychu/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/ychu/anaconda3/lib/python3.11/si

best score: -0.734
best parameter set:
	lr__C: 0.1
	lr__penalty: 'l2'
	svd__n_components: 180


In [34]:
nb_model = MultinomialNB()

clf = pipeline.Pipeline([
    ("nb", nb_model)
])

param_grid = {
    "nb__alpha": [0.001, 0.01, 0.1, 1.0, 10, 100]
}

model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=mll_scorer,
    verbose=10,
    n_jobs=-1,
    refit=True,
    cv=2
)

model.fit(xtrain_tfv, ytrain)
print("best score: %.3f"%model.best_score_)
print("best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r"%(param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 2/2; 1/6] START nb__alpha=0.001.............................................
[CV 1/2; 1/6] START nb__alpha=0.001.............................................
[CV 2/2; 1/6] END .............nb__alpha=0.001;, score=-0.641 total time=   0.0s
[CV 1/2; 1/6] END .............nb__alpha=0.001;, score=-0.620 total time=   0.0s
[CV 1/2; 2/6] START nb__alpha=0.01..............................................
[CV 1/2; 2/6] END ..............nb__alpha=0.01;, score=-0.511 total time=   0.0s
[CV 2/2; 2/6] START nb__alpha=0.01..............................................
[CV 2/2; 2/6] END ..............nb__alpha=0.01;, score=-0.523 total time=   0.0s
[CV 2/2; 3/6] START nb__alpha=0.1...............................................
[CV 1/2; 3/6] START nb__alpha=0.1...............................................
[CV 1/2; 3/6] END ...............nb__alpha=0.1;, score=-0.489 total time=   0.0s
[CV 2/2; 3/6] END ...............nb__alpha=0.1;, 

## word vectors

In [35]:
embeddings_index = {}

f = open("./input/glove.840B.300d.txt")
for line in tqdm(f):
    values = line.split()
    word = values[0]
    values = [i for i in values[1:] if type(i) == "float32"] # i.isnumeric()] # type(i) == "float32"
    coefs = np.asarray(values, dtype="float32")
    embeddings_index[word] = coefs
f.close()

print("FOUND %s word vectors"%len(embeddings_index))

2196017it [00:33, 65456.14it/s]

FOUND 2195884 word vectors





In [36]:
def sent2vec(s):
    words = str(s).lower().encode("utf-8").decode("utf-8")
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]

    M = []
    for w in words:
        try: 
            M.append(embeddings_index[w])
        except:
            continue
    
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != "np.ndarray":
        return np.zeros(300)
    return v / np.linalg.norm(v)


In [37]:
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]

100%|██████████| 17621/17621 [00:01<00:00, 10839.18it/s]
100%|██████████| 1958/1958 [00:00<00:00, 10639.73it/s]


In [38]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

xtrain_glove.shape, xvalid_glove.shape

((17621, 300), (1958, 300))

In [44]:
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 1.088


In [45]:
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1
)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print("logloss: %.3f" % multiclass_logloss(yvalid, predictions))

logloss: 1.088


## deep learning

In [46]:
scl = preprocessing.StandardScaler()

xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

xtrain_glove_scl.shape

(17621, 300)

In [48]:
# binarize the lables for the neural network

ytrain_enc = to_categorical(ytrain)
yvalid_enc = to_categorical(yvalid)
yvalid_enc

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [49]:
# create a simple 3 layer sequential neural network

model = Sequential()

model.add(Dense(300, input_dim=300, activation="relu"))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation="relu"))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation("softmax"))

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam"
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [51]:
model.fit(
    xtrain_glove_scl,
    y=ytrain_enc,
    batch_size=64,
    epochs=50,
    verbose=1,
    validation_data=(xvalid_glove_scl, yvalid_enc)
)

Epoch 1/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0885 - val_loss: 1.0885
Epoch 2/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0876 - val_loss: 1.0878
Epoch 3/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0870 - val_loss: 1.0878
Epoch 4/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.0895 - val_loss: 1.0878
Epoch 5/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0889 - val_loss: 1.0880
Epoch 6/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0891 - val_loss: 1.0877
Epoch 7/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1.0878 - val_loss: 1.0878
Epoch 8/50
[1m276/276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 1.0874 - val_loss: 1.0880
Epoch 9/50
[1m276/276[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x41683c310>

## tokenizer to get the embedding matrix [word --> vector]

In [52]:
# using keras tokenizer for LSTM

token = Tokenizer(num_words=None)
max_len = 70

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# zero pad the squences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [57]:
len(word_index), type(word_index)

(25943, dict)

In [60]:
xtrain_pad.shape

(17621, 70)

In [61]:
xtrain_seq[0]

[29,
 560,
 8,
 1,
 5924,
 459,
 714,
 3,
 987,
 1,
 1794,
 2,
 29,
 3695,
 98,
 4,
 326,
 5,
 2545,
 2,
 3103,
 27,
 29,
 166]

In [62]:
xtrain_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   29,  560,    8,    1, 5924,  459,  714,    3,  987,
          1, 1794,    2,   29, 3695,   98,    4,  326,    5, 2545,    2,
       3103,   27,   29,  166], dtype=int32)

In [65]:
# create embedding matrix for the words we have in the dataset

embedding_matrix = np.zeros((len(word_index)+1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector:
        embedding_matrix[i, :] = embedding_vector

  if embedding_vector:
100%|██████████| 25943/25943 [00:00<00:00, 1240230.11it/s]


In [68]:
embedding_matrix.shape

(25944, 300)

In [82]:
# a simple LSTM with glove embeddings and two dense layers
from keras.initializers import Constant

model = Sequential()

layer =  Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        trainable=False,
        embeddings_initializer=Constant(embedding_matrix),
        input_shape=(max_len,)
    )
#layer.set_weights(embedding_matrix)

model.add(layer)
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.8))

model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")


  super().__init__(**kwargs)


In [85]:
# fit the model with early stopping callback
earlystop = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto"
)

model.fit(
    xtrain_pad,
    y=ytrain_enc,
    batch_size=512,
    epochs=100,
    verbose=1,
    validation_data=(xvalid_pad, yvalid_enc),
    callbacks=[earlystop]
)

Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 284ms/step - loss: 1.0871 - val_loss: 1.0878
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 290ms/step - loss: 1.0864 - val_loss: 1.0877
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 284ms/step - loss: 1.0868 - val_loss: 1.0877
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 248ms/step - loss: 1.0876 - val_loss: 1.0876
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 222ms/step - loss: 1.0872 - val_loss: 1.0876
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 249ms/step - loss: 1.0873 - val_loss: 1.0876
Epoch 7/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 287ms/step - loss: 1.0865 - val_loss: 1.0876
Epoch 8/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 233ms/step - loss: 1.0865 - val_loss: 1.0876
Epoch 9/100
[1m35/35[0m [

<keras.src.callbacks.history.History at 0x419266050>

In [86]:
# a simple LSTM with glove embeddings and two dense layers
from keras.initializers import Constant

model = Sequential()

layer =  Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        trainable=False,
        embeddings_initializer=Constant(embedding_matrix),
        input_shape=(max_len,)
    )
#layer.set_weights(embedding_matrix)

model.add(layer)
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.8))

model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")

# fit the model with early stopping callback
earlystop = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto"
)

model.fit(
    xtrain_pad,
    y=ytrain_enc,
    batch_size=512,
    epochs=100,
    verbose=1,
    validation_data=(xvalid_pad, yvalid_enc),
    callbacks=[earlystop]
)

Epoch 1/100


  super().__init__(**kwargs)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 517ms/step - loss: 1.0975 - val_loss: 1.0948
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 524ms/step - loss: 1.0937 - val_loss: 1.0921
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 520ms/step - loss: 1.0917 - val_loss: 1.0903
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 527ms/step - loss: 1.0903 - val_loss: 1.0892
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 525ms/step - loss: 1.0877 - val_loss: 1.0884
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 519ms/step - loss: 1.0881 - val_loss: 1.0881
Epoch 7/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 526ms/step - loss: 1.0871 - val_loss: 1.0878
Epoch 8/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 520ms/step - loss: 1.0874 - val_loss: 1.0877
Epoch 9/100
[1m35/35[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x415464cd0>

In [87]:
# a simple LSTM with glove embeddings and two dense layers
from keras.initializers import Constant

model = Sequential()

layer =  Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        trainable=False,
        embeddings_initializer=Constant(embedding_matrix),
        input_shape=(max_len,)
    )
#layer.set_weights(embedding_matrix)

model.add(layer)
model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.8))

model.add(Dense(1024, activation="relu"))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")

# fit the model with early stopping callback
earlystop = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=3,
    verbose=0,
    mode="auto"
)

model.fit(
    xtrain_pad,
    y=ytrain_enc,
    batch_size=512,
    epochs=100,
    verbose=1,
    validation_data=(xvalid_pad, yvalid_enc),
    callbacks=[earlystop]
)

Epoch 1/100


  super().__init__(**kwargs)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 687ms/step - loss: 1.0975 - val_loss: 1.0948
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 680ms/step - loss: 1.0941 - val_loss: 1.0921
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 674ms/step - loss: 1.0916 - val_loss: 1.0902
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 675ms/step - loss: 1.0894 - val_loss: 1.0890
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 673ms/step - loss: 1.0881 - val_loss: 1.0883
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 676ms/step - loss: 1.0898 - val_loss: 1.0881
Epoch 7/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 695ms/step - loss: 1.0875 - val_loss: 1.0878
Epoch 8/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 687ms/step - loss: 1.0868 - val_loss: 1.0877
Epoch 9/100
[1m35/35[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x37009ecd0>

## ensembling

In [88]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import os, sys, logging

In [90]:
logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
    stream=sys.stdout
)
logger = logging.getLogger(__name__)

In [98]:
class Ensembler(object):
    def __init__(self, model_dict, num_folds=3, task_type="classification", optimize=roc_auc_score,
                lower_is_better=False, save_path=None):
        self.model_dict = model_dict
        self.levels = len(self.model_dict)
        self.num_folds = num_folds
        self.task_type = task_type
        self.optimize = optimize
        self.lower_is_better = lower_is_better
        self.save_path = save_path

        self.training_data = None
        self.test_data = None
        self.y = None
        self.lbl_enc = None
        self.y_enc = None
        self.train_prediction_dict = None
        self.test_prediction_dict = None
        self.num_classes = None
    
    def fit(self, training_data, y, lentrain):
        self.training_data = training_data
        self.y = y 

        if self.task_type == "classification":
            self.num_classes = len(np.unique(self.y))
            logger.info(f"Found {self.num_classes} classes")
            self.lbl_enc = LabelEncoder()
            self.y_enc = self.lbl_enc.fit_transform(self.y)
            kf = StratifiedKFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, self.num_classes)
        else:
            self.num_classes = -1
            self.y_enc = self.y
            kf = KFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, 1)
        
        self.train_prediciton_dict = {}
        for level in range(self.levels):
            self.train_prediction_dict[level] = np.zeros(
                (train_prediction_shape[0],
                 train_prediction_shape[1] * len(self.model_dict[level]))
            )
    
        for level in range(self.levels):
            if level==0:
                temp_train = self.training_data
            else:
                temp_train = self.train_prediction_dict[level - 1]
            
            for model_num, model in enumerate(self.model_dict[level]):
                validation_scores = []
                foldnum = 1
                for train_index, valid_index in kf.split(self.train_prediction_dict[0], self.y_enc):
                    logger.info(f"Training level {level} Fold # {foldnum}. model {model_num}")

                    if level != 0:
                        l_training_data = temp_train[train_index]
                        l_validation_data = temp_train[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])
                    else:
                        l0_training_data = temp_train[0][model_num]
                        if type(l0_training_data) == list:
                            l_training_data = [x[train_index] for x in l0_training_data]
                            l_validation_data = [x[valid_index] for x in l0_training_data]
                        else:
                            l_training_data = l0_training_data[train_index]
                            l_validation_data = l0_training_data[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])
                    logger.info(f"predicting level {level}. Fold {foldnum}. Model {model_num}")

                    if self.task_type == "classification":
                        temp_train_predictions = model.predict_proba(l_validation_data)
                        self.train_prediction_dict[level][valid_index,
                        (model_num * self.num_classes):(model_num*self.num_classes) + self.num_classes] = temp_train_predictions
                    else:
                        temp_train_predictions = model.predict(l_validation_data)
                        self.train_prediction_dict[level][valid_index, model_num] = temp_train_predictions
                    
                    validation_score = self.optimize(self.y_enc[valid_index], temp_train_predictions)
                    validation_scores.append(validation_score)
                    logger.info(f"level {level}, Fold {foldnum}. Model {model_num}. Validation score={validation_score}")
                    foldnum += 1
                avg_score = np.mean(validation_scores)
                std_score = np.std(validation_scores)
                logger.info(f"level {level}. Model {model_num}. Mean score={avg_score}. Std Dev={std_score}")
            logger.info(f"saving predictions for level {level}")
            train_predictions_df = pd.DataFrame(self.train_prediction_dict[level])
            train_predictions_df.to_csv(os.path.join(self.save_path, "train_predictions_level_"+str(level)+".csv"),
                                        index=False, header=None)
        return self.train_prediction_dict
    
    def predict(self, test_data, lentest):
        self.test_data = test_data
        if self.task_type == "classification":
            test_prediction_shape = (lentest, self.num_classes)
        else:
            test_prediction_shape = (lentest, 1)
        
        self.test_prediction_dict = {}
        for level in range(self.levels):
            self.test_prediction_dict[level] = np.zeros((
                test_prediction_shape[0],
                test_prediction_shape[1] * len(self.model_dict[level])
            ))
        self.test_data = test_data
        
        for level in range(self.levels):
            if level==0:
                temp_train = self.training_data
                temp_test = self.test_data
            else:
                temp_train = self.train_prediction_dict[level - 1]
                temp_test = self.test_prediction_dict[level - 1]
            
            for model_num, model in enumerate(self.model_dict[level]):
                logger.info(f"training fulldata level {level}. model {model}")
                if level == 0:
                    model.fit(temp_train[0][model_num], self.y_enc)
                else:
                    model.fit(temp_train, self.y_enc)
                
                logger.info(f"predicting test level {level}. Model {model_num}")
                if self.task_type == "classification":
                    if level==0:
                        temp_test_predictions = model.predict_proba(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict_proba(temp_test)
                    
                    self.test_prediction_dict[level][:, (model_num*self.num_classes): (model_num*self.num_classes)+self.num_classes] = temp_test_predictions
                else:
                    if level ==0:
                        temp_test_predictions = model.predict(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict(temp_test)
                    self.test_prediction_dict[level][:, model_num] = temp_test_predictions
            test_predictions_df = pd.DataFrame(self.test_prediction_dict[level])
            test_predictions_df.to_csv(
                os.path.join(self.save_path, "test_predictions_level_" + str(level) + ".csv"),
                index=False, header=None
            )
        return self.test_predictions_dict

In [92]:
train_data_dict = {0: [xtrain_tfv, xtrain_ctv, xtrain_tfv, xtrain_ctv], 1:[xtrain_glove]}
test_data_dict = {0: [xvalid_tfv, xvalid_ctv, xvalid_tfv, xvalid_ctv], 1:[xvalid_glove]}

In [93]:
model_dict = {
    0: [LogisticRegression(), LogisticRegression(), MultinomialNB(alpha=0.1), MultinomialNB()],
    1: [xgb.XGBClassifier(n_estimator=120, max_depth=7)]
}

In [None]:
ens = Ensembler(
    model_dict = model_dict,
    num_folds = 3,
    task_type = "classification",
    optimize = multiclass_logloss,
    lower_is_better = True,
    save_path = ''
)
ens.fit(train_data_dict, ytrain, lentrain=xtrain_glove.shape[0])
preds = ens.predict(test_data_dict, lentest=xvalid_glove.shape[0])

In [None]:
multiclass_logloss(yvalid, preds[1])