In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
# Import only in CPU mode
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
import mlflow
from mlflow.tracking import MlflowClient
import time 

In [4]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.binary import BinaryEncoder

from sklearn.metrics import confusion_matrix, matthews_corrcoef, precision_score, recall_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
RANDOM_SEED = 22

In [11]:
# mlflow.autolog()

In [12]:
train_path = "train.csv"
test_path =  "test.csv"
sample_path = "sample_submission.csv"

In [13]:
data = pd.read_csv(train_path)
# test = pd.read_csv(test_path)
# sample = pd.read_csv(sample_path)

In [14]:
data['essay_text'] = data['essay_id'].apply(lambda x: open(f"train/{x}.txt").read())

In [15]:
target_labels_mapping = {"Ineffective": 0, "Adequate":1, "Effective":2}
data.discourse_effectiveness = data.discourse_effectiveness.map(target_labels_mapping)

In [16]:
data.discourse_effectiveness.value_counts()

1    20977
2     9326
0     6462
Name: discourse_effectiveness, dtype: int64

In [17]:
X, Y = data.drop('discourse_effectiveness', axis=1, inplace = False), data.discourse_effectiveness

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state=RANDOM_SEED, stratify=data.discourse_effectiveness)

In [19]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(29412, 5) (29412,) (7353, 5) (7353,)


In [20]:
X_train

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text
8629,7d7fb0ac2edb,9C480C68AA9B,"Instead of laying on the couch, eating, sleepi...",Evidence,If the Summer is plagued with more work we sho...
10274,96517470c123,B8B5B46DA523,like some simplified electronics made of silic...,Evidence,No one has ever landed on venus so the author ...
4293,bb394ddc5bb1,4C51280DE2A8,"Second, now to the conspiracy theorists, they ...",Counterclaim,"First of all, NASA only gets their information..."
2443,d85fb9fc13c9,2D08A68E70CD,Third example has pathos catching peoples feel...,Claim,I think that the author describes how technolg...
16589,2f5adb92fe7d,1EFA2916E5A8,it would also in the world of to day make him ...,Claim,"Dear Principle,\n\nI personally do not think s..."
...,...,...,...,...,...
3015,6fb01a8a829a,37FC9DB2D1DB,it's their summer.,Claim,"When assigned a project during summer break, d..."
26587,7551a7b008f5,A4C9096A123B,I think people should be able to choose who t...,Evidence,"Dear, State Sentor\n\nI think the electoral co..."
19477,0f5d0b88c638,44DEA88FDD83,But you see there is up side to using the Elec...,Counterclaim,"Dear Floridas state senator, I am righting thi..."
30083,2d7b19e2991b,D786FC589E93,but we should at least get a vote on like new ...,Rebuttal,"Dear senator,\n\nGetting ride of the Electoral..."


In [21]:
gc.collect()

996

#### LogReg

No scaling
Min loss 	0.7707357995019517 <br>
Max loss 	0.7939273816408711 <br>
Mean loss	0.7830213020862644 <br>



Scaled w/ MaxAbsScaler
Min loss 	0.8006261104803448 <br>
Max loss 	0.8303430962728789 <br>
Mean loss	0.8157244737733116 <br>


In [99]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = LogisticRegression()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1


2022/08/10 19:33:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '638d321d45b74353a961073a0dfcafa9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7869757143697128
................................................................................

Fold # 2


2022/08/10 19:34:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ae5b9e6c135048d5aa33a40fa5259b82', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7804193422274297
................................................................................

Fold # 3


2022/08/10 19:34:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6e4687e4886b4058878ca20c93e3abff', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7939273816408711
................................................................................

Fold # 4


2022/08/10 19:34:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'aa7c12f16ef245a3b53bf7a93ae34c35', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7830482726913571
................................................................................

Fold # 5


2022/08/10 19:34:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '57204076de6b4808bb46ba8d7c5a7bea', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7707357995019517
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7707357995019517
Max loss 	0.7939273816408711
Mean loss	0.7830213020862644


#### RF

Min loss 	0.8211464183621852 <br>
Max loss 	0.8326483703178386 <br>
Mean loss	0.8274193801826459 <br>

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = RandomForestClassifier(n_estimators=1000)
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1
	Log loss: 0.8179767168529476
................................................................................

Fold # 2


#### XGBoost
Min loss 	0.769898167914014 <br>
Max loss 	0.7861944754332157 <br>
Mean loss	0.7808887629128117 <br>

In [95]:
from xgboost import XGBClassifier

In [96]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = XGBClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1


2022/08/10 19:26:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '85b98ebfe07c4f72811640207eac6e0f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7831609343908684
................................................................................

Fold # 2


2022/08/10 19:27:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2d0aa38649744f759bd89ba849c8939c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7817585589293724
................................................................................

Fold # 3


2022/08/10 19:28:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '17153cd8e63a4ec899b8ea9934ed18bd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7834316778965883
................................................................................

Fold # 4


2022/08/10 19:28:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f2809c6dbda04f99977e906af55cc29d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7861944754332157
................................................................................

Fold # 5


2022/08/10 19:29:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9eecbea3e7c749559c4836195ed9217c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.769898167914014
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.769898167914014
Max loss 	0.7861944754332157
Mean loss	0.7808887629128117


#### LGBM
Min loss 	0.7565350176859863 <br>
Max loss 	0.7765817628486158 <br>
Mean loss	0.7692048809033425 <br>

In [97]:
from lightgbm import LGBMClassifier

In [98]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = LGBMClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1
	Log loss: 0.7698034071294326
................................................................................

Fold # 2
	Log loss: 0.7695595489450545
................................................................................

Fold # 3
	Log loss: 0.7765817628486158
................................................................................

Fold # 4
	Log loss: 0.773544667907623
................................................................................

Fold # 5
	Log loss: 0.7565350176859863
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7565350176859863
Max loss 	0.7765817628486158
Mean loss	0.7692048809033425


In [None]:
Y_pred = model.predict_proba(X_test)

In [None]:
Y_pred.shape

(7353, 3)

In [None]:
loss = log_loss(y_true = Y_test, y_pred = Y_pred)
loss 

0.7759086348638186

#### HistGradientBoosting
Min loss 	0.7589378423671769 <br>
Max loss 	0.7814936649694296 <br>
Mean loss	0.7706771662759752 <br>

In [17]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [18]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    x_train, x_valid = x_train.todense(), x_valid.todense()

    model = HistGradientBoostingClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1




	Log loss: 0.7725146625309408
................................................................................

Fold # 2




	Log loss: 0.7672879132724585
................................................................................

Fold # 3




	Log loss: 0.7803359314651005
................................................................................

Fold # 4




	Log loss: 0.7760577902829843
................................................................................

Fold # 5




	Log loss: 0.7572928859563774
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7572928859563774
Max loss 	0.7803359314651005
Mean loss	0.7706978367015722




#### NB

In [23]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB

In [26]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    x_train, x_valid = x_train.todense(), x_valid.todense()

    model = MultinomialNB()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1




	Log loss: 1.0052641991904334
................................................................................

Fold # 2




	Log loss: 0.9989523898046153
................................................................................

Fold # 3




	Log loss: 1.0012554263271314
................................................................................

Fold # 4




	Log loss: 1.00147215303088
................................................................................

Fold # 5




	Log loss: 0.9826888223621003
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.9826888223621003
Max loss 	1.0052641991904334
Mean loss	0.9979265981430322




w/ more data

In [28]:
losses = []
x_train, y_train = X_train, Y_train
x_valid, y_valid = X_test, Y_test

y_train = y_train.to_numpy()
y_valid = y_valid.to_numpy()

vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.essay_text.drop_duplicates())

vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

ohe = OneHotEncoder()
ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

# In case it is necessary
# scaler = MaxAbsScaler()
# x_train = scaler.fit_transform(x_train)
# x_valid = scaler.transform(x_valid)

x_train, x_valid = x_train.todense(), x_valid.todense()

model = MultinomialNB()
model.fit(x_train, y_train)

prob_predictions = model.predict_proba(x_valid)
lloss = log_loss(y_valid, prob_predictions)
losses.append(lloss)

print(f"\tLog loss: {lloss}")
print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")



	Log loss: 0.9541224343739063
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.9541224343739063
Max loss 	0.9541224343739063
Mean loss	0.9541224343739063




#### Transformers

In [15]:
from transformers import TFAutoModel
from transformers import AutoTokenizer

In [20]:
from transformers import TFAutoModelForSequenceClassification

In [17]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Downloading tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 14.0kB/s]
Downloading vocab.txt: 100%|██████████| 208k/208k [00:00<00:00, 434kB/s]  
Downloading tokenizer.json: 100%|██████████| 426k/426k [00:00<00:00, 714kB/s] 


In [14]:
model = TFAutoModel.from_pretrained('bert-base-cased')

Downloading config.json: 100%|██████████| 570/570 [00:00<00:00, 276kB/s]
2022-08-14 14:24:35.931994: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Downloading tf_model.h5: 100%|██████████| 502M/502M [00:17<00:00, 30.0MB/s] 
2022-08-14 14:24:55.035998: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-14 14:24:55.039555: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-14 14:24:55.039717: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read fr

In [21]:
classification_head = TFAutoModelForSequenceClassification