In [70]:
import pandas as pd
import numpy as np
import gc

In [71]:
import mlflow
from mlflow.tracking import MlflowClient
import time 

In [92]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.binary import BinaryEncoder

from sklearn.metrics import confusion_matrix, matthews_corrcoef, precision_score, recall_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

In [73]:
from sklearn.linear_model import LogisticRegression

In [74]:
RANDOM_SEED = 22

In [75]:
# mlflow.autolog()

2022/08/10 16:51:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [76]:
train_path = "train.csv"
test_path =  "test.csv"
sample_path = "sample_submission.csv"

In [77]:
data = pd.read_csv(train_path)
# test = pd.read_csv(test_path)
# sample = pd.read_csv(sample_path)

In [78]:
data['essay_text'] = data['essay_id'].apply(lambda x: open(f"train/{x}.txt").read())

In [79]:
target_labels_mapping = {"Ineffective": 0, "Adequate":1, "Effective":2}
data.discourse_effectiveness = data.discourse_effectiveness.map(target_labels_mapping)

In [80]:
data.discourse_effectiveness.value_counts()

1    20977
2     9326
0     6462
Name: discourse_effectiveness, dtype: int64

In [81]:
X, Y = data.drop('discourse_effectiveness', axis=1, inplace = False), data.discourse_effectiveness

In [82]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state=RANDOM_SEED, stratify=data.discourse_effectiveness)

In [83]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(29412, 5) (29412,) (7353, 5) (7353,)


In [85]:
X_train

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text
8629,7d7fb0ac2edb,9C480C68AA9B,"Instead of laying on the couch, eating, sleepi...",Evidence,If the Summer is plagued with more work we sho...
10274,96517470c123,B8B5B46DA523,like some simplified electronics made of silic...,Evidence,No one has ever landed on venus so the author ...
4293,bb394ddc5bb1,4C51280DE2A8,"Second, now to the conspiracy theorists, they ...",Counterclaim,"First of all, NASA only gets their information..."
2443,d85fb9fc13c9,2D08A68E70CD,Third example has pathos catching peoples feel...,Claim,I think that the author describes how technolg...
16589,2f5adb92fe7d,1EFA2916E5A8,it would also in the world of to day make him ...,Claim,"Dear Principle,\n\nI personally do not think s..."
...,...,...,...,...,...
3015,6fb01a8a829a,37FC9DB2D1DB,it's their summer.,Claim,"When assigned a project during summer break, d..."
26587,7551a7b008f5,A4C9096A123B,I think people should be able to choose who t...,Evidence,"Dear, State Sentor\n\nI think the electoral co..."
19477,0f5d0b88c638,44DEA88FDD83,But you see there is up side to using the Elec...,Counterclaim,"Dear Floridas state senator, I am righting thi..."
30083,2d7b19e2991b,D786FC589E93,but we should at least get a vote on like new ...,Rebuttal,"Dear senator,\n\nGetting ride of the Electoral..."


In [93]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    scaler = MaxAbsScaler()
    x_train = scaler.fit_transform(x_train)
    x_valid = scaler.transform(x_valid)

    model = LogisticRegression()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1


2022/08/10 17:11:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ea6d8e80ed824168ae8bf2b1530bd1d1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.8141559689855703
................................................................................

Fold # 2


2022/08/10 17:11:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1ebcd6b9293e4c328a826db2f586387d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.8126399232504511
................................................................................

Fold # 3


2022/08/10 17:11:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'be94835c9ea04d3b87b5b592de022bde', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.8303430962728789
................................................................................

Fold # 4


2022/08/10 17:11:58 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a94f2a916a7649adb1b2c395a2ae7bbc', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.8208572698773121
................................................................................

Fold # 5


2022/08/10 17:12:02 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7650ac1672d44ab68bca3f2755ba7212', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.8006261104803448
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.8006261104803448
Max loss 	0.8303430962728789
Mean loss	0.8157244737733116


In [91]:
x_train

<23530x28446 sparse matrix of type '<class 'numpy.float64'>'
	with 762806 stored elements in Compressed Sparse Row format>

2022/08/10 16:02:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4971bcd4368543e287041d30dd9d3c12', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
Y_pred = model.predict_proba(X_test)

In [None]:
Y_pred.shape

(7353, 3)

In [None]:
loss = log_loss(y_true = Y_test, y_pred = Y_pred)
loss 

0.7759086348638186