In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
# # Import only in CPU mode
# from sklearnex import patch_sklearn
# patch_sklearn()

In [3]:
import mlflow
from mlflow.tracking import MlflowClient
import time 

In [4]:
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.binary import BinaryEncoder

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.metrics import log_loss, accuracy_score, matthews_corrcoef
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

In [5]:
from sklearn.metrics import confusion_matrix, matthews_corrcoef, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold 

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB

In [7]:
from spylls.hunspell import Dictionary
from string import punctuation
import re
import contractions
from nltk import word_tokenize
from nltk.corpus import stopwords
from itertools import chain

In [8]:
from hyphenate import hyphenate_word
from nltk.tokenize import sent_tokenize

In [67]:
import json

In [9]:
RANDOM_SEED = 22

In [10]:
# mlflow.autolog()

In [11]:
train_path = "train.csv"
test_path =  "test.csv"
sample_path = "sample_submission.csv"

In [12]:
data = pd.read_csv(train_path)
# test = pd.read_csv(test_path)
# sample = pd.read_csv(sample_path)

In [13]:
data['essay_text'] = data['essay_id'].apply(lambda x: open(f"train/{x}.txt").read())

In [14]:
target_labels_mapping = {"Ineffective": 0, "Adequate":1, "Effective":2}
data.discourse_effectiveness = data.discourse_effectiveness.map(target_labels_mapping)

In [15]:
data.discourse_effectiveness.value_counts()

1    20977
2     9326
0     6462
Name: discourse_effectiveness, dtype: int64

In [16]:
X, Y = data.drop('discourse_effectiveness', axis=1, inplace = False), data.discourse_effectiveness

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20, random_state=RANDOM_SEED, stratify=data.discourse_effectiveness)

In [18]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(29412, 5) (29412,) (7353, 5) (7353,)


In [19]:
X_train

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text
8629,7d7fb0ac2edb,9C480C68AA9B,"Instead of laying on the couch, eating, sleepi...",Evidence,If the Summer is plagued with more work we sho...
10274,96517470c123,B8B5B46DA523,like some simplified electronics made of silic...,Evidence,No one has ever landed on venus so the author ...
4293,bb394ddc5bb1,4C51280DE2A8,"Second, now to the conspiracy theorists, they ...",Counterclaim,"First of all, NASA only gets their information..."
2443,d85fb9fc13c9,2D08A68E70CD,Third example has pathos catching peoples feel...,Claim,I think that the author describes how technolg...
16589,2f5adb92fe7d,1EFA2916E5A8,it would also in the world of to day make him ...,Claim,"Dear Principle,\n\nI personally do not think s..."
...,...,...,...,...,...
3015,6fb01a8a829a,37FC9DB2D1DB,it's their summer.,Claim,"When assigned a project during summer break, d..."
26587,7551a7b008f5,A4C9096A123B,I think people should be able to choose who t...,Evidence,"Dear, State Sentor\n\nI think the electoral co..."
19477,0f5d0b88c638,44DEA88FDD83,But you see there is up side to using the Elec...,Counterclaim,"Dear Floridas state senator, I am righting thi..."
30083,2d7b19e2991b,D786FC589E93,but we should at least get a vote on like new ...,Rebuttal,"Dear senator,\n\nGetting ride of the Electoral..."


## Text features only

### Classic ML

In [39]:
from sklearn.linear_model import LogisticRegression

120

#### LogReg

No scaling
Min loss 	0.7707357995019517 <br>
Max loss 	0.7939273816408711 <br>
Mean loss	0.7830213020862644 <br>



Scaled w/ MaxAbsScaler
Min loss 	0.8006261104803448 <br>
Max loss 	0.8303430962728789 <br>
Mean loss	0.8157244737733116 <br>


In [99]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = LogisticRegression()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1


2022/08/10 19:33:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '638d321d45b74353a961073a0dfcafa9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7869757143697128
................................................................................

Fold # 2


2022/08/10 19:34:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ae5b9e6c135048d5aa33a40fa5259b82', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7804193422274297
................................................................................

Fold # 3


2022/08/10 19:34:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6e4687e4886b4058878ca20c93e3abff', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7939273816408711
................................................................................

Fold # 4


2022/08/10 19:34:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'aa7c12f16ef245a3b53bf7a93ae34c35', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7830482726913571
................................................................................

Fold # 5


2022/08/10 19:34:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '57204076de6b4808bb46ba8d7c5a7bea', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


	Log loss: 0.7707357995019517
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7707357995019517
Max loss 	0.7939273816408711
Mean loss	0.7830213020862644


#### RF

Min loss 	0.8211464183621852 <br>
Max loss 	0.8326483703178386 <br>
Mean loss	0.8274193801826459 <br>

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = RandomForestClassifier(n_estimators=1000)
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1
	Log loss: 0.8179767168529476
................................................................................

Fold # 2
	Log loss: 0.8121509848801739
................................................................................

Fold # 3
	Log loss: 0.8203104172192276
................................................................................

Fold # 4
	Log loss: 0.8119259143806231
................................................................................

Fold # 5
	Log loss: 0.807629720255935
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.807629720255935
Max loss 	0.8203104172192276
Mean loss	0.8139987507177814


#### ExtraTrees
Min loss 	0.8255913481053635 <br>
Max loss 	0.8682496106013873 <br>
Mean loss	0.8445679909453429

In [17]:
from sklearn.ensemble import ExtraTreesClassifier

In [19]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = ExtraTreesClassifier(n_jobs=-1)
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1
	Log loss: 0.8607272202243996
................................................................................

Fold # 2
	Log loss: 0.8360894983524675
................................................................................

Fold # 3
	Log loss: 0.8682496106013873
................................................................................

Fold # 4
	Log loss: 0.8255913481053635
................................................................................

Fold # 5
	Log loss: 0.8321822774430964
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.8255913481053635
Max loss 	0.8682496106013873
Mean loss	0.8445679909453429


In [20]:
from sklearn.ensemble import IsolationForest

In [None]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = IsolationForest(n_jobs=-1)
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")

#### XGBoost
Min loss 	0.769898167914014 <br>
Max loss 	0.7861944754332157 <br>
Mean loss	0.7808887629128117 <br>

In [95]:
from xgboost import XGBClassifier

In [96]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = XGBClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1


2022/08/10 19:26:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '85b98ebfe07c4f72811640207eac6e0f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7831609343908684
................................................................................

Fold # 2


2022/08/10 19:27:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2d0aa38649744f759bd89ba849c8939c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7817585589293724
................................................................................

Fold # 3


2022/08/10 19:28:04 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '17153cd8e63a4ec899b8ea9934ed18bd', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7834316778965883
................................................................................

Fold # 4


2022/08/10 19:28:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f2809c6dbda04f99977e906af55cc29d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.7861944754332157
................................................................................

Fold # 5


2022/08/10 19:29:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9eecbea3e7c749559c4836195ed9217c', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow






	Log loss: 0.769898167914014
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.769898167914014
Max loss 	0.7861944754332157
Mean loss	0.7808887629128117


#### LGBM
Min loss 	0.7565350176859863 <br>
Max loss 	0.7765817628486158 <br>
Mean loss	0.7692048809033425 <br>

In [40]:
from lightgbm import LGBMClassifier

In [98]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = LGBMClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1
	Log loss: 0.7698034071294326
................................................................................

Fold # 2
	Log loss: 0.7695595489450545
................................................................................

Fold # 3
	Log loss: 0.7765817628486158
................................................................................

Fold # 4
	Log loss: 0.773544667907623
................................................................................

Fold # 5
	Log loss: 0.7565350176859863
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7565350176859863
Max loss 	0.7765817628486158
Mean loss	0.7692048809033425


In [None]:
Y_pred = model.predict_proba(X_test)

In [None]:
Y_pred.shape

(7353, 3)

In [None]:
loss = log_loss(y_true = Y_test, y_pred = Y_pred)
loss 

0.7759086348638186

In [42]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = LGBMClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1




	Log loss: 0.768475009212139
................................................................................

Fold # 2




	Log loss: 0.765759454909087
................................................................................

Fold # 3




	Log loss: 0.7766540270328358
................................................................................

Fold # 4




	Log loss: 0.7727770030010869
................................................................................

Fold # 5
	Log loss: 0.7590620222984978
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7590620222984978
Max loss 	0.7766540270328358
Mean loss	0.7685455032907293




In [41]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer(ngram_range=(1,3))
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    model = LGBMClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1




	Log loss: 0.7719784893139021
................................................................................

Fold # 2




	Log loss: 0.7659387707507819
................................................................................

Fold # 3




	Log loss: 0.7764218927406688
................................................................................

Fold # 4




	Log loss: 0.7770481802053392
................................................................................

Fold # 5
	Log loss: 0.7606461736088924
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7606461736088924
Max loss 	0.7770481802053392
Mean loss	0.7704067013239169




Apparently, slightly worse than n_gram_range(1,1)

#### HistGradientBoosting
Min loss 	0.7589378423671769 <br>
Max loss 	0.7814936649694296 <br>
Mean loss	0.7706771662759752 <br>

In [17]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [18]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    x_train, x_valid = x_train.todense(), x_valid.todense()

    model = HistGradientBoostingClassifier()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1




	Log loss: 0.7725146625309408
................................................................................

Fold # 2




	Log loss: 0.7672879132724585
................................................................................

Fold # 3




	Log loss: 0.7803359314651005
................................................................................

Fold # 4




	Log loss: 0.7760577902829843
................................................................................

Fold # 5




	Log loss: 0.7572928859563774
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.7572928859563774
Max loss 	0.7803359314651005
Mean loss	0.7706978367015722




#### NB

In [23]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB

In [26]:
stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
losses = []
for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
    print(f"\nFold # {idx + 1}")

    x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
    x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

    y_train = y_train.to_numpy()
    y_valid = y_valid.to_numpy()

    vectorizer = TfidfVectorizer()
    vectorizer.fit(X_train.essay_text.drop_duplicates())

    vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
    vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

    ohe = OneHotEncoder()
    ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
    ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

    x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
    x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

    # In case it is necessary
    # scaler = MaxAbsScaler()
    # x_train = scaler.fit_transform(x_train)
    # x_valid = scaler.transform(x_valid)

    x_train, x_valid = x_train.todense(), x_valid.todense()

    model = MultinomialNB()
    model.fit(x_train, y_train)

    prob_predictions = model.predict_proba(x_valid)
    lloss = log_loss(y_valid, prob_predictions)
    losses.append(lloss)

    print(f"\tLog loss: {lloss}")
    print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")


Fold # 1




	Log loss: 1.0052641991904334
................................................................................

Fold # 2




	Log loss: 0.9989523898046153
................................................................................

Fold # 3




	Log loss: 1.0012554263271314
................................................................................

Fold # 4




	Log loss: 1.00147215303088
................................................................................

Fold # 5




	Log loss: 0.9826888223621003
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.9826888223621003
Max loss 	1.0052641991904334
Mean loss	0.9979265981430322




w/ more data

In [28]:
losses = []
x_train, y_train = X_train, Y_train
x_valid, y_valid = X_test, Y_test

y_train = y_train.to_numpy()
y_valid = y_valid.to_numpy()

vectorizer = TfidfVectorizer()
vectorizer.fit(X_train.essay_text.drop_duplicates())

vectorized_discourse_text_train = vectorizer.transform(x_train.discourse_text)
vectorized_discourse_text_test = vectorizer.transform(x_valid.discourse_text)

ohe = OneHotEncoder()
ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train])
x_valid = sparse.hstack([vectorized_discourse_text_test, ohe_discourse_type_test])

# In case it is necessary
# scaler = MaxAbsScaler()
# x_train = scaler.fit_transform(x_train)
# x_valid = scaler.transform(x_valid)

x_train, x_valid = x_train.todense(), x_valid.todense()

model = MultinomialNB()
model.fit(x_train, y_train)

prob_predictions = model.predict_proba(x_valid)
lloss = log_loss(y_valid, prob_predictions)
losses.append(lloss)

print(f"\tLog loss: {lloss}")
print(80*'.')

print(80*'-')

print(f"Min loss \t{min(losses)}")
print(f"Max loss \t{max(losses)}")
print(f"Mean loss\t{np.mean(losses)}")



	Log loss: 0.9541224343739063
................................................................................
--------------------------------------------------------------------------------
Min loss 	0.9541224343739063
Max loss 	0.9541224343739063
Mean loss	0.9541224343739063




### Transformers

In [16]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
from transformers import TFAutoModelForSequenceClassification

In [18]:
MAX_LEN = 512
BATCH_SIZE = 16

In [19]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [20]:
tokenizer.sep_token

'[SEP]'

In [21]:
def tokenize_data(sentences):
    return tokenizer(sentences, truncation = True, add_special_tokens=True, padding = True)

In [28]:
X_train_ = X_train['discourse_text'].str.map(tokenize_data).tolist()
X_test_ = X_test['discourse_text'].map(tokenize_data).tolist()
Y_train_= Y_train.tolist()
Y_test_ = Y_test.tolist()

AttributeError: 'StringMethods' object has no attribute 'map'

In [23]:
model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels = 3)

2022-08-17 15:15:27.139028: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-08-17 15:15:28.309711: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-17 15:15:28.312927: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-17 15:15:28.313036: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-17 15:15:28.313349: I tensorflow/core/

In [24]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 108,312,579
Trainable params: 108,312,579
Non-trainable params: 0
_________________________________________________________________


In [25]:
import tensorflow as tf

In [26]:
model.compile()

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [27]:
model.fit(X_train_, Y_train_)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'transformers.tokenization_utils_base.BatchEncoding'>"}), (<class 'list'> containing values of types {"<class 'int'>"})

In [53]:
x_train_.shape

(29412,)

## Shallow features (word-count, syllables, sentences, etc.)

In [20]:

STOPWORDS = set(stopwords.words('english'))
DICTIONARY = Dictionary.from_files('en_US')
PUNCTUATIONS = set(list(punctuation))
PUNCTUATIONS.update("`")
PUNCTUATIONS.update("'")
CONTRACTIONS = contractions.contractions_dict
CONTRACTIONS["It'll"] = "It will"
CONTRACTIONS = {key.capitalize(): value for key, value in CONTRACTIONS.items()}


def check_word(token):
    if DICTIONARY.lookup(token):
        return 1
    else:
        return 0

def replace_contractions(text):

    
    for key, value in CONTRACTIONS.items():
        # Upper-case
        text = text.replace(key, value)
        # Lower-case 
        text = text.replace(key.lower(), value.lower())

    # Remove possesives as well 
    text = text.replace("'s", "")

    # Remove unnecessary whitespaces
    text = re.sub(' +', ' ', text)
    
    return text 

NOT_MISTAKES = ['landform', 'driverless']

def remove_empty_tokens(list_of_tokens):
    list_of_tokens = list(filter(None, list_of_tokens))
    return list_of_tokens

def check_in_dictionary(sentences):

    # If the contraction can be removed, than it means it's correct; consequently, only possibly incorrect words remain

    tokenized_sentences = word_tokenize(sentences)
    correct_tokens = [check_word(token) if (token not in PUNCTUATIONS) and (token not in NOT_MISTAKES)  else 1 for token in tokenized_sentences]

    return correct_tokens

def get_incorrect_indices(correct_word_list):
    return [idx for idx, value in enumerate(correct_word_list) if not value]

def get_incorrect_words(words_list, indices):
    return [words_list[idx] for idx in indices if words_list[idx] ]


In [21]:
def find_relative_position(argument, essay):
    essay_length = len(essay)
    split_essay = essay.split(argument.strip())
    
    if split_essay[0] == essay:
        return (0,0)

    if split_essay ==  ['', '']:
        return (0,1)
    
    if split_essay[0] == '':
        return (0, (essay_length - len(split_essay[1])) / essay_length)
    
    try:
        if split_essay[1] == '':  
            return ((essay_length - len(split_essay[0])) / essay_length, 1)
    except:
        return None
    else:
        return (
            len(split_essay[0]) / essay_length,
            (len(split_essay[0]) + len(argument)) / essay_length)

In [22]:
X_train['sentences'] = X_train.discourse_text.map(sent_tokenize)
X_train['words'] = X_train.sentences.apply(lambda sentences: list(chain(*[word_tokenize(sentence) for sentence in sentences])))
X_train['words'] = X_train.words.apply(lambda tokens: [token for token in tokens if (token not in PUNCTUATIONS and token != '')])
X_train['syllables'] = X_train.words.apply(lambda tokens: list(chain(*[hyphenate_word(token) for token in tokens])))

In [23]:
X_train['text_'] = X_train.discourse_text.map(replace_contractions)
X_train['tokenized_sentences'] = X_train.text_.map(word_tokenize)
X_train['correct_list'] = X_train.text_.map(check_in_dictionary)
X_train['incorrect_indices'] = X_train.correct_list.map(get_incorrect_indices)
X_train['incorrect_words'] = X_train.apply(lambda x: get_incorrect_words(x.tokenized_sentences, x.incorrect_indices), axis = 1)

X_train['incorrect_words_len'] = X_train.incorrect_words.map(len)
X_train['relative_position'] = X_train.apply(lambda x: find_relative_position(argument = x.discourse_text, essay = x.essay_text), axis = 1)
X_train[['rp_start','rp_end']] = pd.DataFrame(X_train['relative_position'].tolist(), index = X_train.index)

X_train['sent_count'] = X_train.sentences.map(len)
X_train['word_count'] = X_train.words.map(len)
X_train['syll_count'] = X_train.syllables.map(len)

X_train['words_per_sentences'] = X_train['sent_count'] / X_train['word_count']
X_train['syll_per_sentences'] = X_train['sent_count'] / X_train['syll_count'] 
X_train['syll_per_words'] = X_train['word_count'] / X_train['syll_count']


In [61]:
lr = LogisticRegression(max_iter = 1000)
xgb = XGBClassifier()
lgbm = LGBMClassifier()
extra = ExtraTreesClassifier(n_jobs=-1)
hgbm = HistGradientBoostingClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier(n_jobs = -1)

In [83]:
models = [lr, dt, rf, xgb, lgbm, extra]
models_2 = [hgbm]

In [84]:
metrics_dict = {}

ohe = OneHotEncoder()
text_vectorizer = TfidfVectorizer()
standard_scaler = StandardScaler()


for model in models:
    model_name = type(model).__name__
    
    print()
    print(80*'*')
    print(80*'*')

    print(f"{model_name}")
    print()
    print(model.__str__())
    
    stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
    losses = []

    total_mcc = 0
    total_acc = 0
    for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
        print(f"\nFold # {idx + 1}")

        x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
        x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

        y_train = y_train.to_numpy()
        y_valid = y_valid.to_numpy()

        text_vectorizer.fit(x_train.essay_text.drop_duplicates())

        vectorized_discourse_text_train = text_vectorizer.transform(x_train.discourse_text)
        vectorized_discourse_text_valid = text_vectorizer.transform(x_valid.discourse_text)

        ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
        ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

        incorrect_count_train = standard_scaler.fit_transform(x_train['incorrect_words_len'].to_numpy().reshape(-1, 1))
        incorrect_count_valid = standard_scaler.transform(x_valid['incorrect_words_len'].to_numpy().reshape(-1, 1))

        rp_start_train = x_train['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_train = x_train['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_train = x_train['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_train = x_train['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_train = x_train['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        
        rp_start_valid = x_valid['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_valid = x_valid['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_valid = x_valid['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_valid = x_valid['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_valid = x_valid['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)

        x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train, incorrect_count_train, rp_start_train, rp_end_train, wps_train, sps_train, spw_train])
        x_valid = sparse.hstack([vectorized_discourse_text_valid, ohe_discourse_type_test, incorrect_count_valid,  rp_start_valid, rp_end_valid, wps_valid, sps_valid, spw_valid])
        
        if model_name == 'HistGradientBoostingClassifier':
            x_train = x_train.todense()
            x_valid = x_valid.todense()

        model.fit(x_train, y_train)

        prob_predictions = model.predict_proba(x_valid)
        actual_pred = model.predict(x_valid)

        lloss = log_loss(y_valid, prob_predictions)
        losses.append(lloss)

        acc = accuracy_score(y_valid, actual_pred)
        mcc = matthews_corrcoef(y_valid, actual_pred)

        print(f"\tLog loss: {lloss}")
        print(f"\tAccuracy: {acc}")
        print(f"\tMCC: {mcc}")
        
        total_mcc += mcc
        total_acc += acc
        
        print(80*'.')

    mean_mcc = total_mcc / 5
    mean_acc = total_acc / 5

    metrics_dict[model_name] = {
        'min_loss': min(losses),
        'max_loss': max(losses),
        'mean_loss': np.mean(losses),
        'mean_acc': mean_acc, 
        'mean_mcc': mean_mcc,

    } 
    
    print()
    print(80*'*')

    print()

    print(f"Min loss \t{min(losses)}")
    print(f"Max loss \t{max(losses)}")
    print(f"Mean loss\t{np.mean(losses)}")

    print()
    


********************************************************************************
********************************************************************************
LogisticRegression

LogisticRegression(max_iter=1000)

Fold # 1
	Log loss: 0.7721808983819242
	Accuracy: 0.6561278259391466
	MCC: 0.3564077227754731
................................................................................

Fold # 2
	Log loss: 0.7668818205606772
	Accuracy: 0.6562978072412035
	MCC: 0.3564633335267342
................................................................................

Fold # 3
	Log loss: 0.7733801027402161
	Accuracy: 0.6521591295477729
	MCC: 0.34763770809798955
................................................................................

Fold # 4
	Log loss: 0.7655158225723563
	Accuracy: 0.6591295477728664
	MCC: 0.36007840144893005
................................................................................

Fold # 5
	Log loss: 0.748670305385369
	Accuracy: 0.6725603536212172
	MCC: 0.



	Log loss: 0.7651672387189233
	Accuracy: 0.6632670406255312
	MCC: 0.364647974258482
................................................................................

Fold # 2




	Log loss: 0.7636089110450549
	Accuracy: 0.665646778854326
	MCC: 0.36980306000678254
................................................................................

Fold # 3




	Log loss: 0.7630455431498148
	Accuracy: 0.6628697721863311
	MCC: 0.36315388197981685
................................................................................

Fold # 4




	Log loss: 0.7668111024286128
	Accuracy: 0.6524991499489969
	MCC: 0.34127705791147883
................................................................................

Fold # 5




	Log loss: 0.7425925319916824
	Accuracy: 0.6766405984359062
	MCC: 0.3940122756129144
................................................................................

********************************************************************************

Min loss 	0.7425925319916824
Max loss 	0.7668111024286128
Mean loss	0.7602450654668177


********************************************************************************
********************************************************************************
LGBMClassifier

LGBMClassifier()

Fold # 1




	Log loss: 0.7575762227301074
	Accuracy: 0.6683664796872344
	MCC: 0.37732358678605343
................................................................................

Fold # 2




	Log loss: 0.7529550437993643
	Accuracy: 0.6676865544790073
	MCC: 0.3759427085244381
................................................................................

Fold # 3




	Log loss: 0.7580091366374772
	Accuracy: 0.6652499149948997
	MCC: 0.3703807723027055
................................................................................

Fold # 4




	Log loss: 0.7587350748385786
	Accuracy: 0.6582794967698062
	MCC: 0.35566319911085575
................................................................................

Fold # 5




	Log loss: 0.7331417992302278
	Accuracy: 0.6780006800408025
	MCC: 0.3975987148837296
................................................................................

********************************************************************************

Min loss 	0.7331417992302278
Max loss 	0.7587350748385786
Mean loss	0.7520834554471512


********************************************************************************
********************************************************************************
ExtraTreesClassifier

ExtraTreesClassifier(n_jobs=-1)

Fold # 1
	Log loss: 0.8458589326515719
	Accuracy: 0.6304606493285738
	MCC: 0.2826978634933263
................................................................................

Fold # 2
	Log loss: 0.807802068098926
	Accuracy: 0.6394696583375828
	MCC: 0.3083945411394423
................................................................................

Fold # 3
	Log loss: 0.8264758901019268
	Accuracy: 0.634308058483509
	MCC: 0.29644271301959096


## POS features

In [64]:
import spacy
nlp = spacy.load('en_core_web_lg', disable=['lemmatizer', 'tok2vec'])

2022-08-23 04:53:40.857132: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-23 04:53:40.873251: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-08-23 04:53:40.873263: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [65]:
#### This should be replaced by a pipe, for faster processing

def extract_features(text):
    doc = nlp(text)
    tokens_ = []
    poss_ = []
    tags_ = []
    morphs_ = []

    for token in doc:
        tokens_.append(token.text)
        poss_.append(token.pos_)
        tags_.append(token.tag_)
        morphs_.append(token.morph)

    return (tokens_, poss_, tags_, morphs_)
    

In [72]:
X_train['discourse_features_all'] = X_train.discourse_text.map(extract_features)
X_train['essay_features_all'] =  X_train.essay_text.map(extract_features)
t_features = X_train.discourse_features_all.apply(pd.Series)
t_features.columns = ['discourse_tokens', 'discourse_pos', 'discourse_tag', 'discourse_morph']

e_features = X_train.essay_features_all.apply(pd.Series)
e_features.columns = ['essay_tokens', 'essay_pos', 'essay_tag', 'essay_morph']

X_train = pd.concat([X_train, t_features, e_features], axis = 1)

## POS and shallow

In [95]:

metrics_dict_2 = {}

ohe = OneHotEncoder()

text_vectorizer = TfidfVectorizer()
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
standard_scaler = StandardScaler()


for model in models:
    model_name = type(model).__name__
    
    print()
    print(80*'*')
    print(80*'*')

    print(f"{model_name}")
    print()
    print(model.__str__())
    
    stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
    losses = []

    total_mcc = 0
    total_acc = 0
    
    for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
        print(f"\nFold # {idx + 1}")

        x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
        x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

        y_train = y_train.to_numpy()
        y_valid = y_valid.to_numpy()

        # text_vectorizer.fit(x_train.essay_text.drop_duplicates())

        # vectorized_discourse_text_train = text_vectorizer.transform(x_train.discourse_text)
        # vectorized_discourse_text_valid = text_vectorizer.transform(x_valid.discourse_text)

        ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
        ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

        incorrect_count_train = standard_scaler.fit_transform(x_train['incorrect_words_len'].to_numpy().reshape(-1, 1))
        incorrect_count_valid = standard_scaler.transform(x_valid['incorrect_words_len'].to_numpy().reshape(-1, 1))

        rp_start_train = x_train['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_train = x_train['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_train = x_train['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_train = x_train['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_train = x_train['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        
        rp_start_valid = x_valid['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_valid = x_valid['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_valid = x_valid['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_valid = x_valid['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_valid = x_valid['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)

        pos_features_train = pos_vectorizer.fit_transform(x_train.discourse_pos.map(' '.join))
        pos_features_valid = pos_vectorizer.transform(x_valid.discourse_pos.map(' '.join))

        x_train = sparse.hstack([pos_features_train, ohe_discourse_type_train, incorrect_count_train, rp_start_train, rp_end_train, wps_train, sps_train, spw_train])
        x_valid = sparse.hstack([pos_features_valid, ohe_discourse_type_test, incorrect_count_valid,  rp_start_valid, rp_end_valid, wps_valid, sps_valid, spw_valid])
        

        model.fit(x_train, y_train)

        prob_predictions = model.predict_proba(x_valid)
        actual_pred = model.predict(x_valid)

        lloss = log_loss(y_valid, prob_predictions)
        losses.append(lloss)

        acc = accuracy_score(y_valid, actual_pred)
        mcc = matthews_corrcoef(y_valid, actual_pred)

        print(f"\tLog loss: {lloss}")
        print(f"\tAccuracy: {acc}")
        print(f"\tMCC: {mcc}")
        
        total_mcc += mcc
        total_acc += acc
        
        print(80*'.')

    mean_mcc = total_mcc / 5
    mean_acc = total_acc / 5

    metrics_dict_2[model_name] = {
        'min_loss': min(losses),
        'max_loss': max(losses),
        'mean_loss': np.mean(losses),
        'mean_acc': mean_acc, 
        'mean_mcc': mean_mcc,

    } 
    
    print()
    print(80*'*')

    print()

    print(f"Min loss \t{min(losses)}")
    print(f"Max loss \t{max(losses)}")
    print(f"Mean loss\t{np.mean(losses)}")

    print()
    


********************************************************************************
********************************************************************************
LogisticRegression

LogisticRegression(max_iter=1000)

Fold # 1
	Log loss: 0.9101333378745715
	Accuracy: 0.588305286418494
	MCC: 0.15493537236203594
................................................................................

Fold # 2
	Log loss: 0.9039179078280468
	Accuracy: 0.5894951555328913
	MCC: 0.15816879749434298
................................................................................

Fold # 3
	Log loss: 0.9066648100775381
	Accuracy: 0.5855151309078545
	MCC: 0.14418550002987351
................................................................................

Fold # 4
	Log loss: 0.8998276693664443
	Accuracy: 0.5904454267256035
	MCC: 0.1611357361819912
................................................................................

Fold # 5
	Log loss: 0.8981872979545485
	Accuracy: 0.5894253655219314
	MCC: 0



	Log loss: 0.8030043816835163
	Accuracy: 0.6442291347951725
	MCC: 0.32555479924067204
................................................................................

Fold # 2




	Log loss: 0.8020393522056166
	Accuracy: 0.6477987421383647
	MCC: 0.333413352427533
................................................................................

Fold # 3




	Log loss: 0.8036751204801605
	Accuracy: 0.6445086705202312
	MCC: 0.3247963435167674
................................................................................

Fold # 4




	Log loss: 0.8042295533406955
	Accuracy: 0.6417885073104386
	MCC: 0.3192972059039802
................................................................................

Fold # 5




	Log loss: 0.7635146597786169
	Accuracy: 0.6640598435906154
	MCC: 0.36858025835773917
................................................................................

********************************************************************************

Min loss 	0.7635146597786169
Max loss 	0.8042295533406955
Mean loss	0.7952926134977212


********************************************************************************
********************************************************************************
LGBMClassifier

LGBMClassifier()

Fold # 1




	Log loss: 0.7938285454655596
	Accuracy: 0.6505184429712731
	MCC: 0.335480730100658
................................................................................

Fold # 2




	Log loss: 0.79568594555847
	Accuracy: 0.6537480877103519
	MCC: 0.3420156680336404
................................................................................

Fold # 3




	Log loss: 0.7930764940912804
	Accuracy: 0.6506290377422645
	MCC: 0.3343761742235196
................................................................................

Fold # 4




	Log loss: 0.7989706657272928
	Accuracy: 0.6431485889153349
	MCC: 0.31845219801871444
................................................................................

Fold # 5




	Log loss: 0.7649250616062437
	Accuracy: 0.6696701802108126
	MCC: 0.3779438711047319
................................................................................

********************************************************************************

Min loss 	0.7649250616062437
Max loss 	0.7989706657272928
Mean loss	0.7892973424897693


********************************************************************************
********************************************************************************
ExtraTreesClassifier

ExtraTreesClassifier(n_jobs=-1)

Fold # 1
	Log loss: 1.1351061811595624
	Accuracy: 0.5908550059493456
	MCC: 0.23287827086214702
................................................................................

Fold # 2
	Log loss: 1.2130840598216484
	Accuracy: 0.59799422063573
	MCC: 0.24920711346594696
................................................................................

Fold # 3
	Log loss: 1.130443113558469
	Accuracy: 0.5921455287317239
	MCC: 0.23479561442059757

## Text-features, POS and shallow..

In [97]:
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
metrics_dict_3 = {}

ohe = OneHotEncoder()
text_vectorizer = TfidfVectorizer()
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
standard_scaler = StandardScaler()


for model in models:
    model_name = type(model).__name__
    
    print()
    print(80*'*')
    print(80*'*')

    print(f"{model_name}")
    print()
    print(model.__str__())
    
    stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
    losses = []

    total_mcc = 0
    total_acc = 0
    for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
        print(f"\nFold # {idx + 1}")

        x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
        x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

        y_train = y_train.to_numpy()
        y_valid = y_valid.to_numpy()

        text_vectorizer.fit(x_train.essay_text.drop_duplicates())

        vectorized_discourse_text_train = text_vectorizer.transform(x_train.discourse_text)
        vectorized_discourse_text_valid = text_vectorizer.transform(x_valid.discourse_text)

        ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
        ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

        incorrect_count_train = standard_scaler.fit_transform(x_train['incorrect_words_len'].to_numpy().reshape(-1, 1))
        incorrect_count_valid = standard_scaler.transform(x_valid['incorrect_words_len'].to_numpy().reshape(-1, 1))

        rp_start_train = x_train['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_train = x_train['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_train = x_train['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_train = x_train['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_train = x_train['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        
        rp_start_valid = x_valid['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_valid = x_valid['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_valid = x_valid['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_valid = x_valid['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_valid = x_valid['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)

        pos_features_train = pos_vectorizer.fit_transform(x_train.discourse_pos.map(' '.join))
        pos_features_valid = pos_vectorizer.transform(x_valid.discourse_pos.map(' '.join))

        x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train, pos_features_train, incorrect_count_train, rp_start_train, rp_end_train, wps_train, sps_train, spw_train])
        x_valid = sparse.hstack([vectorized_discourse_text_valid, ohe_discourse_type_test, pos_features_valid, incorrect_count_valid,  rp_start_valid, rp_end_valid, wps_valid, sps_valid, spw_valid])
        

        model.fit(x_train, y_train)

        prob_predictions = model.predict_proba(x_valid)
        actual_pred = model.predict(x_valid)

        lloss = log_loss(y_valid, prob_predictions)
        losses.append(lloss)

        acc = accuracy_score(y_valid, actual_pred)
        mcc = matthews_corrcoef(y_valid, actual_pred)

        print(f"\tLog loss: {lloss}")
        print(f"\tAccuracy: {acc}")
        print(f"\tMCC: {mcc}")
        
        total_mcc += mcc
        total_acc += acc
        
        print(80*'.')

    mean_mcc = total_mcc / 5
    mean_acc = total_acc / 5

    metrics_dict_3[model_name] = {
        'min_loss': min(losses),
        'max_loss': max(losses),
        'mean_loss': np.mean(losses),
        'mean_acc': mean_acc, 
        'mean_mcc': mean_mcc,

    } 
    
    print()
    print(80*'*')

    print()

    print(f"Min loss \t{min(losses)}")
    print(f"Max loss \t{max(losses)}")
    print(f"Mean loss\t{np.mean(losses)}")

    print()
    
    


********************************************************************************
********************************************************************************
LogisticRegression

LogisticRegression(max_iter=1000)

Fold # 1
	Log loss: 0.7705963692005907
	Accuracy: 0.65595784463709
	MCC: 0.35665416016647433
................................................................................

Fold # 2
	Log loss: 0.7636174660116537
	Accuracy: 0.6579976202617712
	MCC: 0.36028031492572027
................................................................................

Fold # 3
	Log loss: 0.7721678256656753
	Accuracy: 0.6540292417545053
	MCC: 0.3517270497635027
................................................................................

Fold # 4
	Log loss: 0.7633721651470433
	Accuracy: 0.6584495069704183
	MCC: 0.35914826043801795
................................................................................

Fold # 5
	Log loss: 0.7468061674669159
	Accuracy: 0.6720503230193812
	MCC: 0.



	Log loss: 0.7597860706197634
	Accuracy: 0.6613972463029066
	MCC: 0.36079040565462545
................................................................................

Fold # 2




	Log loss: 0.7577922335834093
	Accuracy: 0.6624171341152473
	MCC: 0.3643572994472992
................................................................................

Fold # 3




	Log loss: 0.7617651269594536
	Accuracy: 0.6608296497789867
	MCC: 0.35921050700623275
................................................................................

Fold # 4




	Log loss: 0.7584690873914334
	Accuracy: 0.6613396803808228
	MCC: 0.36085557756036957
................................................................................

Fold # 5




	Log loss: 0.7377060311779143
	Accuracy: 0.6764705882352942
	MCC: 0.3938447383885428
................................................................................

********************************************************************************

Min loss 	0.7377060311779143
Max loss 	0.7617651269594536
Mean loss	0.7551037099463949


********************************************************************************
********************************************************************************
LGBMClassifier

LGBMClassifier()

Fold # 1




	Log loss: 0.7480765381457647
	Accuracy: 0.6710861805201428
	MCC: 0.38323858997953253
................................................................................

Fold # 2




	Log loss: 0.7464304527094678
	Accuracy: 0.6690464048954615
	MCC: 0.37897039229049634
................................................................................

Fold # 3




	Log loss: 0.7520788679666738
	Accuracy: 0.6642298537912275
	MCC: 0.36796321524405856
................................................................................

Fold # 4




	Log loss: 0.7472275531429659
	Accuracy: 0.6660999659979598
	MCC: 0.37256274388486027
................................................................................

Fold # 5




	Log loss: 0.7244428339683451
	Accuracy: 0.6827609656579394
	MCC: 0.4076017648952945
................................................................................

********************************************************************************

Min loss 	0.7244428339683451
Max loss 	0.7520788679666738
Mean loss	0.7436512491866434


********************************************************************************
********************************************************************************
ExtraTreesClassifier

ExtraTreesClassifier(n_jobs=-1)

Fold # 1
	Log loss: 0.8253641548034651
	Accuracy: 0.6350501444841068
	MCC: 0.2947747407261423
................................................................................

Fold # 2
	Log loss: 0.8105449935896141
	Accuracy: 0.6450790413054563
	MCC: 0.32474878754390457
................................................................................

Fold # 3
	Log loss: 0.8119073776167025
	Accuracy: 0.6363481808908534
	MCC: 0.301032700922472

In [99]:
X_train['contains_question'] = X_train.discourse_text.str.contains('\?').astype(int)

In [103]:
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
metrics_dict_4 = {}

ohe = OneHotEncoder()
text_vectorizer = TfidfVectorizer()
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
standard_scaler = StandardScaler()


for model in models:
    model_name = type(model).__name__
    
    print()
    print(80*'*')
    print(80*'*')

    print(f"{model_name}")
    print()
    print(model.__str__())
    
    stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
    losses = []

    total_mcc = 0
    total_acc = 0
    for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
        print(f"\nFold # {idx + 1}")

        x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
        x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

        y_train = y_train.to_numpy()
        y_valid = y_valid.to_numpy()

        text_vectorizer.fit(x_train.essay_text.drop_duplicates())

        vectorized_discourse_text_train = text_vectorizer.transform(x_train.discourse_text)
        vectorized_discourse_text_valid = text_vectorizer.transform(x_valid.discourse_text)

        ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
        ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

        incorrect_count_train = standard_scaler.fit_transform(x_train['incorrect_words_len'].to_numpy().reshape(-1, 1))
        incorrect_count_valid = standard_scaler.transform(x_valid['incorrect_words_len'].to_numpy().reshape(-1, 1))

        rp_start_train = x_train['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_train = x_train['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_train = x_train['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_train = x_train['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_train = x_train['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        cqt_train = x_train['contains_question'].to_numpy().reshape(-1, 1).astype(np.float32)

        rp_start_valid = x_valid['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_valid = x_valid['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_valid = x_valid['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_valid = x_valid['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_valid = x_valid['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        cqt_valid = x_valid['contains_question'].to_numpy().reshape(-1, 1).astype(np.float32)

        pos_features_train = pos_vectorizer.fit_transform(x_train.discourse_pos.map(' '.join))
        pos_features_valid = pos_vectorizer.transform(x_valid.discourse_pos.map(' '.join))

        x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train, pos_features_train, incorrect_count_train, rp_start_train, rp_end_train, wps_train, sps_train, spw_train,cqt_train])
        x_valid = sparse.hstack([vectorized_discourse_text_valid, ohe_discourse_type_test, pos_features_valid, incorrect_count_valid,  rp_start_valid, rp_end_valid, wps_valid, sps_valid, spw_valid, cqt_valid])
        

        model.fit(x_train, y_train)

        prob_predictions = model.predict_proba(x_valid)
        actual_pred = model.predict(x_valid)

        lloss = log_loss(y_valid, prob_predictions)
        losses.append(lloss)

        acc = accuracy_score(y_valid, actual_pred)
        mcc = matthews_corrcoef(y_valid, actual_pred)

        print(f"\tLog loss: {lloss}")
        print(f"\tAccuracy: {acc}")
        print(f"\tMCC: {mcc}")
        
        total_mcc += mcc
        total_acc += acc
        
        print(80*'.')

    mean_mcc = total_mcc / 5
    mean_acc = total_acc / 5

    metrics_dict_4[model_name] = {
        'min_loss': min(losses),
        'max_loss': max(losses),
        'mean_loss': np.mean(losses),
        'mean_acc': mean_acc, 
        'mean_mcc': mean_mcc,

    } 
    
    print()
    print(80*'*')

    print()

    print(f"Min loss \t{min(losses)}")
    print(f"Max loss \t{max(losses)}")
    print(f"Mean loss\t{np.mean(losses)}")

    print()


********************************************************************************
********************************************************************************
LogisticRegression

LogisticRegression(max_iter=1000)

Fold # 1
	Log loss: 0.7703378799719015
	Accuracy: 0.6540880503144654
	MCC: 0.35304907836407257
................................................................................

Fold # 2
	Log loss: 0.7633629428366933
	Accuracy: 0.6562978072412035
	MCC: 0.3568880725703735
................................................................................

Fold # 3
	Log loss: 0.7716869968822638
	Accuracy: 0.6545392723563414
	MCC: 0.3527759213405058
................................................................................

Fold # 4
	Log loss: 0.7633300080192839
	Accuracy: 0.657089425365522
	MCC: 0.3565296453431742
................................................................................

Fold # 5
	Log loss: 0.7472652309513227
	Accuracy: 0.6722203332199932
	MCC: 0.3



	Log loss: 0.7588666997212438
	Accuracy: 0.6622471528131906
	MCC: 0.36290241863720907
................................................................................

Fold # 2




	Log loss: 0.7578434216565579
	Accuracy: 0.6666666666666666
	MCC: 0.372973505499524
................................................................................

Fold # 3




	Log loss: 0.7612158324536926
	Accuracy: 0.661849710982659
	MCC: 0.36137251241922014
................................................................................

Fold # 4




	Log loss: 0.7598843677089347
	Accuracy: 0.662019721183271
	MCC: 0.3624519983889905
................................................................................

Fold # 5




	Log loss: 0.7370382852622601
	Accuracy: 0.6780006800408025
	MCC: 0.3970556163056401
................................................................................

********************************************************************************

Min loss 	0.7370382852622601
Max loss 	0.7612158324536926
Mean loss	0.7549697213605379


********************************************************************************
********************************************************************************
LGBMClassifier

LGBMClassifier()

Fold # 1




	Log loss: 0.7463377983595622
	Accuracy: 0.6707462179160293
	MCC: 0.38249454356819035
................................................................................

Fold # 2




	Log loss: 0.7458324841580299
	Accuracy: 0.667176610572837
	MCC: 0.37495120284266054
................................................................................

Fold # 3




	Log loss: 0.751389072621329
	Accuracy: 0.6678000680040802
	MCC: 0.3755633444332696
................................................................................

Fold # 4




	Log loss: 0.7483308170082826
	Accuracy: 0.6630397823869432
	MCC: 0.3664218172744422
................................................................................

Fold # 5




	Log loss: 0.7225697317441382
	Accuracy: 0.6863311798707923
	MCC: 0.41517582225676947
................................................................................

********************************************************************************

Min loss 	0.7225697317441382
Max loss 	0.751389072621329
Mean loss	0.7428919807782683


********************************************************************************
********************************************************************************
ExtraTreesClassifier

ExtraTreesClassifier(n_jobs=-1)

Fold # 1
	Log loss: 0.8145978120297883
	Accuracy: 0.6350501444841068
	MCC: 0.29513577473581926
................................................................................

Fold # 2
	Log loss: 0.8109692759728797
	Accuracy: 0.6449090600033996
	MCC: 0.32425837727235957
................................................................................

Fold # 3
	Log loss: 0.8116481749067733
	Accuracy: 0.6371982318939137
	MCC: 0.30347702886802

In [123]:
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
metrics_dict_5 = {}

ohe = OneHotEncoder()
text_vectorizer = TfidfVectorizer(ngram_range=(1,2))
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
standard_scaler = StandardScaler()


for model in models:
    model_name = type(model).__name__
    
    print()
    print(80*'*')
    print(80*'*')

    print(f"{model_name}")
    print()
    print(model.__str__())
    
    stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
    losses = []

    total_mcc = 0
    total_acc = 0
    for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
        print(f"\nFold # {idx + 1}")

        x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
        x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

        y_train = y_train.to_numpy()
        y_valid = y_valid.to_numpy()

        text_vectorizer.fit(x_train.essay_text.drop_duplicates())

        vectorized_discourse_text_train = text_vectorizer.transform(x_train.discourse_text)
        vectorized_discourse_text_valid = text_vectorizer.transform(x_valid.discourse_text)

        ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
        ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

        incorrect_count_train = standard_scaler.fit_transform(x_train['incorrect_words_len'].to_numpy().reshape(-1, 1))
        incorrect_count_valid = standard_scaler.transform(x_valid['incorrect_words_len'].to_numpy().reshape(-1, 1))

        rp_start_train = x_train['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_train = x_train['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_train = x_train['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_train = x_train['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_train = x_train['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        cqt_train = x_train['contains_question'].to_numpy().reshape(-1, 1).astype(np.float32)

        rp_start_valid = x_valid['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_valid = x_valid['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_valid = x_valid['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_valid = x_valid['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_valid = x_valid['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        cqt_valid = x_valid['contains_question'].to_numpy().reshape(-1, 1).astype(np.float32)

        pos_features_train = pos_vectorizer.fit_transform(x_train.discourse_pos.map(' '.join))
        pos_features_valid = pos_vectorizer.transform(x_valid.discourse_pos.map(' '.join))

        x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train, pos_features_train, incorrect_count_train, rp_start_train, rp_end_train, wps_train, sps_train, spw_train,cqt_train])
        x_valid = sparse.hstack([vectorized_discourse_text_valid, ohe_discourse_type_test, pos_features_valid, incorrect_count_valid,  rp_start_valid, rp_end_valid, wps_valid, sps_valid, spw_valid, cqt_valid])
        

        model.fit(x_train, y_train)

        prob_predictions = model.predict_proba(x_valid)
        actual_pred = model.predict(x_valid)

        lloss = log_loss(y_valid, prob_predictions)
        losses.append(lloss)

        acc = accuracy_score(y_valid, actual_pred)
        mcc = matthews_corrcoef(y_valid, actual_pred)

        print(f"\tLog loss: {lloss}")
        print(f"\tAccuracy: {acc}")
        print(f"\tMCC: {mcc}")
        
        total_mcc += mcc
        total_acc += acc
        
        print(80*'.')

    mean_mcc = total_mcc / 5
    mean_acc = total_acc / 5

    metrics_dict_5[model_name] = {
        'min_loss': min(losses),
        'max_loss': max(losses),
        'mean_loss': np.mean(losses),
        'mean_acc': mean_acc, 
        'mean_mcc': mean_mcc,

    } 
    
    print()
    print(80*'*')

    print()

    print(f"Min loss \t{min(losses)}")
    print(f"Max loss \t{max(losses)}")
    print(f"Mean loss\t{np.mean(losses)}")

    print()


********************************************************************************
********************************************************************************
LogisticRegression

LogisticRegression(max_iter=1000)

Fold # 1
	Log loss: 0.769322096239571
	Accuracy: 0.6605473397926228
	MCC: 0.3591225364127667
................................................................................

Fold # 2
	Log loss: 0.7583666222877341
	Accuracy: 0.6668366479687234
	MCC: 0.3728724282481718
................................................................................

Fold # 3
	Log loss: 0.7659107129855988
	Accuracy: 0.6611696701802108
	MCC: 0.3592661340796329
................................................................................

Fold # 4
	Log loss: 0.7633638855592403
	Accuracy: 0.661849710982659
	MCC: 0.36115421525829017
................................................................................

Fold # 5
	Log loss: 0.7461715144799874
	Accuracy: 0.6780006800408025
	MCC: 0.39



	Log loss: 0.7581591598863536
	Accuracy: 0.6651368349481557
	MCC: 0.3687684128938171
................................................................................

Fold # 2




	Log loss: 0.7547931606412256
	Accuracy: 0.669386367499575
	MCC: 0.3777110804715972
................................................................................

Fold # 3




	Log loss: 0.7608451558775289
	Accuracy: 0.6581094865691941
	MCC: 0.35293863264965264
................................................................................

Fold # 4




	Log loss: 0.7571708789185216
	Accuracy: 0.6591295477728664
	MCC: 0.3566557567455144
................................................................................

Fold # 5




	Log loss: 0.7376107596580113
	Accuracy: 0.6739204352261136
	MCC: 0.3877212332047436
................................................................................

********************************************************************************

Min loss 	0.7376107596580113
Max loss 	0.7608451558775289
Mean loss	0.7537158229963282


********************************************************************************
********************************************************************************
LGBMClassifier

LGBMClassifier()

Fold # 1




	Log loss: 0.7498311856040399
	Accuracy: 0.670916199218086
	MCC: 0.3824801292539629
................................................................................

Fold # 2




	Log loss: 0.7438533488315806
	Accuracy: 0.6734659187489376
	MCC: 0.388042246562318
................................................................................

Fold # 3




	Log loss: 0.754271322949002
	Accuracy: 0.6688201292077525
	MCC: 0.3773596999285909
................................................................................

Fold # 4




	Log loss: 0.7491561124740119
	Accuracy: 0.666439986399184
	MCC: 0.3727049577248767
................................................................................

Fold # 5




	Log loss: 0.7260123470604949
	Accuracy: 0.6825909554573274
	MCC: 0.407233405989076
................................................................................

********************************************************************************

Min loss 	0.7260123470604949
Max loss 	0.754271322949002
Mean loss	0.7446248633838259


********************************************************************************
********************************************************************************
ExtraTreesClassifier

ExtraTreesClassifier(n_jobs=-1)

Fold # 1
	Log loss: 0.8698432255259411
	Accuracy: 0.6156722760496346
	MCC: 0.24250293860968952
................................................................................

Fold # 2
	Log loss: 0.8486004091194415
	Accuracy: 0.6207717151113378
	MCC: 0.26161676105245085
................................................................................

Fold # 3
	Log loss: 0.8594565955300117
	Accuracy: 0.6162869772186331
	MCC: 0.2482468163806703

In [124]:
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
metrics_dict_6 = {}

ohe = OneHotEncoder()
text_vectorizer = TfidfVectorizer()
pos_vectorizer = TfidfVectorizer(ngram_range=(1,4))
standard_scaler = StandardScaler()


for model in models_2:
    model_name = type(model).__name__
    
    print()
    print(80*'*')
    print(80*'*')

    print(f"{model_name}")
    print()
    print(model.__str__())
    
    stratified_k = StratifiedKFold(n_splits = 5, shuffle=True, random_state = RANDOM_SEED)
    losses = []

    total_mcc = 0
    total_acc = 0
    for idx, (train_idxs, valid_idxs) in enumerate(stratified_k.split(X_train, Y_train)):
        print(f"\nFold # {idx + 1}")

        x_train, y_train = X_train.iloc[train_idxs], Y_train.iloc[train_idxs]
        x_valid, y_valid = X_train.iloc[valid_idxs], Y_train.iloc[valid_idxs]

        y_train = y_train.to_numpy()
        y_valid = y_valid.to_numpy()

        text_vectorizer.fit(x_train.essay_text.drop_duplicates())

        vectorized_discourse_text_train = text_vectorizer.transform(x_train.discourse_text)
        vectorized_discourse_text_valid = text_vectorizer.transform(x_valid.discourse_text)

        ohe_discourse_type_train = ohe.fit_transform(x_train.discourse_type.values.reshape(-1, 1))
        ohe_discourse_type_test = ohe.transform(x_valid.discourse_type.values.reshape(-1, 1))

        incorrect_count_train = standard_scaler.fit_transform(x_train['incorrect_words_len'].to_numpy().reshape(-1, 1))
        incorrect_count_valid = standard_scaler.transform(x_valid['incorrect_words_len'].to_numpy().reshape(-1, 1))

        rp_start_train = x_train['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_train = x_train['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_train = x_train['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_train = x_train['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_train = x_train['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        cqt_train = x_train['contains_question'].to_numpy().reshape(-1, 1).astype(np.float32)

        rp_start_valid = x_valid['rp_start'].to_numpy().reshape(-1, 1).astype(np.float32)
        rp_end_valid = x_valid['rp_end'].to_numpy().reshape(-1, 1).astype(np.float32)
        wps_valid = x_valid['words_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        sps_valid = x_valid['syll_per_sentences'].to_numpy().reshape(-1, 1).astype(np.float32)
        spw_valid = x_valid['syll_per_words'].to_numpy().reshape(-1, 1).astype(np.float32)
        cqt_valid = x_valid['contains_question'].to_numpy().reshape(-1, 1).astype(np.float32)

        pos_features_train = pos_vectorizer.fit_transform(x_train.discourse_pos.map(' '.join))
        pos_features_valid = pos_vectorizer.transform(x_valid.discourse_pos.map(' '.join))

        x_train = sparse.hstack([vectorized_discourse_text_train, ohe_discourse_type_train, pos_features_train, incorrect_count_train, rp_start_train, rp_end_train, wps_train, sps_train, spw_train,cqt_train])
        x_valid = sparse.hstack([vectorized_discourse_text_valid, ohe_discourse_type_test, pos_features_valid, incorrect_count_valid,  rp_start_valid, rp_end_valid, wps_valid, sps_valid, spw_valid, cqt_valid])
        
        x_train = x_train.todense()
        x_valid = x_valid.todense()

        model.fit(x_train, y_train)

        prob_predictions = model.predict_proba(x_valid)
        actual_pred = model.predict(x_valid)

        lloss = log_loss(y_valid, prob_predictions)
        losses.append(lloss)

        acc = accuracy_score(y_valid, actual_pred)
        mcc = matthews_corrcoef(y_valid, actual_pred)

        print(f"\tLog loss: {lloss}")
        print(f"\tAccuracy: {acc}")
        print(f"\tMCC: {mcc}")
        
        total_mcc += mcc
        total_acc += acc
        
        print(80*'.')

    mean_mcc = total_mcc / 5
    mean_acc = total_acc / 5

    metrics_dict_6[model_name] = {
        'min_loss': min(losses),
        'max_loss': max(losses),
        'mean_loss': np.mean(losses),
        'mean_acc': mean_acc, 
        'mean_mcc': mean_mcc,

    } 
    
    print()
    print(80*'*')

    print()

    print(f"Min loss \t{min(losses)}")
    print(f"Max loss \t{max(losses)}")
    print(f"Mean loss\t{np.mean(losses)}")

    print()


********************************************************************************
********************************************************************************
HistGradientBoostingClassifier

HistGradientBoostingClassifier()

Fold # 1




	Log loss: 0.7546528345401939
	Accuracy: 0.6612272650008499
	MCC: 0.36306092393405237
................................................................................

Fold # 2




	Log loss: 0.7447359591732509
	Accuracy: 0.6698963114057453
	MCC: 0.38391821296423895
................................................................................

Fold # 3




	Log loss: 0.7534947934089871
	Accuracy: 0.6604896293777627
	MCC: 0.3628508770659367
................................................................................

Fold # 4




	Log loss: 0.748426349226227
	Accuracy: 0.6678000680040802
	MCC: 0.37655232834272073
................................................................................

Fold # 5




	Log loss: 0.7238574134944064
	Accuracy: 0.6793607616456987
	MCC: 0.4017354443621247
................................................................................

********************************************************************************

Min loss 	0.7238574134944064
Max loss 	0.7546528345401939
Mean loss	0.7450334699686131





In [128]:
dicts_of_interest = [metrics_dict, metrics_dict_2, metrics_dict_3, metrics_dict_4, metrics_dict_5, metrics_dict_6]
dicts_name = ['text_only', 'text_shallow', 'text_shallow_pos', 'tfidf_(1,1)', 'tfidf_(1,2)', 'hgbm_(1,1)']
metric_of_interest = 'mean_loss'

collector = []
for idx, dict_of_interest in enumerate(dicts_of_interest):
    for model_name, model_metrics in dict_of_interest.items():
        collector.append((model_name, f"{dicts_name[idx]}", model_metrics[metric_of_interest]))

print(f'{metric_of_interest}')
for i in sorted(collector, key = lambda item: item[2]):
    print(i)
print(80*'*')


mean_loss
('LGBMClassifier', 'tfidf_(1,1)', 0.7428919807782683)
('LGBMClassifier', 'text_shallow_pos', 0.7436512491866434)
('LGBMClassifier', 'tfidf_(1,2)', 0.7446248633838259)
('HistGradientBoostingClassifier', 'hgbm_(1,1)', 0.7450334699686131)
('LGBMClassifier', 'text_only', 0.7520834554471512)
('XGBClassifier', 'tfidf_(1,2)', 0.7537158229963282)
('XGBClassifier', 'tfidf_(1,1)', 0.7549697213605379)
('XGBClassifier', 'text_shallow_pos', 0.7551037099463949)
('XGBClassifier', 'text_only', 0.7602450654668177)
('LogisticRegression', 'tfidf_(1,2)', 0.7606269663104264)
('LogisticRegression', 'tfidf_(1,1)', 0.7631966117322929)
('LogisticRegression', 'text_shallow_pos', 0.7633119986983758)
('LogisticRegression', 'text_only', 0.7653257899281086)
('LGBMClassifier', 'text_shallow', 0.7892973424897693)
('XGBClassifier', 'text_shallow', 0.7952926134977212)
('ExtraTreesClassifier', 'tfidf_(1,1)', 0.8094006927746504)
('ExtraTreesClassifier', 'text_shallow_pos', 0.811773215438004)
('RandomForestClass

In [129]:
metric_of_interest = 'min_loss'

collector = []
for idx, dict_of_interest in enumerate(dicts_of_interest):
    for model_name, model_metrics in dict_of_interest.items():
        collector.append((model_name, f"{dicts_name[idx]}", model_metrics[metric_of_interest]))

print(f'{metric_of_interest}')
for i in sorted(collector, key = lambda item: item[2]):
    print(i)
print(80*'*')

min_loss
('LGBMClassifier', 'tfidf_(1,1)', 0.7225697317441382)
('HistGradientBoostingClassifier', 'hgbm_(1,1)', 0.7238574134944064)
('LGBMClassifier', 'text_shallow_pos', 0.7244428339683451)
('LGBMClassifier', 'tfidf_(1,2)', 0.7260123470604949)
('LGBMClassifier', 'text_only', 0.7331417992302278)
('XGBClassifier', 'tfidf_(1,1)', 0.7370382852622601)
('XGBClassifier', 'tfidf_(1,2)', 0.7376107596580113)
('XGBClassifier', 'text_shallow_pos', 0.7377060311779143)
('XGBClassifier', 'text_only', 0.7425925319916824)
('LogisticRegression', 'tfidf_(1,2)', 0.7461715144799874)
('LogisticRegression', 'text_shallow_pos', 0.7468061674669159)
('LogisticRegression', 'tfidf_(1,1)', 0.7472652309513227)
('LogisticRegression', 'text_only', 0.748670305385369)
('XGBClassifier', 'text_shallow', 0.7635146597786169)
('LGBMClassifier', 'text_shallow', 0.7649250616062437)
('ExtraTreesClassifier', 'text_only', 0.7995769027439527)
('ExtraTreesClassifier', 'tfidf_(1,1)', 0.8044192804091473)
('ExtraTreesClassifier', 't

In [132]:
metric_of_interest = 'mean_mcc'

collector = []
for idx, dict_of_interest in enumerate(dicts_of_interest):
    for model_name, model_metrics in dict_of_interest.items():
        collector.append((model_name, f"{dicts_name[idx]}", model_metrics[metric_of_interest]))

print(f'{metric_of_interest}')
for i in reversed(sorted(collector, key = lambda item: item[2])):
    print(i)
print(80*'*')

mean_mcc
('LGBMClassifier', 'tfidf_(1,2)', 0.3855640878917649)
('LGBMClassifier', 'tfidf_(1,1)', 0.3829213460750664)
('LGBMClassifier', 'text_shallow_pos', 0.38206734125884845)
('HistGradientBoostingClassifier', 'hgbm_(1,1)', 0.37762355733381475)
('LGBMClassifier', 'text_only', 0.37538179632155655)
('XGBClassifier', 'tfidf_(1,1)', 0.37135121025011675)
('LogisticRegression', 'tfidf_(1,2)', 0.3699148075427776)
('XGBClassifier', 'tfidf_(1,2)', 0.36875902319306497)
('XGBClassifier', 'text_shallow_pos', 0.36781170561141396)
('XGBClassifier', 'text_only', 0.3665788499538949)
('LogisticRegression', 'text_shallow_pos', 0.3632610571144278)
('LogisticRegression', 'text_only', 0.36194384592723133)
('LogisticRegression', 'tfidf_(1,1)', 0.361667616886424)
('LGBMClassifier', 'text_shallow', 0.34165372829625285)
('XGBClassifier', 'text_shallow', 0.3343283918893384)
('RandomForestClassifier', 'tfidf_(1,1)', 0.31492069943886036)
('ExtraTreesClassifier', 'tfidf_(1,1)', 0.30986778169577267)
('RandomFores

In [131]:
metric_of_interest = 'mean_acc'

collector = []
for idx, dict_of_interest in enumerate(dicts_of_interest):
    for model_name, model_metrics in dict_of_interest.items():
        collector.append((model_name, f"{dicts_name[idx]}", model_metrics[metric_of_interest]))

print(f'{metric_of_interest}')
for i in reversed(sorted(collector, key = lambda item: item[2])):
    print(i)
print(80*'*')

mean_acc
('LGBMClassifier', 'tfidf_(1,2)', 0.6724466378062576)
('LGBMClassifier', 'tfidf_(1,1)', 0.6710187717501365)
('LGBMClassifier', 'text_shallow_pos', 0.6706446741725463)
('HistGradientBoostingClassifier', 'hgbm_(1,1)', 0.6677548070868273)
('LGBMClassifier', 'text_only', 0.66751662519435)
('XGBClassifier', 'tfidf_(1,1)', 0.6661567863373179)
('LogisticRegression', 'tfidf_(1,2)', 0.6656808097930037)
('XGBClassifier', 'tfidf_(1,2)', 0.665136534403181)
('XGBClassifier', 'text_shallow_pos', 0.6644908597626514)
('XGBClassifier', 'text_only', 0.6641846680102182)
('LogisticRegression', 'text_shallow_pos', 0.6596969073286332)
('LogisticRegression', 'text_only', 0.6592549328244413)
('LogisticRegression', 'tfidf_(1,1)', 0.6588469776995051)
('LGBMClassifier', 'text_shallow', 0.6535428675100075)
('XGBClassifier', 'text_shallow', 0.6484769796709645)
('RandomForestClassifier', 'tfidf_(1,1)', 0.6426968409197532)
('RandomForestClassifier', 'text_shallow_pos', 0.6402828174449944)
('ExtraTreesClassi