# Summary

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np 

import textstat

from imp import *

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import cohen_kappa_score

import torch

from gensim.models import Word2Vec

import pickle

import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb


  x = re.sub("@\w+", '', x)  # Remove mentions
  x = re.sub("'\d+", '', x)  # Remove contractions
  x = re.sub("\d+", '', x)  # Remove digits
  x = re.sub("http\w+", '', x)  # Remove URLs


# Data Preprocessing and Feature extraction

I took both the datasets then scaled their scores from 0 to 10 and combined both of these datasets with 3 columns in the combined dataset being the "essay_id", "full_text" and "score".

In [2]:
data1 = pd.read_csv("Datasets/train.csv")
data2 = pd.read_csv("Datasets/old_compdata.csv")
data2 = data2[["essay_id", "essay","final_score"]].copy()

data2.rename(columns={"essay":"full_text", "final_score":"score"}, inplace=True)
data1["score"] = data1["score"].apply(score_normalise)
combined_data = pd.concat([data1, data2], axis=0)
combined_data = combined_data.reset_index(drop=True)
combined_data

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,5
1,000fe60,I am a scientist at NASA that is discussing th...,5
2,001ab80,People always wish they had the same technolog...,7
3,001bdc0,"We all heard about Venus, the planet without a...",7
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",5
...,...,...,...
30278,21626,In most stories mothers and daughters are eith...,6
30279,21628,I never understood the meaning laughter is the...,5
30280,21629,"When you laugh, is out of habit, or is cause? ...",7
30281,21630,"Trippin' on fences I am years young, and in th...",7


The  function "dataPreprocessing" converts the text into lowercase, removes all the punctions, html tags, urls and much more, it also expands contractions like converting can't to cannot

In [3]:
#Apply text preprocessing
combined_data['cleaned_essay_text'] = combined_data['full_text'].apply(dataPreprocessing)


Here I have extracted some self implemented features

In [4]:

#Apply feature extraction
combined_data['word_count'], combined_data['avg_word_length'], combined_data['spell_error'], combined_data['sent_count'], combined_data['avg_sent_length'], combined_data['para_count'], combined_data['avg_para_length'] = zip(*combined_data['full_text'].apply(extract_features))

combined_data

Unnamed: 0,essay_id,full_text,score,cleaned_essay_text,word_count,avg_word_length,spell_error,sent_count,avg_sent_length,para_count,avg_para_length
0,000d118,Many people have car where they live. The thin...,5,many people have car where they live the thing...,545,4.007339,54,13,38.307692,1,13.0
1,000fe60,I am a scientist at NASA that is discussing th...,5,i am a scientist at nasa that is discussing th...,371,3.617251,34,23,15.809524,5,4.6
2,001ab80,People always wish they had the same technolog...,7,people always wish they had the same technolog...,605,4.178512,43,24,22.916667,4,6.0
3,001bdc0,"We all heard about Venus, the planet without a...",7,we all heard about venus the planet without al...,511,4.405088,48,24,21.571429,5,4.8
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",5,dear state senator this is a letter to argue i...,418,4.354067,41,15,23.437500,6,2.5
...,...,...,...,...,...,...,...,...,...,...,...
30278,21626,In most stories mothers and daughters are eith...,6,in most stories mothers and daughters are eith...,894,3.627517,74,26,30.692308,1,26.0
30279,21628,I never understood the meaning laughter is the...,5,i never understood the meaning laughter is the...,596,3.466443,62,39,13.666667,1,39.0
30280,21629,"When you laugh, is out of habit, or is cause? ...",7,when you laugh is out of habit or is cause wha...,883,3.853907,100,52,17.568182,1,52.0
30281,21630,"Trippin' on fences I am years young, and in th...",7,trippin on fences i am years young and in thos...,641,3.578783,73,39,15.083333,1,39.0


Here I have extracted features using textstat library

In [5]:
combined_data['textstat_features'] = combined_data['cleaned_essay_text'].apply(textstat_features)
train_textstat = pd.DataFrame(combined_data['textstat_features'].tolist())

# Ensure the indices are unique
combined_data = combined_data.reset_index(drop=True)
train_textstat = train_textstat.reset_index(drop=True)

#making final dataset
final_data = pd.concat([combined_data, train_textstat], axis=1)
final_data = final_data.drop(columns=["textstat_features", "full_text","essay_id"])
final_data

Unnamed: 0,score,cleaned_essay_text,word_count,avg_word_length,spell_error,sent_count,avg_sent_length,para_count,avg_para_length,flesch_reading_ease,automated_readability_index,difficult_words,text_standard,reading_time,syllable_count
0,5,many people have car where they live the thing...,545,4.007339,54,13,38.307692,1,13.0,-402.53,244.8,62,192.0,31.05,627
1,5,i am a scientist at nasa that is discussing th...,371,3.617251,34,23,15.809524,5,4.6,-234.72,164.3,24,0.0,19.04,401
2,7,people always wish they had the same technolog...,605,4.178512,43,24,22.916667,4,6.0,-469.86,274.6,66,0.0,36.14,767
3,7,we all heard about venus the planet without al...,511,4.405088,48,24,21.571429,5,4.8,-369.72,223.1,80,13.0,31.91,681
4,5,dear state senator this is a letter to argue i...,418,4.354067,41,15,23.437500,6,2.5,-297.65,186.9,58,0.0,25.93,560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30278,6,in most stories mothers and daughters are eith...,894,3.627517,74,26,30.692308,1,26.0,-705.68,396.5,48,0.0,45.91,986
30279,5,i never understood the meaning laughter is the...,596,3.466443,62,39,13.666667,1,39.0,-419.45,255.1,27,0.0,29.00,607
30280,7,when you laugh is out of habit or is cause wha...,883,3.853907,100,52,17.568182,1,52.0,-676.58,379.6,84,297.0,47.58,1001
30281,7,trippin on fences i am years young and in thos...,641,3.578783,73,39,15.083333,1,39.0,-452.94,272.3,47,0.0,32.16,686


Saving the final dataset

In [6]:
final_data.to_csv("Datasets/final_data.csv", index = False)

In [7]:
final_data = pd.read_csv("Datasets/final_data.csv")
final_data

Unnamed: 0,score,cleaned_essay_text,word_count,avg_word_length,spell_error,sent_count,avg_sent_length,para_count,avg_para_length,flesch_reading_ease,automated_readability_index,difficult_words,text_standard,reading_time,syllable_count
0,5,many people have car where they live the thing...,545,4.007339,54,13,38.307692,1,13.0,-402.53,244.8,62,192.0,31.05,627
1,5,i am a scientist at nasa that is discussing th...,371,3.617251,34,23,15.809524,5,4.6,-234.72,164.3,24,0.0,19.04,401
2,7,people always wish they had the same technolog...,605,4.178512,43,24,22.916667,4,6.0,-469.86,274.6,66,0.0,36.14,767
3,7,we all heard about venus the planet without al...,511,4.405088,48,24,21.571429,5,4.8,-369.72,223.1,80,13.0,31.91,681
4,5,dear state senator this is a letter to argue i...,418,4.354067,41,15,23.437500,6,2.5,-297.65,186.9,58,0.0,25.93,560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30278,6,in most stories mothers and daughters are eith...,894,3.627517,74,26,30.692308,1,26.0,-705.68,396.5,48,0.0,45.91,986
30279,5,i never understood the meaning laughter is the...,596,3.466443,62,39,13.666667,1,39.0,-419.45,255.1,27,0.0,29.00,607
30280,7,when you laugh is out of habit or is cause wha...,883,3.853907,100,52,17.568182,1,52.0,-676.58,379.6,84,297.0,47.58,1001
30281,7,trippin on fences i am years young and in thos...,641,3.578783,73,39,15.083333,1,39.0,-452.94,272.3,47,0.0,32.16,686


# Vectorization

This is a word level analysis using tf-idf

In [8]:
final_data["tokenized_text"] = final_data["cleaned_essay_text"].apply(word_tokenize)

from gensim.models import Word2Vec
w2v_model = Word2Vec(final_data["tokenized_text"], vector_size=1500, window=5, min_count=1, workers=4)

vect = np.array([get_avg_w2v_vector(essay, w2v_model) for essay in final_data["tokenized_text"]])

In [9]:
additional_features = np.array(final_data.drop(columns=['score', 'cleaned_essay_text', 'tokenized_text']))
X = np.hstack((vect, additional_features))
y = final_data["score"]

In [10]:
import joblib

joblib.dump(w2v_model, 'word2vec.pkl', compress=True)

['word2vec.pkl']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LGBM Model

In [12]:
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [13]:
# Assuming 5 folds and a random seed of 42 for reproducibility
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
models_lgbm = []

# Define parameters
params = {  
        'metrics': 'None',
        'learning_rate': 0.05,
        'max_depth': 5,
        'num_leaves': 10, # should be a number smaller than "max_depth"^2
        'colsample_bytree': 0.3,
        'min_data_in_leaf': 100,
        'reg_alpha': 0.7,
        'reg_lambda' : 0.1,
        'n_estimators': 700,
        'extra_trees' : True,
        'verbosity': -100}

for train_idx, val_idx in folds.split(X_train, y_train):
    # Create LightGBM datasets for training and validation folds
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    train_data_fold = lgb.Dataset(X_train_fold, label=y_train_fold.values)
    val_data_fold = lgb.Dataset(X_val_fold, label=y_val_fold.values, reference=train_data_fold)
    model1 = lgb.train(params, train_data_fold, valid_sets=[train_data_fold, val_data_fold])
    models_lgbm.append(model1)

    # Train the model on the current fold's training data
    



In [14]:
probabilities_lgbm = []
for model in models_lgbm:
    # Predict the probabilities for the test features using the selected features
    proba_lgbm = model.predict(X_test, num_iteration=model.best_iteration)
    probabilities_lgbm.append(proba_lgbm)

In [29]:
predictions_lgbm = np.mean(probabilities_lgbm, axis=0)
predictions_lgbm_clip = np.round(predictions_lgbm.clip(0, 10))

In [30]:
qwk2 = cohen_kappa_score(y_test, predictions_lgbm_clip, weights='quadratic')
qwk2

0.8068087350411762

In [17]:
with open('models_lgbm.pkl', 'wb') as f:
    pickle.dump(models_lgbm, f)


# XGBoost

In [18]:
models_xgb = []


for train_idx, val_idx in folds.split(X_train, y_train):
    # Create LightGBM datasets for training and validation folds
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    model2 = xgb.XGBRegressor(objective = 'reg:squarederror',
                eval_metric = 'rmse',
                learning_rate = 0.05,
                max_depth = 5,
                subsample = 0.8,
                min_child_weight = 5,
                n_estimators=1000,
                random_state=42,
                verbosity=0)
    model2.fit(X_train_fold, y_train_fold)
    models_xgb.append(model2)

In [19]:
probabilities_xgb = []
for model in models_xgb:
    # Predict the probabilities for the test features using the selected features
    proba_xgb = model.predict(X_test)
    probabilities_xgb.append(proba_lgbm)

In [31]:
predictions_xgb = np.mean(probabilities_xgb, axis=0)
predictions_xgb_clip = np.round(predictions_xgb.clip(0, 10))

In [32]:
qwk2 = cohen_kappa_score(y_test, predictions_xgb_clip, weights='quadratic')
qwk2

0.8036241351152345

In [39]:
with open('models_xgb.pkl', 'wb') as f:
    pickle.dump(models_xgb, f)

# Ensemble

In [34]:
prediction_final = np.array((predictions_lgbm, predictions_xgb))
prediction_final = np.mean(prediction_final, axis=0)

In [37]:
prediction_final_clip = np.round(prediction_final.clip(0, 10))

In [38]:
qwk2 = cohen_kappa_score(y_test, prediction_final_clip, weights='quadratic')
qwk2

0.8047519022416539