In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Learning Agency Lab - Automated Essay Scoring 2.0
* Data Exploration
* Data Setup
* Baseline Model Training
* Submission

# Loading Data

In [None]:
# Locations of the competition's train and test CSV files
train_csv = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
test_csv = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'

# Read both files into DataFrames
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

# Data Exploration

In [None]:
# Overview of both frames (train first, then test)
for frame in (train_df, test_df):
    frame.info()

In [None]:
# Column datatypes: train first, then test
print(train_df.dtypes, test_df.dtypes, sep='\n')


In [None]:
# Leading rows of each frame: train first, then test
print(train_df.head(), test_df.head(), sep='\n')


In [None]:
# Summary statistics: train first, then test
print(train_df.describe(), test_df.describe(), sep='\n')


In [None]:
# Missing-value count per column: train first, then test
print(train_df.isna().sum(), test_df.isna().sum(), sep='\n')


In [None]:
# Number of distinct values per column: train first, then test
print(train_df.nunique(), test_df.nunique(), sep='\n')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# White-grid look for the plot
sns.set_style("whitegrid")

# Bar chart with the number of training rows per score value
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='score', data=train_df, palette='plasma', ax=ax)

# Title and axis labels
ax.set_title('Distribution of Scores in Train Dataset', fontsize=16)
ax.set_xlabel('Score', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

# Render the figure
plt.show()


# Text Preprocessing

In [None]:
import re

class TextPreprocessor:
    """Small text-cleaning steps for essay text.

    Only lower-casing is active in ``preprocess``; the other steps exist as
    methods but are not part of the pipeline.
    """

    # Anything that is not a-z, A-Z, 0-9 or whitespace
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')
    # One or more consecutive whitespace characters (newlines included)
    _WHITESPACE_RUN = re.compile(r'\s+')

    def to_lower(self, text):
        """Return ``text`` converted to lower case."""
        return text.lower()

    def remove_special_chars(self, text):
        """Return ``text`` with every non-alphanumeric, non-whitespace character removed."""
        return self._NON_ALNUM.sub('', text)

    def remove_new_lines_whitespace(self, text):
        """Collapse each whitespace run to a single space and trim both ends."""
        return self._WHITESPACE_RUN.sub(' ', text).strip()

    def preprocess(self, text):
        """Run the active cleaning steps on ``text`` and return the result.

        Special-character removal and whitespace normalisation are
        intentionally not applied here; only lower-casing is.
        """
        return self.to_lower(text)

In [None]:
# Testing preprocessor
# Smoke test: run the preprocessor on one short sample and print the result.
# The `preprocessor` instance created here is reused by the later cell that
# applies preprocessing to the DataFrames.
preprocessor = TextPreprocessor()

# Sample mixing upper case, punctuation and special characters
sample_text = "Hello!!! This is an example: test for Pre-processing, with sp&cial char@cters."
processed_text = preprocessor.preprocess(sample_text)
print(processed_text)

In [None]:
# Apply preprocessor to the essay text of both frames, overwriting the column
for frame in (train_df, test_df):
    frame['full_text'] = frame['full_text'].apply(preprocessor.preprocess)

In [None]:
# Preprocessed training text (prints the whole 'full_text' Series, not just its first rows)
print(train_df['full_text'])

In [None]:
# Preprocessed test text (the whole 'full_text' Series)
print(test_df['full_text'])

# Feature Engineering

## Generating 4 New Feature Columns

In [None]:
class TextFeatureExtractor:
    """Compute simple surface statistics from a piece of text.

    A "word" is any whitespace-separated token (``str.split()``), so
    punctuation attached to a token counts as part of that word.
    """

    def word_count(self, text):
        """Return the number of whitespace-separated tokens in ``text``."""
        return len(text.split())

    def sentence_count(self, text):
        """Return the number of '.'-separated segments in ``text``.

        This is a rough proxy for the sentence count: it equals the number
        of periods plus one, so an empty string yields 1 and text ending in
        a period counts one trailing empty segment.
        """
        return len(text.split('.'))

    def ave_word_length(self, text):
        """Return the average length of words in a text (0 if there are none)."""
        words = text.split()
        if not words:
            return 0
        return sum(map(len, words)) / len(words)

    def lexical_diversity(self, text):
        """
        Return the lexical diversity of a text.
        Lexical diversity is a measure of how many different words are used in a text.
        It's calculated as the ratio of the number of unique words to the total number of words in the text.
        Returns 0 when the text contains no words.
        """
        words = text.split()
        if not words:
            return 0
        return len(set(words)) / len(words)

    def extract_features(self, text):
        """Return all four features of ``text`` as a dict keyed by feature name.

        Each key matches the name of the method that computes it.
        """
        return {
            'word_count': self.word_count(text),
            'sentence_count': self.sentence_count(text),
            # The method is named ave_word_length; calling `avg_word_length`
            # raised AttributeError.
            'ave_word_length': self.ave_word_length(text),
            'lexical_diversity': self.lexical_diversity(text),
        }
        

In [None]:
def apply_text_features(df, text_column):
    """Add one column per engineered text feature to ``df``.

    Args:
        df: DataFrame to extend; the new columns are written onto it directly.
        text_column: Name of the column in ``df`` that holds the text.
    """
    extractor = TextFeatureExtractor()
    feature_names = ('word_count', 'sentence_count', 'ave_word_length', 'lexical_diversity')
    for name in feature_names:
        # Each feature name is also the name of the extractor method computing it.
        compute = getattr(extractor, name)
        df[name] = df[text_column].apply(compute)

# Engineer the features for the training frame, then for the test frame
apply_text_features(train_df, 'full_text')
apply_text_features(test_df, 'full_text')

In [None]:
# Display the training frame, which now carries the engineered feature columns
train_df

In [None]:
# Display the test frame, which now carries the engineered feature columns
test_df

## Visualizing Each Feature

In [None]:
# Summary statistics of the four engineered features on the training frame
engineered_columns = ['word_count', 'sentence_count', 'ave_word_length', 'lexical_diversity']
print(train_df[engineered_columns].describe())


In [None]:
import matplotlib.pyplot as plt

# One histogram per engineered feature, laid out on a 2x2 grid
feature_titles = ['Word Count', 'Sentence Count', 'Average Word Length', 'Lexical Diversity']
feature_columns = ['word_count', 'sentence_count', 'ave_word_length', 'lexical_diversity']

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for panel, column, title in zip(axes.flat, feature_columns, feature_titles):
    panel.hist(train_df[column], bins=20, color='blue', alpha=0.7)
    panel.set_title(title)

plt.tight_layout()
plt.show()
    


## Text Preparation

### Scaling the features

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, cohen_kappa_score
from sklearn.model_selection import KFold, GridSearchCV

# Isolate the essay_id for the submission file
# NOTE(review): test_essay_ids is not referenced again in the visible notebook;
# the submission cell reads test_df['essay_id'] directly.
test_essay_ids = test_df['essay_id']

# Exclude 'essay_id' from features
# Only the four engineered numeric columns are scaled.
features_scale = ['word_count', 'sentence_count', 'ave_word_length', 'lexical_diversity']
scaler = StandardScaler()
# Scaling
# The scaler is fitted on the training frame only; the test frame is then
# transformed with that same fitted scaler. Both frames are overwritten.
train_df[features_scale] = scaler.fit_transform(train_df[features_scale])
test_df[features_scale] = scaler.transform(test_df[features_scale]) # don't fit


In [None]:
#print(train_df)
#print(test_df)

### Text Vectorization

TF-IDF (Term Frequency-Inverse Document Frequency)

In [None]:
# Vectorize text data
# The vectorizer is limited to max_features=5000 and is fitted on the training
# text only; the test text is transformed with the already-fitted vectorizer.
tfid_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfid_vectorizer.fit_transform(train_df['full_text'])
X_test_tfidf = tfid_vectorizer.transform(test_df['full_text'])  # Use transform here, don't fit

# Convert scaled features into sparse format
# (so they can be stacked next to the TF-IDF matrices below)
features_train = csr_matrix(train_df[features_scale])
features_test = csr_matrix(test_df[features_scale])

# Combine TF-IDF features with additional features
# Column order: TF-IDF columns first, then the engineered features.
X_train_combined = hstack([X_train_tfidf, features_train])
X_test_combined = hstack([X_test_tfidf, features_test])

# Labels
# Raw scores as stored in train_df; the modeling cell derives its own shifted copy.
y_train = train_df['score']


In [None]:
#print(X_train_combined)
#print(X_test_combined)

## Modeling
### Baseline Model with LGBM
I chose LightGBM for its efficiency on multiclass classification tasks with large datasets, and paired it with Optuna for hyperparameter optimization. Optuna runs a series of trials, each of which trains a model with a sampled set of hyperparameters and scores it on a held-out validation split (here with the quadratic weighted Cohen's kappa); the best-scoring configuration is then used to train the final model. Together they give a baseline that is both strong and practical to train.

In [None]:
%%time

## Optuna version
import optuna
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

# Zero-based class labels for LightGBM: scores are shifted down by one here and
# shifted back up after prediction.
y_train_xg = train_df['score'] - 1

# Hold-out split for validation. It does not depend on the trial, so it is
# built once here instead of being rebuilt inside every call to objective().
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_combined, y_train_xg, test_size=0.2, random_state=42
)

def objective(trial):
    """Train one LightGBM model with trial-suggested hyperparameters.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Quadratic weighted Cohen's kappa of the model's predictions on the
        hold-out split (higher is better).
    """
    # Suggest values for the hyperparameters
    param = {
        'objective': 'multiclass',  # Specify multiclass classification
        'metric': 'multi_logloss',  # Suitable for multiclass classification
        'num_class': len(np.unique(y_train_xg)),  # Number of classes
        'verbosity': -1,
        'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'force_row_wise': True
    }

    # Create a LightGBM classifier with suggested parameters
    clf = LGBMClassifier(**param)
    clf.fit(X_train_split, y_train_split)
    preds = clf.predict(X_val_split)

    # Use Cohen's Kappa Score for evaluation
    return cohen_kappa_score(y_val_split, preds, weights='quadratic')

# Create a study object and specify the optimization direction.
# The sampler is seeded so that re-running the notebook explores the same trials.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)  # adjust the number of trials

print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
%%time

# Train the final model with the best parameters
# trial.params holds only the values suggested during the search, so the fixed
# settings used in objective() are re-applied here to train the final model
# with the same configuration that was evaluated. A new dict is built so the
# dict owned by the best trial is left untouched.
best_params = {
    **trial.params,
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'force_row_wise': True,
}
print(best_params)
final_model = LGBMClassifier(**best_params)
# Fit on the full training matrix with the zero-based labels
final_model.fit(X_train_combined, y_train_xg)

In [None]:
# Predict zero-based class ids for the test set and shift them back up by one
# to undo the label shift applied before training
y_test_pred = final_model.predict(X_test_combined) + 1

# Submission table: one predicted score per essay_id
submission_df = pd.DataFrame({'essay_id': test_df['essay_id'], 'score': y_test_pred})

print("DataFrame shape:", submission_df.shape)
print("DataFrame sample:", submission_df)

In [None]:
# Create a submission file
# Written as 'submission.csv' in the current working directory, without the
# DataFrame index column.
submission_df.to_csv('submission.csv', index=False)
# List the working directory so the written file can be confirmed by eye
print("Files in current directory:", os.listdir('.'))