# Imports & Data Frame
Setting up imports and functions to process dataframes and features

In [None]:
import nltk
import pandas as pd
import subprocess

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist

# Point NLTK at the pre-downloaded data directory (the notebook runs without internet access)
nltk.data.path.append('/kaggle/input/nltk-data/nltk_data')

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk import punkt

# Read the competition's training essays into the working DataFrame
train_file = (
    '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
)
df = pd.read_csv(filepath_or_buffer=train_file)

# Feature Generation

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

# Text processing algorithm, explained below
def comprehensive_text_preprocessing(text):
    """Normalise an essay into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, drop anything that looks like an HTML tag,
    delete ASCII punctuation, tokenize with NLTK, discard scikit-learn's
    English stop words, then Porter-stem and WordNet-lemmatize each token.

    Args:
        text: Raw essay text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    porter = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()

    # Strip markup before punctuation: once '<' and '>' are deleted the tag
    # names could no longer be told apart from ordinary words.
    without_tags = re.sub(r'<.*?>', '', text.lower())

    # A single translate() pass deletes every character in string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_tags.translate(punctuation_table)

    normalised = []
    for token in word_tokenize(without_punctuation):
        if token in ENGLISH_STOP_WORDS:
            continue
        # The lemmatizer is fed the stem, not the original token
        normalised.append(wordnet_lemmatizer.lemmatize(porter.stem(token)))

    return " ".join(normalised)

# Clean every raw essay once and keep the result next to the original text
df['processed_text'] = df['full_text'].map(comprehensive_text_preprocessing)

# Get length of words as a feature
def calculate_word_lengths(text):
    """Return the character count of every NLTK token in *text*, in order."""
    return list(map(len, word_tokenize(text)))

# Per-essay list of token lengths, computed on the cleaned text
df['word_lengths'] = df['processed_text'].map(calculate_word_lengths)

# Get the average word lengths 
def average_word_length(text):
    """Return the mean length of the whitespace-separated words in *text*.

    Args:
        text: String to measure.

    Returns:
        Total characters across all words divided by the number of words,
        or 0 when *text* contains no words at all.
    """
    tokens = text.split()
    if len(tokens) == 0:
        # Nothing to average; also prevents a division by zero below
        return 0
    return sum(map(len, tokens)) / len(tokens)

# Mean word length of each cleaned essay
df['avg_word_length'] = df['processed_text'].map(average_word_length)

# Essay length, measured in characters of the cleaned text
df['essay_length'] = df['processed_text'].map(len)

# Average the per-essay mean word length within each score; reset_index()
# turns 'score' back into an ordinary column
average_lengths_by_score = (
    df.groupby('score')['avg_word_length']
    .mean()
    .reset_index()
)

# Function to calculate sentence count
def calculate_sentence_count(text):
    """Return how many sentences NLTK's sent_tokenize finds in *text*."""
    return len(sent_tokenize(text))

# Sentences are counted on the raw essay text
df['sentence_count'] = df['full_text'].map(calculate_sentence_count)

# Plot all our data for each feature to visualize trends related to score

def _scatter_against_score(column, label):
    """Scatter df[column] (x-axis) against df['score'] on a new 10x6 figure."""
    plt.figure(figsize=(10, 6))
    plt.scatter(df[column], df['score'], color='blue')
    plt.title(f'Score vs {label}')
    plt.xlabel(label)
    plt.ylabel('Score')
    plt.grid(True)
    plt.show()


_scatter_against_score('essay_length', 'Essay Length')
_scatter_against_score('sentence_count', 'Sentence Count')

# Bar chart: one bar per score, bar height = mean word length at that score
plt.figure(figsize=(10, 5))
plt.bar(
    average_lengths_by_score['score'],
    average_lengths_by_score['avg_word_length'],
    color='skyblue',
)
plt.xlabel('Score')
plt.ylabel('Average Word Length')
plt.title('Average Word Length by Essay Score')
# Put a tick on every score value that is present in the data
plt.xticks(average_lengths_by_score['score'])
plt.show()

Preprocessing cleans up our text input by:
* Lowercasing the text and stripping HTML tags
* Removing punctuation
* Tokenizing the words
* Removing stop words
* Running stemming and lemmatization

Then we generate some potential features such as:
* Essay Length
* Average Word Length
* Word Length
* Number of sentences

We then save the cleaned text to a column called processed_text and also save a column holding the length of each essay to analyze later.

# Word2Vec Model

In [None]:
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec

# Word2Vec expects every "sentence" to be a list of tokens.  Handing it the
# raw strings makes it iterate them character by character, so the model
# would learn one vector per character instead of one per word.
tokenized_essays = [essay.split() for essay in df['processed_text']]
word2vec_model = Word2Vec(
    sentences=tokenized_essays,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

X = df['processed_text']  # Feature
y = df['score']  # Target


def get_average_embedding(text):
    """Return the mean Word2Vec vector of the tokens in *text*.

    Any other code that builds features for a model trained on these
    embeddings must call this helper rather than a private copy, so that
    every essay is tokenised exactly the way the Word2Vec model was trained.

    Args:
        text: An essay that has ALREADY been run through
            comprehensive_text_preprocessing (space-separated tokens).  It is
            deliberately not preprocessed again here: the model's vocabulary
            was built from the processed text as-is, and stemming the stems a
            second time could yield tokens the model never saw.

    Returns:
        A 1-D array of length ``word2vec_model.vector_size``: the element-wise
        mean of the vectors of all in-vocabulary tokens, or all zeros when
        none of the tokens is in the vocabulary.
    """
    vectors = [
        word2vec_model.wv[token]
        for token in text.split()
        if token in word2vec_model.wv
    ]
    if vectors:
        return np.mean(vectors, axis=0)
    # No known token (e.g. empty essay): fall back to a neutral zero vector
    return np.zeros(word2vec_model.vector_size)


X_features = X.apply(get_average_embedding)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features.tolist(), y, test_size=0.2, random_state=42)

After preprocessing every essay with our custom preprocessing function, we drop every token that is not in the vocabulary of the Word2Vec model, which was trained on the processed text.

Then we calculate the average word embedding for each essay. An embedding represents a word as a dense vector of real numbers, where each dimension of the vector represents a feature of the word; averaging the embeddings gives one fixed-length feature vector per essay.

## Linear Regression Model
Fit the model on the **training** split and evaluate it on the held-out split

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Fit an ordinary least squares regressor on the training split
model = LinearRegression().fit(X_train, y_train)

# Score the held-out essays
y_pred = model.predict(X_test)

# Report the mean squared error between the true and the predicted scores
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

# Submission Generator
Running the model on **test** data

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

test_df = pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")

# ---- PreProcessing Our Test Data ----

# We only need this feature to process our data and apply the model.
test_df['processed_text'] = test_df['full_text'].apply(comprehensive_text_preprocessing)

# Using our prior Word2Vec model process our data and make predictions using the model

X = test_df['processed_text']

# Reuse get_average_embedding from the Word2Vec section instead of declaring
# a second copy here: with a single definition the test essays are always
# embedded exactly like the essays the regression model was fitted on.
X_features = X.apply(get_average_embedding)

# Make predictions on the test set
y_pred_lin = model.predict(X_features.tolist())

# A linear regression is unbounded, so a prediction can fall outside the
# range of scores that exist.  Round to whole scores, then clip to the
# lowest/highest score observed in the training data.
lowest_score = df['score'].min()
highest_score = df['score'].max()
final_scores = np.clip(np.round(y_pred_lin), lowest_score, highest_score).astype(int)

# Create a DataFrame to store essay_id and predicted scores
predictions_df = pd.DataFrame({'essay_id': test_df['essay_id'], 'score': final_scores})

# Output predictions to a CSV file (index=False: do not write the row index)
predictions_df.to_csv("/kaggle/working/submission.csv", index=False)

In [None]:
# Read the submission back from disk to check what was actually written
submission_data = pd.read_csv(filepath_or_buffer="/kaggle/working/submission.csv")

# Bare expression: the notebook renders it as the cell's output
submission_data

Analyzing our results we see scores within region from 1-6 and these values make sense based on 