In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob

In [None]:
# The cells visible here only read lemmas and lexical flags (stop word /
# punctuation) from parsed docs, so the dependency parser and NER are
# disabled to make every nlp() call cheaper.
# NOTE(review): re-enable them if other cells need doc.sents, dependencies
# or named entities.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Read the training CSV from the Kaggle input directory and report its (rows, columns).
train_csv_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv'
train_df = pd.read_csv(train_csv_path)
print("DataFrame shape:", train_df.shape)

In [None]:
# Preview the first rows. A bare last expression lets the notebook render the
# DataFrame with its rich table display instead of the plain-text repr.
train_df.head()

In [None]:
# Examine the data types and missing values
print("\nData types and missing values:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_df.info()

In [None]:
# Tally how many rows carry each score value (most frequent first)
score_counts = train_df['score'].value_counts()
print("\nEssay score distribution:")
print(score_counts)

In [None]:
# Bar chart of the number of rows per score value
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_df, x='score', ax=ax)
ax.set_title("Essay Score Distribution")
ax.set_xlabel("Score")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Count, mean, spread and quartiles of the score column
score_summary = train_df['score'].describe()
print("Essay score summary statistics:", score_summary, sep="\n")

In [None]:
# Essay length in characters, computed with the vectorized .str accessor;
# a missing essay yields NaN here rather than raising an exception.
train_df['essay_length'] = train_df['full_text'].str.len()
print("\nEssay length distribution:")
print(train_df['essay_length'].describe())

In [None]:
# Histogram of essay lengths using 50 bins
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=train_df, x='essay_length', bins=50, ax=ax)
ax.set(title="Essay Length Distribution", xlabel="Length", ylabel="Count")
plt.show()

In [None]:
def preprocess_text(text):
    """Normalize an essay into a space-separated string of lemmas.

    The text is lower-cased, every character other than ASCII letters,
    digits and whitespace is deleted, and the result is passed through the
    module-level spaCy pipeline ``nlp``. Stop words, punctuation tokens and
    whitespace-only tokens are dropped; the lemmas of the remaining tokens
    are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw essay text. Any non-string value (e.g. ``None`` or the NaN that
        pandas uses for a missing cell) is treated as an empty essay.

    Returns
    -------
    str
        The cleaned, lemmatized text; ``''`` when no token survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Characters are deleted, not replaced by a space, so "don't" becomes
    # "dont" and "well-known" becomes "wellknown".
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    doc = nlp(text)
    # Whitespace tokens (runs of spaces/newlines) are neither stop words nor
    # punctuation; without the is_space check their "lemma" would be joined
    # into the result as stray blanks and line breaks.
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(tokens)

In [None]:
# Clean and lemmatize every essay, then count the whitespace-separated tokens
# that remain in each cleaned essay.
cleaned_essays = train_df['full_text'].map(preprocess_text)
train_df['preprocessed_text'] = cleaned_essays
train_df['word_count'] = cleaned_essays.map(lambda cleaned: len(cleaned.split()))

In [None]:
# Identify the most frequent words
vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(train_df['preprocessed_text'])

# Column sums of the document-term matrix give each term's corpus-wide count.
term_totals = np.asarray(word_counts.sum(axis=0)).ravel()
word_freq = pd.DataFrame(
    {'Frequency': term_totals},
    index=vectorizer.get_feature_names_out(),
).sort_values(by='Frequency', ascending=False)

print("Most frequent words:")
print(word_freq.head(10))

In [None]:
# Word cloud built from all preprocessed essays joined into one string
corpus_text = ' '.join(train_df['preprocessed_text'])
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(corpus_text)

fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud")
plt.show()

In [None]:
# Scatter of preprocessed word count against score
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=train_df, x='word_count', y='score', ax=ax)
ax.set(title="Essay Length vs. Score", xlabel="Word Count", ylabel="Score")
plt.show()

In [None]:
# Correlation (pandas' default method) between preprocessed word count and score
lengths, scores = train_df['word_count'], train_df['score']
correlation = lengths.corr(scores)
print(f"\nCorrelation between essay length and score: {correlation:.2f}")

In [None]:
# Topic modeling: fit a 5-component LDA on the bag-of-words counts, with a fixed seed
lda = LatentDirichletAllocation(random_state=42, n_components=5)
lda.fit(X=word_counts)

In [None]:
# Collect the 10 highest-weighted vocabulary terms of every LDA topic,
# strongest first. The vocabulary array is fetched once up front instead of
# being rebuilt for every keyword lookup.
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
topic_keywords = [
    # argsort is ascending, so reverse it and keep the leading n_top_words indices.
    feature_names[topic.argsort()[::-1][:n_top_words]].tolist()
    for topic in lda.components_
]

In [None]:
# Print one 1-based numbered line of comma-separated keywords per topic
print("\nTopic modeling results:")
for topic_number, keywords in enumerate(topic_keywords, start=1):
    print(f"Topic {topic_number}: {', '.join(keywords)}")

In [None]:
# Sentiment analysis: TextBlob polarity of each raw (unprocessed) essay
def _essay_polarity(essay):
    """Return the TextBlob sentiment polarity of one essay."""
    return TextBlob(essay).sentiment.polarity

train_df['sentiment'] = train_df['full_text'].map(_essay_polarity)
print("\nSentiment analysis results:")
print(train_df['sentiment'].describe())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the preprocessed essays, the target is the score;
# hold out 20% of the rows for validation with a fixed seed.
X, y = train_df['preprocessed_text'], train_df['score']
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = split_parts

In [None]:
# Learn the TF-IDF vocabulary and weights on the training split only, then
# project the validation split with that same fitted vectorizer.
tfidf = TfidfVectorizer()
X_train_tfidf, X_val_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_val)

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Initialize CatBoostClassifier
# Train on GPU when CatBoost reports at least one usable device; otherwise
# fall back to CPU so the notebook still runs on a CPU-only session.
from catboost.utils import get_gpu_device_count

task_type = "GPU" if get_gpu_device_count() > 0 else "CPU"
if task_type == "CPU":
    print("No GPU visible to CatBoost; training on CPU instead.")

catboost_model = CatBoostClassifier(
    iterations=1000,
    task_type=task_type,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    random_seed=42,
    verbose=True
)

In [None]:
# Fit the classifier on the TF-IDF features of the training split.
# NOTE(review): X_val_tfidf / y_val are not passed here, so in the cells shown
# the validation split neither guides nor evaluates training — confirm intended.
catboost_model.fit(X=X_train_tfidf, y=y_train)

In [None]:
import joblib

# Write the trained CatBoost model to the working directory
model_path = 'catboost_model.cbm'
catboost_model.save_model(model_path)

# Write the fitted TF-IDF vectorizer next to it
vectorizer_path = 'tfidf_vectorizer.pkl'
joblib.dump(tfidf, vectorizer_path)