In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from wordcloud import WordCloud
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
vectorizer = TfidfVectorizer()
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import catboost as cb
from sklearn.metrics import mean_squared_error

In [None]:
# Read the three competition CSVs in one pass:
#   data  - training set (later cells use its essay_id, full_text and score columns)
#   data1 - test set
#   data2 - sample submission
data, data1, data2 = map(
    pd.read_csv,
    (
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv',
        '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv',
    ),
)

In [None]:
# Peek at the first rows of the training frame (head() defaults to 5 rows).
data.head()

In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Holds the default stop-word set so the NLTK corpus is read once rather than
# once per essay.
_STOP_WORDS_CACHE = {}


def _default_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Lower-case an essay, drop stop words and remove punctuation.

    The text is split on whitespace (which also covers newlines). Each token is
    compared against the stop-word list *before* its inner punctuation is
    removed, so contractions such as "don't" match their stop-word entry
    instead of slipping through as "dont". Tokens that become a stop word only
    after punctuation removal are dropped as well.

    Parameters
    ----------
    text : str
        Raw essay text. Any non-string value (e.g. None or NaN from a missing
        cell) is treated as empty text.
    stop_words : container of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word list,
        which is loaded once and reused across calls.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _default_stop_words()

    kept = []
    for token in text.lower().split():
        # Strip surrounding punctuation only, keeping inner apostrophes so
        # contractions can still be recognised as stop words.
        core = token.strip(string.punctuation)
        if core in stop_words:
            continue
        cleaned = core.translate(_PUNCT_TABLE)
        # Skip tokens that were pure punctuation or reduce to a stop word.
        if cleaned and cleaned not in stop_words:
            kept.append(cleaned)
    return ' '.join(kept)

In [None]:
def preprocess_dataframe(df):
    """Return a cleaned copy of ``df`` with normalised essay text.

    The ``essay_id`` column is removed when present, and every value in
    ``full_text`` is passed through ``preprocess_text``. The input frame is not
    modified, so the calling cell can be re-run without a ``KeyError`` on the
    already-dropped column and without cleaning the text a second time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``full_text`` column and, optionally, ``essay_id``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``essay_id`` and with ``full_text`` preprocessed.
    """
    # drop() without inplace returns a new frame; errors='ignore' tolerates
    # inputs that carry no essay_id column.
    cleaned = df.drop(columns=['essay_id'], errors='ignore')
    cleaned['full_text'] = cleaned['full_text'].apply(preprocess_text)
    return cleaned

In [None]:
# Run the training frame through the cleaning step.
df = data.pipe(preprocess_dataframe)

In [None]:
# Inspect the first rows after preprocessing (head() defaults to 5 rows).
df.head()

In [None]:
# Concatenate every essay into one corpus string; the next cell reuses `text`.
text = ' '.join(df['full_text'])

# Build the word-cloud image from the corpus and draw it with the axes hidden.
wordcloud = WordCloud(width=800, height=400, background_color='black').generate(text)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Count every whitespace-separated token in the corpus and keep the ten most frequent.
words = text.split()
top_10_words = Counter(words).most_common(10)
# Unzip the (word, count) pairs into two parallel tuples for plotting.
top_words, word_counts = zip(*top_10_words)

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(top_words, word_counts, color='skyblue')
ax.set_xlabel('Words')
ax.set_ylabel('Counts')
ax.set_title('Top 10 Most Common Words')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# Write each count just above the top of its bar, centred horizontally.
for bar, count in zip(bars, word_counts):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.1
    ax.text(label_x, label_y, str(count), ha='center', va='bottom')

plt.show()

In [None]:
# Model input is the cleaned essay text; the target is the essay score.
x, y = df['full_text'], df['score']

In [None]:
# Learn the TF-IDF vocabulary from the essays in `x` and encode them as a feature matrix.
# NOTE(review): the fit uses every essay in `x`, i.e. it happens before any
# train/test split is made.
X = vectorizer.fit_transform(x)

In [None]:
# Split the raw essays first, then fit TF-IDF on the training portion only.
# Fitting the vectorizer on the full corpus would let the held-out essays
# shape the vocabulary and IDF weights, leaking test-set statistics into the
# evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train = vectorizer.fit_transform(x_train)
# The held-out essays are only transformed with the training vocabulary.
X_test = vectorizer.transform(x_test)

In [None]:
# One seed for every estimator that accepts one, so repeated runs give the same
# scores (same value as the random_state passed to train_test_split).
MODEL_SEED = 42

# Candidate regressors keyed by the display name used in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "ElasticNet Regression": ElasticNet(),
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "XGBoost": XGBRegressor(random_state=MODEL_SEED),
    "LightGBM": LGBMRegressor(random_state=MODEL_SEED),
    # silent=True hides CatBoost's training log output
    "CatBoost": cb.CatBoostRegressor(silent=True, random_seed=MODEL_SEED),
}

In [None]:
# Fit each candidate on the training split and keep its predictions for the
# held-out split, keyed by model name.
predictions = {}
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    predictions[name] = model.predict(X_test)

In [None]:
# Root-mean-squared error of each model's held-out predictions, keyed by model name.
rmse_values = {}
for name, prediction in predictions.items():
    mse = mean_squared_error(y_test, prediction)
    rmse = np.sqrt(mse)
    rmse_values[name] = rmse

In [None]:
# Tabulate the scores: one row per model, name and RMSE as parallel columns.
df_result = pd.DataFrame({
    'Model': list(rmse_values.keys()),
    'RMSE': list(rmse_values.values()),
})

In [None]:
# Display the per-model RMSE table.
df_result