In [3]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from joblib import dump

# Data cleaning function
def clean_text(text):
    """Normalize raw essay text before it is vectorized.

    The text is lowercased, every non-word character (anything outside
    letters, digits and underscore) is replaced by a space, digits are
    deleted, and finally each run of whitespace is collapsed to one space.
    Leading and trailing spaces are not stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = re.sub(r'\W', ' ', text)  # punctuation and all whitespace -> space
    # Digits must be removed *before* whitespace is collapsed; doing it
    # afterwards leaves a double space where a standalone number used to be
    # (e.g. "a 1 b" -> "a  b").
    text = re.sub(r'\d', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Function to load the dataset, train the model, and save it
def train_and_save_model(file_path='training_set_rel3.xlsx', model_path='essay_scoring_model.joblib'):
    """Train a TF-IDF + linear-regression essay scorer and save it to disk.

    Reads an Excel sheet containing ``essay`` and ``domain1_score`` columns,
    cleans the essay text with ``clean_text``, fits the pipeline on 80% of
    the rows, prints the R^2 obtained on the remaining 20%, and serializes
    the fitted pipeline with joblib.

    Note that text cleaning happens outside the saved pipeline, so callers
    that load the model must apply ``clean_text`` to essays before
    predicting.

    Args:
        file_path: Path of the Excel file to read.
        model_path: Destination path for the serialized pipeline.
    """
    data = pd.read_excel(file_path)

    # Drop incomplete rows *before* cleaning: clean_text calls .lower(),
    # which a missing (NaN) essay cell does not support.
    data = data.dropna(subset=['essay', 'domain1_score'])

    # astype(str) guards against cells that Excel parsed as numbers.
    X = data['essay'].astype(str).apply(clean_text)
    y = data['domain1_score'].astype(float)

    # Fixed seed keeps the train/test partition reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = Pipeline([
        ('vectorizer', TfidfVectorizer(min_df=5, max_df=0.95, max_features=3000)),
        ('regressor', LinearRegression()),
    ])
    model.fit(X_train, y_train)

    # Use the held-out split instead of silently discarding it.
    print(f"Held-out R^2: {model.score(X_test, y_test):.4f}")

    dump(model, model_path)
    print(f"Model saved to {model_path}")

# Locations of the input spreadsheet and of the serialized model artifact.
dataset_path = 'training_set_rel3.xlsx'  # change to wherever your copy of the dataset lives
model_save_path = 'essay_scoring_model.joblib'

# Run training; the fitted pipeline is written to model_save_path.
train_and_save_model(file_path=dataset_path, model_path=model_save_path)


Model saved to essay_scoring_model.joblib
