# Question Difficulty Classification Model Training

This notebook trains a Logistic Regression model to classify question difficulty based on the question body text.

In [None]:
import os
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

## 1. Load Data

In [None]:
# Read the processed questions CSV into a DataFrame and report its size.
data_path = '../data/processed/processed_questions.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")
# Last expression of the cell: shows the first rows in the notebook output.
df.head()

## 2. Preprocessing

We only need `question_body` and `difficulty_label`.

In [None]:
# Keep only the text and label columns. The explicit .copy() makes the result
# an independent DataFrame, so adding a column to it afterwards does not
# raise pandas' SettingWithCopyWarning.
df = df[['question_body', 'difficulty_label']].copy()

def clean_text(text):
    """Normalize a raw question body into lowercase, letters-only text.

    Args:
        text: Raw question body, possibly containing HTML markup. Any
            non-string value is treated as empty.

    Returns:
        The cleaned string: HTML tags removed, every character that is not
        an ASCII letter replaced by a space, lowercased, and stripped of
        leading/trailing whitespace. Returns "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Replace tags with a space rather than nothing, so that words separated
    # only by markup (e.g. "one<br>two") are not fused into a single token.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Digits, punctuation and any other non-letter characters become spaces.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.lower().strip()

# Run every question body through clean_text and store the result in a new
# 'clean_body' column.
cleaned_bodies = df['question_body'].apply(clean_text)
df['clean_body'] = cleaned_bodies
# Last expression of the cell: shows the first rows in the notebook output.
df.head()

## 3. Vectorization & Split

In [None]:
# Features are the cleaned text; targets are the difficulty labels.
X, y = df['clean_body'], df['difficulty_label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Bag-of-words counts with English stop words excluded. The vocabulary is
# learned from the training split only and then reused for the test split.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# vocabulary_ maps each learned term to its column index, so its length is
# the number of features.
vocab_size = len(vectorizer.vocabulary_)
print(f"Vocabulary size: {vocab_size}")

## 4. Model Training

In [None]:
# Fit a logistic-regression classifier on the training counts, allowing the
# solver up to 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Predict on the held-out split and report overall accuracy.
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.4f}")

## 5. Evaluation

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Use the model's own class ordering for both the matrix and the tick labels
# so that rows and columns line up with the names shown on the axes.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 6. Save Model

In [None]:
# Persist the fitted model together with the vectorizer it was trained with.
os.makedirs('../models', exist_ok=True)
artifacts = (
    (model, '../models/model.joblib'),
    (vectorizer, '../models/vectorizer.joblib'),
)
for fitted_object, target_path in artifacts:
    joblib.dump(fitted_object, target_path)
print("Model and Vectorizer saved successfully!")