### Import Libraries

In [19]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

### Importing Dataset

In [20]:
# Load the dataset, report its column names, then preview the first 5 rows.
data = pd.read_csv("../data/test.csv")
print(data.columns.tolist())
# The preview must be the cell's final expression: Jupyter renders only the
# value of the last expression, so a bare head() call placed earlier in the
# cell is silently discarded.
data.head()


['submission_id', 'topic', 'difficulty_1to5', 'steps_count', 'steps_completeness', 'reasoning_quality', 'method_appropriateness', 'representation_use', 'explanation_clarity', 'units_handling', 'edge_case_handling', 'language_quality', 'computational_errors', 'conceptual_errors', 'correctness_pct', 'time_minutes', 'external_aid_suspected', 'originality_score', 'rubric_points', 'grade', 'teacher_comment_sv']


### Clean and preprocess text

In [21]:
# Clean teacher comments
def clean_text(text):
    """Normalise a single teacher comment.

    The text is lower-cased, every character that is not an ASCII letter,
    a Swedish letter (å, ä, ö), a digit, whitespace or one of ``. , ! ? -``
    is removed, and leading/trailing whitespace is stripped.

    Args:
        text: Raw comment string; may be missing (None / NaN).

    Returns:
        The cleaned string, or "" when ``text`` is missing.
    """
    if pd.isnull(text):
        return ""
    text = text.lower()
    # The whitelist must contain the real Swedish letters; a mis-encoded
    # character class would strip å/ä/ö from every comment.
    text = re.sub(r"[^a-zåäö0-9\s.,!?-]", "", text)
    return text.strip()

from sklearn.preprocessing import LabelEncoder

# Dataset columns used as model inputs; the encoded grade is appended to
# them when the feature matrix is assembled further down.
feature_cols = [
    "topic",
    "difficulty_1to5",
    "steps_count",
    "steps_completeness",
    "reasoning_quality",
    "method_appropriateness",
    "representation_use",
    "explanation_clarity",
    "units_handling",
    "edge_case_handling",
    "language_quality",
    "computational_errors",
    "conceptual_errors",
    "correctness_pct",
    "time_minutes",
    "external_aid_suspected",
    "originality_score",
    "rubric_points",
]

# Store a normalised copy of every teacher comment in a new column.
data["teacher_comment_clean"] = data["teacher_comment_sv"].apply(clean_text)

# Replace grade labels with integer codes; the fitted encoder stays
# available as `grade_encoder`.
grade_encoder = LabelEncoder()
data["grade_encoded"] = grade_encoder.fit_transform(data["grade"])

# Inputs: the listed feature columns plus the encoded grade.
# Target: the cleaned teacher comment text.
X = data[[*feature_cols, "grade_encoded"]]
y = data["teacher_comment_clean"]

print("âœ… Features shape:", X.shape)
print("ðŸ“˜ Example teacher comment:", y.iloc[0])


âœ… Features shape: (1186, 19)
ðŸ“˜ Example teacher comment: vÃ¤l fÃ¶rklarat och lÃ¤tt att fÃ¶lja.


In [2]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords

import os

# Reload the dataset from disk and show which columns it contains.
data = pd.read_csv("../data/test.csv")
print(data.columns)

# Missing comments become empty strings; missing grades become "F".
data = data.assign(
    teacher_comment_sv=data["teacher_comment_sv"].fillna(""),
    grade=data["grade"].fillna("F"),
)

# Swedish stop-word list provided by NLTK.
swedish_stopwords = stopwords.words("swedish")

# TF-IDF representation of the teacher comments, with the vocabulary capped
# at 3000 terms and Swedish stop words excluded.
tfidf = TfidfVectorizer(stop_words=swedish_stopwords, max_features=3000)
X_tfidf = tfidf.fit_transform(data["teacher_comment_sv"])

# Nearest-neighbour index over the TF-IDF vectors using cosine distance,
# configured to return a single neighbour by default.
model = NearestNeighbors(metric="cosine", n_neighbors=1).fit(X_tfidf)

# Persist the fitted index and the vectorizer under ./model.
os.makedirs("model", exist_ok=True)
joblib.dump(model, "model/teacher_feedback_model.pkl")
joblib.dump(tfidf, "model/teacher_feedback_vectorizer.pkl")

print("âœ… Teacher feedback model trained and saved successfully!")


Index(['submission_id', 'topic', 'difficulty_1to5', 'steps_count',
       'steps_completeness', 'reasoning_quality', 'method_appropriateness',
       'representation_use', 'explanation_clarity', 'units_handling',
       'edge_case_handling', 'language_quality', 'computational_errors',
       'conceptual_errors', 'correctness_pct', 'time_minutes',
       'external_aid_suspected', 'originality_score', 'rubric_points', 'grade',
       'teacher_comment_sv'],
      dtype='object')
âœ… Teacher feedback model trained and saved successfully!


### Next Step — Train the Model

# Load and inspect dataset
