In [3]:
import pandas as pd
import numpy as np

df = pd.read_json(r"C:\Users\dhanu\OneDrive\Desktop\autojudge project\problems_data.jsonl", lines = True)
df.head(5)

Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 ...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0 2 3 2 50 60 50 30 50 40', 'outpu...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Žofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3 3 C 2 C 1 C', 'output': 'GHOST...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0 10 0 10 10', 'output': '14.14'...",hard,9.6,https://open.kattis.com/problems/barktree


In [4]:
def combine_text(row):
    return (
        str(row["title"]) + " " +
        str(row["description"]) + " " +
        str(row["input_description"]) + " " +
        str(row["output_description"]) + " "
    )

df["full_text"] = df.apply(combine_text, axis = 1)

In [5]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["full_text"] = df["full_text"].apply(clean_text)


In [6]:
x_text = df["full_text"]
y_class = df["problem_class"]
y_score = df["problem_score"]

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features = 5000,
    ngram_range = (1, 2),
    stop_words = "english"
)

x_tfidf = tfidf.fit_transform(x_text)

In [8]:
def length_features(row):
    return {
        "total_length": len(row["full_text"]),
        "desc_length": len(str(row["description"])),
        "input_length": len(str(row["input_description"])),
        "output_length": len(str(row["output_description"]))
    }

def symbol_features(text):
    symbols = ['+', '-', '*', '/', '%', '=', '<', '>']
    return {
        "num_digits": sum(c.isdigit() for c in text),
        "num_symbols": sum(text.count(s) for s in symbols)
    }



In [9]:
feature_rows = []

for _, row in df.iterrows():
    feats = {}
    
    feats.update(length_features(row))
    feats.update(symbol_features(row["full_text"]))
    
    feature_rows.append(feats)

x_extra = pd.DataFrame(feature_rows)

In [10]:
from scipy.sparse import hstack

x = hstack([x_tfidf, x_extra.values])


In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split( x, y_class, y_score, test_size = 0.2, random_state = 42) 

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

clf = LogisticRegression(max_iter = 1000)
clf.fit(x_train, y_class_train)

y_class_pred = clf.predict(x_test)

print("Classification Accuracy:", accuracy_score(y_class_test, y_class_pred))
print("Confusion Matrix:\n", confusion_matrix(y_class_test, y_class_pred))

Classification Accuracy: 0.5127582017010935
Confusion Matrix:
 [[ 46  26  64]
 [ 25 268 132]
 [ 20 134 108]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

reg = RandomForestRegressor(n_estimators = 200, random_state = 42)

reg.fit(x_train, y_score_train)

y_score_pred = reg.predict(x_test)

In [14]:
print("MAE:", mean_absolute_error(y_score_test, y_score_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_score_test, y_score_pred)))

MAE: 1.667886998784933
RMSE: 2.017098949615385


In [15]:
import joblib
import os

os.makedirs("models", exist_ok=True)

joblib.dump(tfidf, "models/tfidf.pkl")
joblib.dump(clf, "models/classifier.pkl")
joblib.dump(reg, "models/regressor.pkl")

['models/regressor.pkl']