In [30]:
import pandas as pd
import spacy

In [31]:
df=pd.read_json("/content/problems_data_cleaned.jsonl",lines=True)

In [32]:
pd.set_option("display.max_colwidth", None)

In [33]:
df.shape

(4112, 3)

In [34]:
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        # Keep non-stopwords and lemmatize
        if not token.is_stop and not token.is_punct:
            tokens.append(token.lemma_)
    return " ".join(tokens)

In [35]:
df['processed'] = df['txt'].apply(preprocess)

In [36]:
row = df.sample(1).index[0]
df[["processed", "txt"]].iloc[row]


Unnamed: 0,2280
processed,benelux artistic pottery consortium prepare exhibit prize urn vas gallery nijmegen sheer number vas display gallery trouble find pedestal right size single vase pedestal available place normally upside characterise diameter surface diameter varie unit length artistic reason important diameter base vase match diameter surface pedestal place ask find way place vas available pedestal order work need turn pedestal upside example figure 1 show possible assignment pedestal vas sample input 1 assist gallery write program compute assignment line contain integer 0 leq p v leq 10 4 number pedestal number vas output v distinct integer 1 leq x 1 dotsc x v leq p vase stand pedestal x print impossible assignment vas pedestal exist
txt,the benelux artistic pottery consortium is preparing for an exhibit of its most prized urns and vases at a gallery in nijmegen due to the sheer number of vases to be put on display the gallery has trouble finding a pedestal of the right size for every single vase they have pedestals available that can either be placed normally or upside down and can be characterised by the diameter of their top and bottom surface moreover the diameter of the top and bottom varies by at most one unit length for artistic reasons it is important that the diameter of the base of a vase matches the diameter of the surface of the pedestal it is placed on you have been asked to find a way to place all the vases on available pedestals in order to make this work you might need to turn some of the pedestals upside down for example figure 1 shows a possible assignment of pedestals to vases for sample input 1 assist the gallery by writing a program to compute such an assignment the first line contains two integers 0 leq p v leq 10 4 the number of pedestals and the number of vases output v distinct integers 1 leq x 1 dotsc x v leq p such that vase i can stand on pedestal x i or print impossible if no assignment of vases to pedestals exists


In [50]:
label_map = {
    'hard': 0,
    'medium': 1,
    'easy': 2
}

df['label_num'] = df['problem_class'].map(label_map)



In [51]:
df = df[['txt', 'problem_class', 'label_num']].copy()


In [52]:
df['char_len'] = df['txt'].str.len()
df['word_len'] = df['txt'].str.split().str.len()


In [53]:
df.head(3)

Unnamed: 0,txt,problem_class,label_num,char_len,word_len
0,unununium uuu was the name of the chemical element with atom number 111 until it changed to rontgenium rg in 2004 these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find1 after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n 100 and m 500 the output consists of m lines where the i th contains two integers u i and v i 1 leq u i v i leq n this indicates that the vertices u i and v i are connected with an edge in your graph,hard,0,1562,284
1,a number of eccentrics from central new york have decided that they have had enough of modern society and want to move from there together they have bought a rectangular piece of land far away and will now settle there the land consists of n times m squares and it is possible to build a maximum of one house on a given square each square has value a x y that describes how nice it is on a scale between 0 and 100 the goal of the eccentrics is to get as far away as possible from everyone else including each other the happiness an eccentric experiences from building his house on square x y is thus a x y cdot d where d is the smallest distance to another person out of habit the eccentrics use manhattan distance to measure this d is defined as min x x 2 y y 2 over all other peoples squares x 2 y 2 the eccentrics now want your help in placing their houses optimally so that the sum of the happiness they experience is as high as possible can you help them the input consists of 10 test cases which are described below print k lines with the positions of the houses each line should contain two numbers first the row for the house between 1 and n then the column between 1 and m two houses may not be placed at the same position,hard,0,1313,244
2,mario and luigi are playing a game where they pick distinct numbers m l 0 leq m l 2 10 18 in order to place careful bets on the outcome of the game you wish to know whose number is larger both mario and luigi have already shared their secret numbers with their close friend toadette who has memorized both of their numbers as binary numbers with 10 18 digits so you decide to go to toadette for help fortunately toadette is willing to help you and lets you ask her questions of one of two following types give two integers a and b and ask if you write out m and l in binary are m s bits in the inclusive range a b equal to l s bits in the same range toadette responds yes or no give an integer x and ask is the x th bit of m or l greater toadette responds with mario luigi or equal however toadette is afraid that her answers to questions of the first type gives you too much information so she decides to make things interesting each time you ask a question of the first type she will lie to you independently and randomly with probability frac 1 13 can you find out whose number is larger by asking at most 192 questions,hard,0,1195,224


In [54]:
df.groupby('label_num')['word_len'].agg(
    mean='mean',
    median='median',
    min='min',
    max='max'
)


Unnamed: 0_level_0,mean,median,min,max
label_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,298.069037,279.0,37,1130
1,273.535231,254.0,23,1281
2,217.439948,206.0,0,687


In [56]:
X = df["txt"]
y = df["label_num"]


In [57]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y   # IMPORTANT because imbalance exists
)


In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 5),
    min_df=3,
    max_df=0.9
)

X_train_vec = tfidf.fit_transform(X_train)
X_val_vec = tfidf.transform(X_val)


AttributeError: 'csr_matrix' object has no attribute 'lower'

In [69]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

model.fit(X_train_vec, y_train)


In [70]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_val_vec)

print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.47      0.73      0.57       389
           1       0.35      0.26      0.30       281
           2       0.17      0.01      0.01       153

    accuracy                           0.43       823
   macro avg       0.33      0.33      0.29       823
weighted avg       0.37      0.43      0.37       823



In [78]:
import re

# Word length
df['word_len'] = df['txt'].str.split().str.len()

# Character length
df['char_len'] = df['txt'].str.len()

# Math symbol count (+ - * |)
df['math_symbol_count'] = df['txt'].str.count(r"[+\-*|]")

# Keyword count (example keywords: 'graph', 'dp', 'recursion')
keywords = ['tree', 'dp', 'graph', 'queue', 'stack', 'recursion', 'dfs', 'bfs']
df['keyword_count'] = df['txt'].apply(lambda x: sum(x.lower().count(k) for k in keywords))


In [79]:
from sklearn.preprocessing import StandardScaler

numeric_features = df[['word_len', 'char_len', 'math_symbol_count', 'keyword_count']]
scaler = StandardScaler()
numeric_features_scaled = scaler.fit_transform(numeric_features)


In [80]:
from scipy.sparse import hstack

# Assume you already have tfidf_matrix from TF-IDF vectorizer
X = hstack([tfidf_matrix, numeric_features_scaled])
y = df['label_num']


In [81]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.65      0.53      0.59       425
           1       0.38      0.40      0.39       262
           2       0.40      0.60      0.48       136

    accuracy                           0.50       823
   macro avg       0.48      0.51      0.48       823
weighted avg       0.52      0.50      0.51       823



In [76]:
'''import numpy as np

feature_names = tfidf.get_feature_names_out()
classes = model.classes_

for i, cls in enumerate(classes):
    top = np.argsort(model.coef_[i])[-15:]
    print(f"\nTop words for {cls}:")
    print([feature_names[j] for j in top])'''


'import numpy as np\n\nfeature_names = tfidf.get_feature_names_out()\nclasses = model.classes_\n\nfor i, cls in enumerate(classes):\n    top = np.argsort(model.coef_[i])[-15:]\n    print(f"\nTop words for {cls}:")\n    print([feature_names[j] for j in top])'

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use the 'processed' column from the cleaned pipeline
texts = df['txt'].tolist()

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(
    max_features=5000,   # pick top 5000 words
    ngram_range=(1,2),   # unigrams + bigrams
    lowercase=False      # already handled in preprocessing
)

# Fit and transform your text
tfidf_matrix = vectorizer.fit_transform(texts)

# Convert to DataFrame if you want
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())


In [38]:
import sys
!{sys.executable} -m pip install gensim

from gensim.models import Word2Vec
import gensim

# Tokenize your processed text
tokenized_texts = [text.split() for text in df['processed']]

# Train a Word2Vec model
w2v_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,   # embedding dimension
    window=5,          # context window
    min_count=1,       # include all words
    workers=4,
    sg=1               # skip-gram model
)

# Example: get embedding for a word
word_vector = w2v_model.wv['graph']  # vector for 'graph'

# Optional: average embedding for a text
import numpy as np
def avg_embedding(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

df['embedding'] = [avg_embedding(tokens, w2v_model) for tokens in tokenized_texts]



In [39]:
from gensim.models import FastText

ft_model = FastText(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4, sg=1)


In [86]:
from sklearn.model_selection import train_test_split

# Assuming 'df' has all your columns: 'txt', 'label_num', numeric features...
X = df  # all features including numeric and 'txt'
y = df['label_num']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Check type
print(type(X_train))
print(X_train.columns)


<class 'pandas.core.frame.DataFrame'>
Index(['txt', 'problem_class', 'label_num', 'char_len', 'word_len',
       'math_symbol_count', 'keyword_count'],
      dtype='object')


In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,5),
    min_df=3,
    max_df=0.9
)

# Make sure we pass a Series of strings, not the whole DataFrame
X_train_vec = tfidf.fit_transform(X_train['txt'].astype(str))
X_val_vec = tfidf.transform(X_val['txt'].astype(str))

from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

numeric_cols = ['word_len', 'char_len', 'math_symbol_count', 'keyword_count']

numeric_train = X_train[numeric_cols].values
numeric_val = X_val[numeric_cols].values

scaler = StandardScaler()
numeric_train_scaled = scaler.fit_transform(numeric_train)
numeric_val_scaled = scaler.transform(numeric_val)

# Combine sparse TF-IDF + dense numeric
X_train_final = hstack([X_train_vec, numeric_train_scaled])
X_val_final = hstack([X_val_vec, numeric_val_scaled])



In [88]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Random Forest
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=30,
                                class_weight='balanced', n_jobs=-1, random_state=42)
rf_clf.fit(X_train_final, y_train)
y_pred_rf = rf_clf.predict(X_val_final)
print("Random Forest Results:")
print(classification_report(y_val, y_pred_rf))

# Linear SVM
svm_clf = LinearSVC(class_weight='balanced', max_iter=5000, random_state=42)
svm_clf.fit(X_train_final, y_train)
y_pred_svm = svm_clf.predict(X_val_final)
print("Linear SVM Results:")
print(classification_report(y_val, y_pred_svm))


Random Forest Results:
              precision    recall  f1-score   support

           0       0.56      0.85      0.67       389
           1       0.48      0.21      0.29       281
           2       0.51      0.37      0.43       153

    accuracy                           0.54       823
   macro avg       0.52      0.47      0.46       823
weighted avg       0.52      0.54      0.50       823

Linear SVM Results:
              precision    recall  f1-score   support

           0       0.55      0.53      0.54       389
           1       0.35      0.33      0.34       281
           2       0.42      0.48      0.45       153

    accuracy                           0.46       823
   macro avg       0.44      0.45      0.44       823
weighted avg       0.46      0.46      0.46       823



In [84]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rf_clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=30,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
rf_clf.fit(X_train_final, y_train)
y_pred_rf = rf_clf.predict(X_val_final)

print("Random Forest Results:")
print(classification_report(y_val, y_pred_rf))


NameError: name 'X_train_final' is not defined

In [85]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(
    class_weight='balanced',
    max_iter=5000,
    random_state=42
)
svm_clf.fit(X_train_final, y_train)
y_pred_svm = svm_clf.predict(X_val_final)

print("Linear SVM Results:")
print(classification_report(y_val, y_pred_svm))


NameError: name 'X_train_final' is not defined