#**PRE-PROCESSING**

**SUMMARY**
1. Make a single text column combining desc,inout desc,output desc

In [70]:
import pandas as pd

df=pd.read_json("/content/problems_data.jsonl",lines=True)
df.shape

(4112, 8)

In [71]:
#removed title,url,sample_io
text_cols = ["description","input_description","output_description"]

df_text = df[text_cols + ["problem_class", "problem_score"]]
df_text.loc[0]

Unnamed: 0,0
description,"Unununium (Uuu) was the name of the chemical\n element with atom number 111, until it changed to\n Röntgenium (Rg) in 2004. These heavy elements are very\n unstable and have only been synthesized in a few\n laboratories.\nYou have just been hired by one of these labs to optimize\n the algorithms used in simulations. For example, when\n simulating complicated chemical reactions, it is important to\n keep track of how many particles there are, and this is done by\n counting connected components in a graph.\nCurrently, the lab has some Python code (see attachments)\n that takes an undirected graph and outputs the number of\n connected components. As you can see, this code is based on\n everyone’s favourite data structure union-find1.\nAfter looking at the code for a while, you notice that it\n actually has a bug in it! The code still gives correct answers,\n but the bug could cause it to run inefficiently. Your task is\n to construct a graph with a given number of vertices and edges\n where the code runs very slowly. We will count how many times\n the third line (the one inside the while loop) is visited, and\n your program will get a score according to this number.\n"
input_description,"The input consists of one line with two integers\n $N$ and $M$, the number of vertices and edges\n your graph should have. Apart from the sample, there will be\n only one test case, with $N =\n 100$ and $M =\n 500$."
output_description,"The output consists of $M$ lines where the $i$:th contains two integers\n $u_ i$ and $v_ i$ ($1 \leq u_ i, v_ i \leq N$). This\n indicates that the vertices $u_\n i$ and $v_ i$ are\n connected with an edge in your graph."
problem_class,hard
problem_score,9.7


In [72]:
pd.set_option("display.max_colwidth", None) #comment out if u dont want to see the whole content of the column u get ... , i prefer looking at the whole content.

In [73]:
#we have df_text dataframe which shall have multiple columns where we can test and compare before and after cleaning.
#we have df dataframe which shall contain the final cleaned version of everything.
#i would like to merge the three columns in the end after cleaning.

In [74]:
import re
import unicodedata

def clean_text(text):
    if text is None:
        return ""
    if not isinstance(text, str):
        return ""

    # 1. Normalize unicode (ö → o, é → e)
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("utf-8")
    text = text.lower()  # lowercase


    # 2. REPLACE LATEX → UNICODE AFTER ascii/lowercase (preserves ≤ as math symbols have significance)
    replacements = {
        r'\\leq': '≤', r'\\geq': '≥', r'\\lt': '<', r'\\gt': '>',
        r'\\le': '≤', r'\\ge': '≥', r'\\neq': '≠', r'\\approx': '≈',r'\\cdot': ' × '
    }
    for latex, sym in replacements.items():
        text = re.sub(latex, sym, text)

    text = re.sub(r'(\d+)([,\\\s]+)(\d+)', r'\1\3', text)


    # 3. Punctuation
    text = re.sub(r"[()\n!,_:'\"$.?{}\\/]", " ", text)


    # 4. Hyphens AFTER symbols
    text = re.sub(r"(?<=[a-z][a-z])-(?=[a-z][a-z])", " ", text)
    text = re.sub(r"\\cdot", " ", text)

    # 5. Removes whitespace
    text = " ".join(text.split()).strip()  # OR: re.sub(r'\s+', ' ', text).strip()

    # 6. Replace numbers with num so numbers dont take much columns in tfidf
    text = re.sub(r'\d+', ' num ', text)

    # 7. remove patterns like ababa, ababab, aaaa - as they give no meaning in the problem
    text = re.sub(r'\b([a-z]{1,3})\1+\b', ' ', text)

    # 8. Remove alphabet dummy sequences
    text = re.sub(r'\babc(def)?\b', ' ', text)

    return text


In [75]:
#the below step shall clean all the texts.

In [76]:
cols_to_clean = [
    "description",
    "input_description",
    "output_description"
]

for col in cols_to_clean:
    df_text[col] = df_text[col].apply(clean_text)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text[col] = df_text[col].apply(clean_text)


In [77]:
df_text.loc[0]

Unnamed: 0,0
description,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number
input_description,the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num
output_description,the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph
problem_class,hard
problem_score,9.7


In [78]:
import numpy as np

In [79]:
#check if whitespace gone
pd.DataFrame({
    'nulls': df_text[['description','input_description','output_description']].isna().sum(),
    'empty_strings': (df_text[['description','input_description','output_description']] == "").sum(),
    'fake_null': (df_text[['description','input_description','output_description']].isin(['none','nan','null'])).sum()
})

Unnamed: 0,nulls,empty_strings,fake_null
description,0,81,0
input_description,0,120,0
output_description,0,131,0


In [80]:
import pandas as pd

# This finds rows where any of the three columns are empty or just spaces
is_empty = df_text[
    (df_text['description'].str.strip() == "") |
    (df_text['input_description'].str.strip() == "") |
    (df_text['output_description'].str.strip() == "")
]

print("Rows with empty strings:")
list_empty=is_empty.index.tolist()
print(list_empty)

# Use .loc to grab those specific rows and then group them
empty_counts = df_text.loc[list_empty].groupby('problem_class').size()

print("\nCounts by problem class:")
print(empty_counts)

Rows with empty strings:
[2, 5, 6, 13, 14, 15, 26, 37, 39, 47, 55, 56, 58, 66, 95, 105, 111, 112, 164, 213, 217, 317, 322, 370, 387, 392, 398, 416, 447, 455, 497, 507, 523, 540, 541, 550, 558, 603, 668, 686, 687, 731, 770, 793, 798, 841, 855, 868, 939, 971, 990, 1085, 1097, 1128, 1135, 1141, 1149, 1169, 1253, 1280, 1296, 1313, 1333, 1349, 1358, 1367, 1368, 1418, 1446, 1454, 1463, 1496, 1501, 1525, 1557, 1584, 1588, 1594, 1595, 1603, 1630, 1643, 1646, 1656, 1659, 1726, 1737, 1742, 1815, 1816, 1844, 1848, 1870, 1881, 1883, 1899, 1944, 1961, 1963, 1986, 1993, 1994, 2006, 2014, 2020, 2043, 2083, 2102, 2118, 2203, 2211, 2245, 2252, 2263, 2266, 2363, 2374, 2375, 2401, 2408, 2420, 2421, 2448, 2473, 2474, 2503, 2511, 2527, 2557, 2564, 2566, 2568, 2570, 2576, 2580, 2588, 2594, 2598, 2618, 2625, 2712, 2731, 2760, 2764, 2767, 2804, 2832, 2835, 2877, 2880, 2893, 2932, 2938, 2954, 2955, 3001, 3010, 3037, 3038, 3045, 3046, 3062, 3090, 3098, 3117, 3167, 3180, 3193, 3235, 3254, 3255, 3256, 3258, 3338,

In [81]:
import pandas as pd

# 1. Your list of indices
indices_list = [2, 5, 6, 13, 14, 15, 26, 37, 39, 47, 55, 56, 58, 66, 95, 105, 111, 112, 164, 213, 217, 317, 322, 370, 387, 392, 398, 416, 447, 455, 497, 507, 523, 540, 541, 550, 558, 603, 668, 686, 687, 731, 770, 793, 798, 841, 855, 868, 939, 971, 990, 1085, 1097, 1128, 1135, 1141, 1149, 1169, 1253, 1280, 1296, 1313, 1333, 1349, 1358, 1367, 1368, 1418, 1446, 1454, 1463, 1496, 1501, 1525, 1557, 1584, 1588, 1594, 1595, 1603, 1630, 1643, 1646, 1656, 1659, 1726, 1737, 1742, 1815, 1816, 1844, 1848, 1870, 1881, 1883, 1899, 1944, 1961, 1963, 1986, 1993, 1994, 2006, 2014, 2020, 2043, 2083, 2102, 2118, 2203, 2211, 2245, 2252, 2263, 2266, 2363, 2374, 2375, 2401, 2408, 2420, 2421, 2448, 2473, 2474, 2503, 2511, 2527, 2557, 2564, 2566, 2568, 2570, 2576, 2580, 2588, 2594, 2598, 2618, 2625, 2712, 2731, 2760, 2764, 2767, 2804, 2832, 2835, 2877, 2880, 2893, 2932, 2938, 2954, 2955, 3001, 3010, 3037, 3038, 3045, 3046, 3062, 3090, 3098, 3117, 3167, 3180, 3193, 3235, 3254, 3255, 3256, 3258, 3338, 3350, 3379, 3392, 3413, 3426, 3431, 3471, 3534, 3535, 3550, 3568, 3590, 3627, 3636, 3670, 3671, 3691, 3783, 3813, 3826, 3884, 3885, 3898, 3942, 3965, 3973, 3983, 4008, 4028, 4032, 4049, 4076, 4079, 4088, 4095, 4097, 4103, 4109, 4110]

# 2. Extract the specific rows
detailed_audit = df_text.loc[indices_list].copy()

# 3. Create a function to check individual cell status
def check_cell(val):
    if str(val).strip() == "":
        return "Empty String"
    else:
        return "Has Text"

# 4. Apply the check to each of the three columns
detailed_audit['desc_status'] = detailed_audit['description'].apply(check_cell)
detailed_audit['input_status'] = detailed_audit['input_description'].apply(check_cell)
detailed_audit['output_status'] = detailed_audit['output_description'].apply(check_cell)

# 5. Keep only the columns that matter for the report
# This includes the original index and the problem class
final_df = detailed_audit.reset_index()[['index', 'problem_class', 'desc_status', 'input_status', 'output_status']]

final_df.head(10)

Unnamed: 0,index,problem_class,desc_status,input_status,output_status
0,2,hard,Has Text,Empty String,Empty String
1,5,hard,Has Text,Empty String,Empty String
2,6,hard,Has Text,Has Text,Empty String
3,13,hard,Empty String,Has Text,Has Text
4,14,hard,Has Text,Empty String,Empty String
5,15,hard,Has Text,Empty String,Empty String
6,26,hard,Has Text,Empty String,Empty String
7,37,hard,Has Text,Empty String,Empty String
8,39,hard,Has Text,Has Text,Empty String
9,47,hard,Has Text,Has Text,Empty String


In [42]:
# 1. Use the indices you already identified
indices_list = [2, 5, 6, 13, 14, 15, 26, 37, 39, 47, 55, 56, 58, 66, 95, 105, 111, 112, 164, 213, 217, 317, 322, 370, 387, 392, 398, 416, 447, 455, 497, 507, 523, 540, 541, 550, 558, 603, 668, 686, 687, 731, 770, 793, 798, 841, 855, 868, 939, 971, 990, 1085, 1097, 1128, 1135, 1141, 1149, 1169, 1253, 1280, 1296, 1313, 1333, 1349, 1358, 1367, 1368, 1418, 1446, 1454, 1463, 1496, 1501, 1525, 1557, 1584, 1588, 1594, 1595, 1603, 1630, 1643, 1646, 1656, 1659, 1726, 1737, 1742, 1815, 1816, 1844, 1848, 1870, 1881, 1883, 1899, 1944, 1961, 1963, 1986, 1993, 1994, 2006, 2014, 2020, 2043, 2083, 2102, 2118, 2203, 2211, 2245, 2252, 2263, 2266, 2363, 2374, 2375, 2401, 2408, 2420, 2421, 2448, 2473, 2474, 2503, 2511, 2527, 2557, 2564, 2566, 2568, 2570, 2576, 2580, 2588, 2594, 2598, 2618, 2625, 2712, 2731, 2760, 2764, 2767, 2804, 2832, 2835, 2877, 2880, 2893, 2932, 2938, 2954, 2955, 3001, 3010, 3037, 3038, 3045, 3046, 3062, 3090, 3098, 3117, 3167, 3180, 3193, 3235, 3254, 3255, 3256, 3258, 3338, 3350, 3379, 3392, 3413, 3426, 3431, 3471, 3534, 3535, 3550, 3568, 3590, 3627, 3636, 3670, 3671, 3691, 3783, 3813, 3826, 3884, 3885, 3898, 3942, 3965, 3973, 3983, 4008, 4028, 4032, 4049, 4076, 4079, 4088, 4095, 4097, 4103, 4109, 4110]

# 2. Extract these specific rows
error_df = df_text.loc[indices_list].copy()

# 3. Define a function to analyze each row
def analyze_row(row):
    # Check which columns are empty (handling spaces and NaN)
    empty_cols = []
    if str(row['description']).strip() == "": empty_cols.append('description')
    if str(row['input_description']).strip() == "": empty_cols.append('input_description')
    if str(row['output_description']).strip() == "": empty_cols.append('output_description')

    # Check if the row is entirely empty or has some text
    total_empty = len(empty_cols)
    if total_empty == 3:
        status = "Entirely Empty"
    else:
        status = f"Partial (Text in {3 - total_empty} box/es)"

    return pd.Series([", ".join(empty_cols), status])

# 4. Apply the analysis
error_df[['empty_in_columns', 'row_status']] = error_df.apply(analyze_row, axis=1)

# 5. Display the result grouped by problem class
final_report = error_df[['problem_class', 'empty_in_columns', 'row_status']]
print(final_report)

     problem_class                       empty_in_columns  \
2             hard  input_description, output_description   
5             hard  input_description, output_description   
6             hard                     output_description   
13            hard                            description   
14            hard  input_description, output_description   
...            ...                                    ...   
4095          easy                            description   
4097          easy                            description   
4103          easy                            description   
4109          easy                            description   
4110          easy                            description   

                      row_status  
2     Partial (Text in 1 box/es)  
5     Partial (Text in 1 box/es)  
6     Partial (Text in 2 box/es)  
13    Partial (Text in 2 box/es)  
14    Partial (Text in 1 box/es)  
...                          ...  
4095  Partial (Text in

In [83]:
for problem_class_value in final_df['problem_class'].unique():
    print(f"\n--- Problem Class: {problem_class_value.capitalize()} ---")
    display(final_df[final_df['problem_class'] == problem_class_value])


--- Problem Class: Hard ---


Unnamed: 0,index,problem_class,desc_status,input_status,output_status
0,2,hard,Has Text,Empty String,Empty String
1,5,hard,Has Text,Empty String,Empty String
2,6,hard,Has Text,Has Text,Empty String
3,13,hard,Empty String,Has Text,Has Text
4,14,hard,Has Text,Empty String,Empty String
...,...,...,...,...,...
91,1848,hard,Has Text,Empty String,Empty String
92,1870,hard,Has Text,Empty String,Empty String
93,1881,hard,Empty String,Has Text,Has Text
94,1883,hard,Empty String,Has Text,Has Text



--- Problem Class: Medium ---


Unnamed: 0,index,problem_class,desc_status,input_status,output_status
96,1944,medium,Has Text,Empty String,Empty String
97,1961,medium,Has Text,Empty String,Empty String
98,1963,medium,Has Text,Empty String,Empty String
99,1986,medium,Has Text,Empty String,Empty String
100,1993,medium,Has Text,Empty String,Empty String
...,...,...,...,...,...
169,3254,medium,Empty String,Has Text,Has Text
170,3255,medium,Empty String,Has Text,Has Text
171,3256,medium,Empty String,Has Text,Has Text
172,3258,medium,Has Text,Has Text,Empty String



--- Problem Class: Easy ---


Unnamed: 0,index,problem_class,desc_status,input_status,output_status
174,3350,easy,Has Text,Empty String,Empty String
175,3379,easy,Empty String,Has Text,Has Text
176,3392,easy,Has Text,Empty String,Empty String
177,3413,easy,Empty String,Has Text,Has Text
178,3426,easy,Empty String,Has Text,Has Text
179,3431,easy,Has Text,Empty String,Empty String
180,3471,easy,Has Text,Empty String,Empty String
181,3534,easy,Has Text,Empty String,Empty String
182,3535,easy,Empty String,Empty String,Empty String
183,3550,easy,Empty String,Has Text,Has Text


In [None]:
#as we can see most of the easy problem have no description at all so it would be very tough to train the model,
#so it very expected we maybe have less recall on easy problem

In [48]:
import pandas as pd

# 1. Your identified list of indices
indices_list = [2, 5, 6, 13, 14, 15, 26, 37, 39, 47, 55, 56, 58, 66, 95, 105, 111, 112, 164, 213, 217, 317, 322, 370, 387, 392, 398, 416, 447, 455, 497, 507, 523, 540, 541, 550, 558, 603, 668, 686, 687, 731, 770, 793, 798, 841, 855, 868, 939, 971, 990, 1085, 1097, 1128, 1135, 1141, 1149, 1169, 1253, 1280, 1296, 1313, 1333, 1349, 1358, 1367, 1368, 1418, 1446, 1454, 1463, 1496, 1501, 1525, 1557, 1584, 1588, 1594, 1595, 1603, 1630, 1643, 1646, 1656, 1659, 1726, 1737, 1742, 1815, 1816, 1844, 1848, 1870, 1881, 1883, 1899, 1944, 1961, 1963, 1986, 1993, 1994, 2006, 2014, 2020, 2043, 2083, 2102, 2118, 2203, 2211, 2245, 2252, 2263, 2266, 2363, 2374, 2375, 2401, 2408, 2420, 2421, 2448, 2473, 2474, 2503, 2511, 2527, 2557, 2564, 2566, 2568, 2570, 2576, 2580, 2588, 2594, 2598, 2618, 2625, 2712, 2731, 2760, 2764, 2767, 2804, 2832, 2835, 2877, 2880, 2893, 2932, 2938, 2954, 2955, 3001, 3010, 3037, 3038, 3045, 3046, 3062, 3090, 3098, 3117, 3167, 3180, 3193, 3235, 3254, 3255, 3256, 3258, 3338, 3350, 3379, 3392, 3413, 3426, 3431, 3471, 3534, 3535, 3550, 3568, 3590, 3627, 3636, 3670, 3671, 3691, 3783, 3813, 3826, 3884, 3885, 3898, 3942, 3965, 3973, 3983, 4008, 4028, 4032, 4049, 4076, 4079, 4088, 4095, 4097, 4103, 4109, 4110]

# 2. Extract problematic rows and reset index to make "Original Row" a column
audit_df = df_text.loc[indices_list].copy()
audit_df.index.name = 'original_row_index'
audit_df = audit_df.reset_index()

# 3. Logic to determine missing columns and content status
def get_audit_details(row):
    cols = ['description', 'input_description', 'output_description']
    # Find which of these columns are empty (handling NaN and whitespace)
    empty_cols = [c for c in cols if str(row[c]).strip() == ""]

    # Check if the row has any text at all in these three boxes
    if len(empty_cols) == 3:
        status = "Entirely Empty"
    else:
        status = "Partial Text Present"

    return pd.Series([", ".join(empty_cols), status])

# 4. Create the new columns
audit_df[['missing_columns', 'content_status']] = audit_df.apply(get_audit_details, axis=1)

# 5. Keep only the columns you need for the report
audit_report = audit_df[['original_row_index', 'problem_class', 'missing_columns', 'content_status']]

# Display the top of the new DataFrame
audit_report



Unnamed: 0,original_row_index,problem_class,missing_columns,content_status
0,2,hard,"input_description, output_description",Partial Text Present
1,5,hard,"input_description, output_description",Partial Text Present
2,6,hard,output_description,Partial Text Present
3,13,hard,description,Partial Text Present
4,14,hard,"input_description, output_description",Partial Text Present
...,...,...,...,...
208,4095,easy,description,Partial Text Present
209,4097,easy,description,Partial Text Present
210,4103,easy,description,Partial Text Present
211,4109,easy,description,Partial Text Present


In [21]:
for col in text_cols:
    # Create row mask for this column
    mask = df_text[col].str.strip() == ''
    df_text.loc[mask, col] = np.nan


#for some reason empty strings were not going away so i converted them into null and now i shall handle them,
#the empty strings are mostly no input or ouput description, so giving them a word, makes tfidf understand that the value is missing.

In [22]:
#check if whitespace gone
pd.DataFrame({
    'nulls': df_text[['description','input_description','output_description']].isna().sum(),
    'empty_strings': (df_text[['description','input_description','output_description']] == "").sum(),
    'fake_null': (df_text[['description','input_description','output_description']].isin(['none','nan','null'])).sum()
})


Unnamed: 0,nulls,empty_strings,fake_null
description,81,0,0
input_description,120,0,0
output_description,131,0,0


In [None]:
from sklearn.impute import SimpleImputer

df_text[text_cols] = df_text[text_cols].fillna("missing")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text[text_cols] = df_text[text_cols].fillna("missing")


In [None]:
#check if whitespace gone
pd.DataFrame({
    'nulls': df_text[['description','input_description','output_description']].isna().sum(),
    'empty_strings': (df_text[['description','input_description','output_description']] == "").sum(),
    'fake_null': (df_text[['description','input_description','output_description']].isin(['none','nan','null'])).sum()
})


Unnamed: 0,nulls,empty_strings,fake_null
description,0,0,0
input_description,0,0,0
output_description,0,0,0


In [None]:
#cleaned table
df_text.loc[0]

Unnamed: 0,0
description,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number
input_description,the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num
output_description,the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph
problem_class,hard
problem_score,9.7


In [None]:
#uncleaned table
df.loc[0]

Unnamed: 0,0
title,Uuu
description,"Unununium (Uuu) was the name of the chemical\n element with atom number 111, until it changed to\n Röntgenium (Rg) in 2004. These heavy elements are very\n unstable and have only been synthesized in a few\n laboratories.\nYou have just been hired by one of these labs to optimize\n the algorithms used in simulations. For example, when\n simulating complicated chemical reactions, it is important to\n keep track of how many particles there are, and this is done by\n counting connected components in a graph.\nCurrently, the lab has some Python code (see attachments)\n that takes an undirected graph and outputs the number of\n connected components. As you can see, this code is based on\n everyone’s favourite data structure union-find1.\nAfter looking at the code for a while, you notice that it\n actually has a bug in it! The code still gives correct answers,\n but the bug could cause it to run inefficiently. Your task is\n to construct a graph with a given number of vertices and edges\n where the code runs very slowly. We will count how many times\n the third line (the one inside the while loop) is visited, and\n your program will get a score according to this number.\n"
input_description,"The input consists of one line with two integers\n $N$ and $M$, the number of vertices and edges\n your graph should have. Apart from the sample, there will be\n only one test case, with $N =\n 100$ and $M =\n 500$."
output_description,"The output consists of $M$ lines where the $i$:th contains two integers\n $u_ i$ and $v_ i$ ($1 \leq u_ i, v_ i \leq N$). This\n indicates that the vertices $u_\n i$ and $v_ i$ are\n connected with an edge in your graph."
sample_io,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 5 6 6 7 5 7 1 7 7 2 5 1'}]"
problem_class,hard
problem_score,9.7
url,https://open.kattis.com/problems/uuu


In [None]:
#checking any random row to see if we can do anymore cleaning
row = df_text.sample(1).index[0]
df_text.loc[row]

Unnamed: 0,1363
description,given an input string composed solely of lowercase english letters find the longest substring that occurs more than once in the input string the two occurrences are allowed to partially overlap
input_description,the input is a single line containing a string of lowercase letters the string contains more than one character but no more than num ^ num at least one letter will appear at least twice
output_description,print a single line of output the longest substring that occurs more than once in the input string if there are multiple longest repeated substrings print the one the would come first when the longest substrings are sorted in lexicographical alphabetical order
problem_class,hard
problem_score,6.3


In [None]:
#finally adding all three columns.

df_text['text'] = (df_text['description'].fillna('') + ' | ' +
                         df_text['input_description'].fillna('') + ' | ' +
                         df_text['output_description'].fillna('')).str.strip()

# Clean separator: replace multiple | with single space if needed
df_text['text'] = df_text['text'].str.replace(r'\s*\|\s*', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text'] = (df_text['description'].fillna('') + ' | ' +
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['text'] = df_text['text'].str.replace(r'\s*\|\s*', ' ', regex=True)


In [None]:
df_text.loc[0]

Unnamed: 0,0
description,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number
input_description,the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num
output_description,the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph
problem_class,hard
problem_score,9.7
text,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph


In [None]:
df_text=df_text[['text','problem_class','problem_score']]

In [None]:
df_text.loc[0]

Unnamed: 0,0
text,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph
problem_class,hard
problem_score,9.7


In [None]:
difficulty_map = {'easy': 0, 'medium': 1, 'hard': 2}
df_text['problem_level'] = df_text['problem_class'].map(difficulty_map)

In [None]:
df_text=df_text[['text','problem_level','problem_score']]

In [None]:
df_text.loc[0]

Unnamed: 0,0
text,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph
problem_level,2
problem_score,9.7


In [None]:
#checking any specific row just to compare the jsnol and actual
df_text.loc[9]

Unnamed: 0,9
text,you are given a simple undirected graph with no self loops or multiple edges some of the edges are marked as special your task is to find a simple cycle where for each special edge that edge either belongs to the cycle or neither of its endpoints touch the cycle the cycle is not allowed to repeat vertices output any solution or report that none exist the first line of input contains three integers n num ≤ n ≤ num m num ≤ m ≤ frac n × n- num num and k num ≤ k ≤ m where n is the number of nodes in the graph m is the number of edges and k is the number of edges that are special the nodes are numbered num through n output an integer denoting the length of the found cycle on one line on subsequent lines output the vertices of the cycle in order around the cycle one per line if no such cycle exists simply output - num
problem_level,2
problem_score,9.5


In [None]:
#check if whitespace gone after merging
pd.DataFrame({
    'nulls': df_text[['text']].isna().sum(),
    'empty_strings': (df_text[['text']] == "").sum(),
    'fake_null': (df_text[['text']].isin(['none','nan','null'])).sum()
})

Unnamed: 0,nulls,empty_strings,fake_null
text,0,0,0


In [None]:
df_text.shape

(4112, 3)

In [None]:
df_text.to_json('problems_data_cleaned.jsonl', orient='records', lines=True)

In conclusion:
1. we converted everything into lower case, removed empty and null values, preserved math symbols and converted unicodes.
2. droppped unnecessary columns, gave numbers to hard,medium,easy classes.

#**FEATURE EXTRACTION**

In [None]:
def create_features(text_series):
    features = pd.DataFrame(index=text_series.index)

    # Lengths
    features['text_len'] = text_series.str.len()
    features['word_count'] = text_series.str.split().str.len()

    # MATH: Join symbols (pandas str.count fix!)
    math_pattern = r'[≤≥×≠≈∑∏√∀∃]'
    features['math_count'] = text_series.str.count(math_pattern)
    features['math_density'] = features['math_count'] / features['text_len'].replace(0, 1)

    # ALGO: Use .str.contains() + sum for presence
    algo_mega = r'(graph|tree|node|edge|dfs|bfs|dp|dynamic|knapsack|lcs|fibonacci|' + \
                r'modulo|matrix|prime|gcd|lcm|substring|palindrome|kmp|trie|greedy)'
    features['algo_mentions'] = text_series.str.count(algo_mega, flags=re.I | re.U)

    # Other features
    features['constraints'] = text_series.str.count(r'[≤≥]')
    features['complexity'] = text_series.str.count(r'O\(|10\^')

    return features.fillna(0)


features = create_features(df_text['text'])


In [None]:
features.head()

Unnamed: 0,text_len,word_count,math_count,math_density,algo_mentions,constraints,complexity
0,1503,286,2,0.001331,8,2,0
1,1266,247,1,0.00079,0,0,0
2,1172,230,1,0.000853,0,1,0
3,1227,230,2,0.00163,0,2,0
4,2024,400,4,0.001976,8,4,0


In [None]:
df_text.head()

Unnamed: 0,text,problem_level,problem_score
0,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph,2,9.7
1,a number of eccentrics from central new york have decided that they have had enough of modern society and want to move from there together they have bought a rectangular piece of land far away and will now settle there the land consists of n times m squares and it is possible to build a maximum of one house on a given square each square has value a x y that describes how nice it is on a scale between num and num the goal of the eccentrics is to get as far away as possible from everyone else including each other the happiness an eccentric experiences from building his house on square x y is thus a x y × d where d is the smallest distance to another person out of habit the eccentrics use manhattan distance to measure this; d is defined as min x - x num + y - y num over all other peoples squares x num y num the eccentrics now want your help in placing their houses optimally so that the sum of the happiness they experience is as high as possible can you help them the input consists of num test cases which are described below print k lines with the positions of the houses each line should contain two numbers first the row for the house between num and n then the column between num and m two houses may not be placed at the same position,2,9.7
2,mario and luigi are playing a game where they pick distinct numbers m l num ≤ m l < num ^ num ^ num in order to place careful bets on the outcome of the game you wish to know whose number is larger both mario and luigi have already shared their secret numbers with their close friend toadette who has memorized both of their numbers as binary numbers with num ^ num digits so you decide to go to toadette for help fortunately toadette is willing to help you and lets you ask her questions of one of two following types give two integers a and b and ask if you write out m and l in binary are m s bits in the inclusive range [a b] equal to l s bits in the same range toadette responds yes or no give an integer x and ask is the x th bit of m or l greater toadette responds with mario luigi or equal however toadette is afraid that her answers to questions of the first type gives you too much information so she decides to make things interesting each time you ask a question of the first type she will lie to you independently and randomly with probability frac num num can you find out whose number is larger by asking at most num questions missing missing,2,9.6
3,zofka is bending a copper wire she starts with a straight wire placed on the table with the starting point glued to the middle of the table she then repeatedly picks a point on the wire and bends the part starting at that point away from the starting point by num degrees either clockwise or counterclockwise throughout the process the starting point stays glued to the middle of the table the most important consideration is that she does not want the wire to touch itself as she bends it that would summon the wire ghost she needs your help she has a list of points together with the direction at each point clockwise or counterclockwise she wants to know if bending the wire at the listed points in the given order would cause the wire ghost to appear at any time during the process the first line contains two integers l and n where l is the length of the wire and n is the number of points each of the next n lines contains a number from num dots l describing the point on the wire followed by w clockwise or c counter clockwise you may assume l≤ num num and n≤ num the output consists of a single line consisting of the string ghost if the wire would touch itself during the bending and the string safe otherwise,2,9.6
4,your dog spot is let loose in the park well relatively loose he is tied to a post with a leash limiting his movements spread around the park are various squeaky toys and other dog paraphernalia which spot happily goes after when he sees them when he gets to a toy he will chew at it for a while until it has become defunct at which point he will go after the next toy which looks much squeakier this is all very well but there are obstacles to spots joyful canine play trees in the park there are several trees and if spot walks around a tree his leash gets wrapped around the tree making his movements more limited being a dog with pressing squeaky matters to attend to spot does not really have time to take things such as trees into account and always goes directly in a straight line for his next toy if he cant get to his next toy because he has run out of leash spot will start barking uncontrollably as no doubt any of us would and you have to help him how long would spots leash have to be in order for him to run out of toys before he runs out of leash for practical purposes you may assume that when seen from above spot his toys and the trees are points and that the post that the leash is tied to will not hinder spots movements in any way after having finished chewing a toy spot always goes for the most shiny unchewed toy the post to which spots leash is tied is located at coordinates num and this is also where spot is initially located the first line of input consists of two integers n and m where num ≤ n ≤ num is the number of toys in the park and num ≤ m ≤ num is the number of trees in the park then follow n lines each containing two integers x y giving the coordinates of a toy the toys are listed in decreasing order of shininess this is followed by m lines each containing two integers x y indicating that there is a tree at those coordinates write a single line containing the length needed for the leash in order for spot to be able to get to all his toys rounded to two decimal digits,2,9.6


In [None]:
'''# Install & download
!pip install spacy
!python -m spacy download en_core_web_sm'''

'# Install & download\n!pip install spacy\n!python -m spacy download en_core_web_sm'

In [None]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
#to remove stop words like: to for the

def preprocess(text):
    doc = nlp(text)

    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)

In [None]:
df_text["clean_text"] = df_text['text'].apply(preprocess)

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
df_text.head(1)
#to see comparitively

Unnamed: 0,text,problem_level,problem_score,clean_text
0,unununium was the name of the chemical element with atom number num until it changed to rontgenium rg in num these heavy elements are very unstable and have only been synthesized in a few laboratories you have just been hired by one of these labs to optimize the algorithms used in simulations for example when simulating complicated chemical reactions it is important to keep track of how many particles there are and this is done by counting connected components in a graph currently the lab has some python code see attachments that takes an undirected graph and outputs the number of connected components as you can see this code is based on everyones favourite data structure union find num after looking at the code for a while you notice that it actually has a bug in it the code still gives correct answers but the bug could cause it to run inefficiently your task is to construct a graph with a given number of vertices and edges where the code runs very slowly we will count how many times the third line the one inside the while loop is visited and your program will get a score according to this number the input consists of one line with two integers n and m the number of vertices and edges your graph should have apart from the sample there will be only one test case with n = num and m = num the output consists of m lines where the i th contains two integers u i and v i num ≤ u i v i ≤ n this indicates that the vertices u i and v i are connected with an edge in your graph,2,9.7,unununium chemical element atom number num changed rontgenium rg num heavy elements unstable synthesized laboratories hired labs optimize algorithms simulations example simulating complicated chemical reactions important track particles counting connected components graph currently lab python code attachments takes undirected graph outputs number connected components code based everyones favourite data structure union find num looking code notice actually bug code gives correct answers bug cause run inefficiently task construct graph given number vertices edges code runs slowly count times line inside loop visited program score according number input consists line integers n m number vertices edges graph apart sample test case n = num m = num output consists m lines th contains integers u v num ≤ u v ≤ n indicates vertices u v connected edge graph


In [None]:
df_text=df_text[['clean_text','problem_level','problem_score']]

In [None]:
df_text.loc[0]

Unnamed: 0,0
clean_text,unununium chemical element atom number num changed rontgenium rg num heavy elements unstable synthesized laboratories hired labs optimize algorithms simulations example simulating complicated chemical reactions important track particles counting connected components graph currently lab python code attachments takes undirected graph outputs number connected components code based everyones favourite data structure union find num looking code notice actually bug code gives correct answers bug cause run inefficiently task construct graph given number vertices edges code runs slowly count times line inside loop visited program score according number input consists line integers n m number vertices edges graph apart sample test case n = num m = num output consists m lines th contains integers u v num ≤ u v ≤ n indicates vertices u v connected edge graph
problem_level,2
problem_score,9.7


In [None]:
"""min_samples = df_text['problem_level'].value_counts().min()  # 766

# ALL need replace=True for safety
df_easy = df_text[df_text['problem_level']==0].sample(min_samples, replace=True, random_state=2022)
df_medium = df_text[df_text['problem_level']==1].sample(min_samples, replace=True, random_state=2022)
df_hard = df_text[df_text['problem_level']==2].sample(min_samples, replace=True, random_state=2022)

df_balanced = pd.concat([df_easy, df_medium, df_hard], ignore_index=True)
print("Balanced:", df_balanced.shape)
print(df_balanced['problem_level'].value_counts())"""

#Just incase if we think of reducing data instead adding weight

'min_samples = df_text[\'problem_level\'].value_counts().min()  # 766\n\n# ALL need replace=True for safety\ndf_easy = df_text[df_text[\'problem_level\']==0].sample(min_samples, replace=True, random_state=2022)\ndf_medium = df_text[df_text[\'problem_level\']==1].sample(min_samples, replace=True, random_state=2022)\ndf_hard = df_text[df_text[\'problem_level\']==2].sample(min_samples, replace=True, random_state=2022)\n\ndf_balanced = pd.concat([df_easy, df_medium, df_hard], ignore_index=True)\nprint("Balanced:", df_balanced.shape)\nprint(df_balanced[\'problem_level\'].value_counts())'

In [None]:
#df_text

In [None]:
'''from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

corpus_processed = df_text['clean_text'].dropna().tolist()

v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)

print("Vocab size:", len(v.vocabulary_))
print("Sample:", dict(list(v.vocabulary_.items())[:5]))

# FULL matrix
X_matrix = v.transform(corpus_processed).toarray()
print("Matrix:", X_matrix.shape)
df_vectors = pd.DataFrame(X_matrix, columns=v.get_feature_names_out())'''


'''output:
Vocab size: 317218
Sample: {'unununium': 299269, 'uuu': 300785, 'chemical': 47592, 'element': 91426, 'atom': 22839}
Matrix: (4112, 317218)'''


"output:\nVocab size: 317218\nSample: {'unununium': 299269, 'uuu': 300785, 'chemical': 47592, 'element': 91426, 'atom': 22839}\nMatrix: (4112, 317218)"

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=10000,     # limit vocabulary
    min_df=5,               # remove ultra-rare words
    max_df=0.9,             # remove ultra-common words
    ngram_range=(1, 2),     # unigrams + bigrams
    token_pattern=r'(?u)\b[a-zA-Z]{4,}\b'  # words with 3+ letters
)

X = tfidf.fit_transform(df_text['clean_text'])


In [None]:
print(X.shape)

(4112, 10000)


In [None]:
X[:1].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [None]:
feature_names = tfidf.get_feature_names_out()
print(len(feature_names))
print(feature_names[:30])

10000
['abba' 'abcd' 'abilities' 'ability' 'able' 'able build' 'able complete'
 'able determine' 'able enter' 'able find' 'able hold' 'able leave'
 'able pass' 'able reach' 'able solve' 'absence' 'absent' 'absolute'
 'absolute difference' 'absolute error' 'absolute relative'
 'absolute value' 'absolute values' 'absolutely' 'abstract' 'accelerate'
 'acceleration' 'accept' 'acceptable' 'accepted']


In [None]:
'''TECH_WORDS = {
    'graph', 'array', 'string', 'tree', 'node', 'edge', 'vertex',
    'dynamic', 'programming', 'recursion', 'dfs', 'bfs',
    'complexity', 'constraint', 'time', 'memory',
    'input', 'output', 'integer', 'number', 'limit',
    'matrix', 'grid', 'path', 'search'
}'''

"TECH_WORDS = {\n    'graph', 'array', 'string', 'tree', 'node', 'edge', 'vertex',\n    'dynamic', 'programming', 'recursion', 'dfs', 'bfs',\n    'complexity', 'constraint', 'time', 'memory',\n    'input', 'output', 'integer', 'number', 'limit',\n    'matrix', 'grid', 'path', 'search'\n}"

In [None]:
X[0].count_nonzero()


97

In [None]:
df_enhanced = pd.concat([df_text[['problem_level']], features], axis=1)

In [None]:
df_enhanced.loc[0]

Unnamed: 0,0
problem_level,2.0
text_len,1503.0
word_count,286.0
math_count,2.0
math_density,0.001331
algo_mentions,8.0
constraints,2.0
complexity,0.0


#**MODELS**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Your data (X from TF-IDF, y = problem_level)
y = df_text['problem_level']  # 0=easy,1=medium,2=hard

print("Class distribution:")
print(y.value_counts().sort_index())

# Split (stratified = keeps imbalance ratios)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2022, stratify=y
)

# BEST FOR IMBALANCE: RandomForest + balanced weights
model = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',  # Fixes imbalance automatically!
    random_state=2022,
    n_jobs=-1
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\n Results:")
print(classification_report(y_test, y_pred))

'''# Confusion Matrix (visual)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.show()'''


Class distribution:
problem_level
0     766
1    1405
2    1941
Name: count, dtype: int64

 Results:
              precision    recall  f1-score   support

           0       0.49      0.33      0.39       153
           1       0.40      0.17      0.24       281
           2       0.53      0.82      0.65       389

    accuracy                           0.51       823
   macro avg       0.47      0.44      0.43       823
weighted avg       0.48      0.51      0.46       823



"# Confusion Matrix (visual)\ncm = confusion_matrix(y_test, y_pred)\nsns.heatmap(cm, annot=True, fmt='d', cmap='Blues')\nplt.show()"

In [None]:
# 2. XGBoost (state-of-the-art)
from xgboost import XGBClassifier
xgb = XGBClassifier(scale_pos_weight=2, random_state=2022)  # Boost minority
xgb.fit(X_train, y_train)
print("XGBoost:", classification_report(y_test, xgb.predict(X_test)))

# 3. Logistic Regression (TF-IDF loves it)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=2022)
lr.fit(X_train, y_train)
print("Logistic:", classification_report(y_test, lr.predict(X_test)))


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost:               precision    recall  f1-score   support

           0       0.45      0.33      0.38       153
           1       0.43      0.32      0.37       281
           2       0.58      0.75      0.65       389

    accuracy                           0.52       823
   macro avg       0.49      0.47      0.47       823
weighted avg       0.50      0.52      0.50       823

Logistic:               precision    recall  f1-score   support

           0       0.42      0.47      0.44       153
           1       0.38      0.38      0.38       281
           2       0.55      0.52      0.53       389

    accuracy                           0.46       823
   macro avg       0.45      0.46      0.45       823
weighted avg       0.47      0.46      0.47       823



In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Perfect for TF-IDF sparse matrices
nb = MultinomialNB(alpha=0.1)  # Laplace smoothing
nb.fit(X_train, y_train)

print("Naive Bayes:")
print(classification_report(y_test, nb.predict(X_test)))

# With balanced prior (helps imbalance)
nb_bal = MultinomialNB(alpha=0.1, fit_prior=False, class_prior=[0.3, 0.3, 0.4])
nb_bal.fit(X_train, y_train)
print("\nNB Balanced Prior:")
print(classification_report(y_test, nb_bal.predict(X_test)))


Naive Bayes:
              precision    recall  f1-score   support

           0       0.50      0.26      0.34       153
           1       0.39      0.32      0.35       281
           2       0.53      0.70      0.60       389

    accuracy                           0.49       823
   macro avg       0.47      0.43      0.43       823
weighted avg       0.48      0.49      0.47       823


NB Balanced Prior:
              precision    recall  f1-score   support

           0       0.41      0.44      0.43       153
           1       0.41      0.31      0.35       281
           2       0.56      0.65      0.60       389

    accuracy                           0.49       823
   macro avg       0.46      0.46      0.46       823
weighted avg       0.48      0.49      0.48       823



In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=2022)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)  # Your original X_train

nb_sm = MultinomialNB(alpha=0.1)
nb_sm.fit(X_train_sm, y_train_sm)
print("SMOTE + NB:")
print(classification_report(y_test, nb_sm.predict(X_test)))


SMOTE + NB:
              precision    recall  f1-score   support

           0       0.42      0.42      0.42       153
           1       0.39      0.37      0.38       281
           2       0.56      0.59      0.57       389

    accuracy                           0.48       823
   macro avg       0.46      0.46      0.46       823
weighted avg       0.48      0.48      0.48       823



In [None]:
from sklearn.naive_bayes import MultinomialNB

# BETTER TF-IDF (key fix)
tfidf_opt = TfidfVectorizer(
    max_features=15000,      # Critical: reduce noise
    min_df=5, max_df=0.8,   # Clean vocabulary
    ngram_range=(1,3),      # Trigrams catch "dp solution"
    sublinear_tf=True,      # Log scaling
)

X_opt = tfidf_opt.fit_transform(df_text['clean_text'])
X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(
    X_opt, y, test_size=0.2, random_state=2022, stratify=y
)

nb_opt = MultinomialNB(alpha=0.5)
nb_opt.fit(X_train_opt, y_train_opt)
print(" Optimized NB:")
print(classification_report(y_test_opt, nb_opt.predict(X_test_opt)))


 Optimized NB:
              precision    recall  f1-score   support

           0       0.53      0.10      0.17       153
           1       0.46      0.18      0.26       281
           2       0.51      0.89      0.65       389

    accuracy                           0.50       823
   macro avg       0.50      0.39      0.36       823
weighted avg       0.50      0.50      0.43       823



In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, chi2

# Reduce to top 5K features
selector = SelectKBest(chi2, k=5000)
X_train_chi, X_test_chi = selector.fit_transform(X_train, y_train), selector.transform(X_test)

svc = LinearSVC(class_weight='balanced', random_state=2022)
svc.fit(X_train_chi, y_train)
print("LinearSVC + Chi2:")
print(classification_report(y_test, svc.predict(X_test_chi)))


LinearSVC + Chi2:
              precision    recall  f1-score   support

           0       0.42      0.40      0.41       153
           1       0.40      0.40      0.40       281
           2       0.55      0.56      0.55       389

    accuracy                           0.47       823
   macro avg       0.46      0.45      0.45       823
weighted avg       0.47      0.47      0.47       823



In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# SMOTE on training data only
smote = SMOTE(random_state=2022, k_neighbors=3)  # k=3 for small classes
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)  # Your (2,3)-gram X_train

print("SMOTE balanced train:", pd.Series(y_train_sm).value_counts().sort_index())

# RF with BOTH protections
rf_smote = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',  # Extra protection
    max_depth=15,            # Prevent overfit
    random_state=2022,
    n_jobs=-1
)

rf_smote.fit(X_train_sm, y_train_sm)
y_pred_sm = rf_smote.predict(X_test)

print("\n🚀 SMOTE + RF Balanced:")
print(classification_report(y_test, y_pred_sm))


SMOTE balanced train: problem_level
0    1552
1    1552
2    1552
Name: count, dtype: int64

🚀 SMOTE + RF Balanced:
              precision    recall  f1-score   support

           0       0.42      0.44      0.43       153
           1       0.38      0.16      0.22       281
           2       0.54      0.76      0.63       389

    accuracy                           0.50       823
   macro avg       0.45      0.45      0.43       823
weighted avg       0.46      0.50      0.46       823



#**MODELS-REGRESSOR**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

# Your score column (1-10)
y_score = df_text['problem_score']  # ← Confirm column name!
print("Score stats:")
print(y_score.describe())
print("Score distribution:\n", y_score.value_counts().sort_index())

# Split
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y_score, test_size=0.2, random_state=2022  # Your TF-IDF X (2,3)
)

'''# 1. RandomForest (best start)
rf_score = RandomForestRegressor(n_estimators=300, random_state=2022)
rf_score.fit(X_train_s, y_train_s)
score_pred = rf_score.predict(X_test_s)

print("\n🎯 RandomForest 1-10 Score:")
print("R²:", r2_score(y_test_s, score_pred).round(3))
print("MAE:", mean_absolute_error(y_test_s, score_pred).round(2))  # ~0.8-1.2 expected
print("RMSE:", np.sqrt(mean_squared_error(y_test_s, score_pred)).round(2))

# 2. Ridge Regression
ridge_score = Ridge(alpha=10)
ridge_score.fit(X_train_s, y_train_s)
print("\nRidge:")
print("MAE:", mean_absolute_error(y_test_s, ridge_score.predict(X_test_s)).round(2))

# Sample predictions
print("\nSample predictions:")
for i in range(3):
    print(f"True: {y_test_s.iloc[i]:.1f}, Pred: {score_pred[i]:.1f}")'''


Score stats:
count    4112.000000
mean        5.114689
std         2.177770
min         1.100000
25%         3.300000
50%         5.200000
75%         6.900000
max         9.700000
Name: problem_score, dtype: float64
Score distribution:
 problem_score
1.1     9
1.2    21
1.3    29
1.4    36
1.5    64
       ..
9.3    14
9.4    13
9.5     7
9.6     5
9.7     2
Name: count, Length: 87, dtype: int64


'# 1. RandomForest (best start)\nrf_score = RandomForestRegressor(n_estimators=300, random_state=2022)\nrf_score.fit(X_train_s, y_train_s)\nscore_pred = rf_score.predict(X_test_s)\n\nprint("\n🎯 RandomForest 1-10 Score:")\nprint("R²:", r2_score(y_test_s, score_pred).round(3))\nprint("MAE:", mean_absolute_error(y_test_s, score_pred).round(2))  # ~0.8-1.2 expected\nprint("RMSE:", np.sqrt(mean_squared_error(y_test_s, score_pred)).round(2))\n\n# 2. Ridge Regression\nridge_score = Ridge(alpha=10)\nridge_score.fit(X_train_s, y_train_s)\nprint("\nRidge:")\nprint("MAE:", mean_absolute_error(y_test_s, ridge_score.predict(X_test_s)).round(2))\n\n# Sample predictions\nprint("\nSample predictions:")\nfor i in range(3):\n    print(f"True: {y_test_s.iloc[i]:.1f}, Pred: {score_pred[i]:.1f}")'

In [None]:
# 2. Ridge Regression
ridge_score = Ridge(alpha=10)
ridge_score.fit(X_train_s, y_train_s)
print("\nRidge:")
print("MAE:", mean_absolute_error(y_test_s, ridge_score.predict(X_test_s)))

# Sample predictions
print("\nSample predictions:")
for i in range(3):
    print(f"True: {y_test_s.iloc[i]:.1f}, Pred: {score_pred[i]:.1f}")


Ridge:
MAE: 1.772479240173549

Sample predictions:
True: 6.4, Pred: 5.4
True: 5.6, Pred: 6.0
True: 4.7, Pred: 6.0


In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.feature_extraction.text import TfidfVectorizer

# Better TF-IDF for regression
tfidf_reg = TfidfVectorizer(
    max_features=8000,      # Reduce noise
    min_df=3, ngram_range=(1,3),
    sublinear_tf=True
)

X_reg = tfidf_reg.fit_transform(df_text['clean_text'])
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_score, test_size=0.2, random_state=2022
)

# Ridge = TF-IDF king for regression
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train_reg, y_train_reg)
score_ridge = ridge_reg.predict(X_test_reg)

print("Ridge Regressor:")
print("R²:", round(r2_score(y_test_reg, score_ridge), 3))
print("MAE:", round(mean_absolute_error(y_test_reg, score_ridge), 2))


Ridge Regressor:
R²: 0.117
MAE: 1.7


In [None]:
from xgboost import XGBRegressor

xgb_reg = XGBRegressor(n_estimators=300, random_state=2022)
xgb_reg.fit(X_train_reg, y_train_reg)  # Use optimized X
score_xgb = xgb_reg.predict(X_test_reg)

print("XGBoost Regressor:")
print("R²:", round(r2_score(y_test_reg, score_xgb), 3))
print("MAE:", round(mean_absolute_error(y_test_reg, score_xgb), 2))


XGBoost Regressor:
R²: -0.019
MAE: 1.81


In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

selector = SelectKBest(f_regression, k=3000)
X_train_sel = selector.fit_transform(X_train_reg, y_train_reg)
X_test_sel = selector.transform(X_test_reg)

ridge_sel = Ridge()
ridge_sel.fit(X_train_sel, y_train_reg)
print("Ridge + Top 3K features:")
print("R²:", round(r2_score(y_test_reg, ridge_sel.predict(X_test_sel)), 3))


Ridge + Top 3K features:
R²: 0.125
