# **FEATURE EXTRACTION**

**SUMMARY**
1. Extract features that help the model judge whether a problem is easy, medium, or tough.

In [5]:
import pandas as pd
# Location of the cleaned problem set, read as JSON Lines (lines=True).
# Kept in one named constant so the path can be changed in a single place.
DATA_PATH = "/content/problems_data_cleaned.jsonl"
df = pd.read_json(DATA_PATH, lines=True)

In [12]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,text,problem_level,problem_score
0,unununium was the name of the chemical eleme...,2,9.7
1,a number of eccentrics from central new york h...,2,9.7


In [6]:
import re
import pickle
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack


In [7]:

def has_pattern(text, pattern_list):
    """Return 1 if at least one entry of ``pattern_list`` occurs in ``text``, else 0.

    Membership is tested with the ``in`` operator, so for string inputs every
    pattern is matched as a plain substring. An empty ``pattern_list`` yields 0.
    """
    for candidate in pattern_list:
        if candidate in text:
            # Stop at the first hit; later patterns cannot change the answer.
            return 1
    return 0

In [10]:
import pandas as pd
import numpy as np
import re

def create_feature_table(df, text_col='text', topics=None):
    """Build a table of hand-crafted numeric features from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to describe.
    text_col : str, default 'text'
        Name of the column in ``df`` that contains the text. Every value is
        passed through ``str()`` first, so non-string entries are measured by
        their string representation.
    topics : dict[str, list[str]] or None, default None
        Mapping ``topic name -> keywords``. One ``topic_<name>`` column is
        produced per entry. ``None`` selects the built-in graph / math /
        strings / optimization keyword lists.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with the columns ``word_count``, ``char_count``,
        ``avg_word_len``, ``unique_word_count``, ``lexical_richness`` and
        ``num_density``, followed by one ``topic_<name>`` column per topic.
    """
    if topics is None:
        topics = {
            'graph': ['graph', 'tree', 'node', 'edge', 'path', 'connected'],
            'math': ['prime', 'modulo', 'divisor', 'probability', 'geometry'],
            'strings': ['string', 'substring', 'suffix', 'prefix', 'palindrome'],
            'optimization': ['minimum', 'maximum', 'optimize', 'optimal', 'shortest']
        }

    # Convert and tokenise every row exactly once; all features below reuse
    # these two lists instead of calling str()/split() again per feature.
    texts = [str(value) for value in df[text_col]]
    token_lists = [text.split() for text in texts]

    features = pd.DataFrame(index=df.index)

    # 1. Basic length features
    features['word_count'] = [len(tokens) for tokens in token_lists]
    features['char_count'] = [len(text) for text in texts]
    # Rows without any token get 0 rather than the NaN np.mean([]) would give.
    features['avg_word_len'] = [
        np.mean([len(token) for token in tokens]) if tokens else 0
        for tokens in token_lists
    ]

    # 2. Complexity / diversity features
    features['unique_word_count'] = [len(set(tokens)) for tokens in token_lists]
    # The +1 in the denominator avoids division by zero for empty texts.
    features['lexical_richness'] = features['unique_word_count'] / (features['word_count'] + 1)

    # 3. Specific token counts
    # Number of tokens equal to the placeholder 'num'. Despite the column
    # name this is a raw count, not a ratio.
    features['num_density'] = [tokens.count('num') for tokens in token_lists]

    # 4. Domain keyword counts, one column per topic.
    # Matching is substring-based (str.count), so a keyword is also counted
    # inside longer words (e.g. 'edge' in 'knowledge'), and 'substring'
    # additionally counts as 'string'.
    for topic, keywords in topics.items():
        features[f'topic_{topic}'] = [
            sum(text.count(keyword) for keyword in keywords) for text in texts
        ]

    return features

# Build the hand-crafted feature table for every row of `df`.
features_df = create_feature_table(df)

# Save for later use. index=False means the index is not written to the file.
# (A `features_df.head()` call used to sit before this line; it displayed
# nothing because only a cell's last expression is rendered.)
features_df.to_csv('metadata_features.csv', index=False)

In [11]:
# Preview the first two rows of the engineered feature table.
features_df.head(2)

Unnamed: 0,word_count,char_count,avg_word_len,unique_word_count,lexical_richness,num_density,topic_graph,topic_math,topic_strings,topic_optimization
0,286,1503,4.216783,147,0.512195,6,11,0,0,1
1,247,1266,4.064777,135,0.544355,9,0,0,0,2


In [13]:
# TF-IDF representation of the problem text: unigrams and bigrams
# (ngram_range), dropping terms that occur in fewer than 5 documents (min_df)
# and keeping at most 5000 terms (max_features).
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5
)

# NOTE(review): the vectorizer is fitted on every row of `df`. If a held-out
# split is made later, its rows have already influenced the vocabulary and
# IDF weights — confirm this is intended.
X_text = tfidf.fit_transform(df["text"])


In [14]:
# Inspect the vocabulary learned by the fitted vectorizer: its size and the
# first 30 terms.
feature_names = tfidf.get_feature_names_out()
print(len(feature_names))
print(feature_names[:30])

5000
['ab' 'ability' 'able' 'able to' 'about' 'about the' 'above' 'above the'
 'absolute' 'absolute error' 'absolute or' 'absolute value' 'accepted'
 'access' 'access to' 'according' 'according to' 'account' 'accurate'
 'accurate to' 'achieve' 'achieved' 'acm' 'across' 'across the' 'action'
 'activities' 'actual' 'actually' 'add']


In [15]:

# Standardise the hand-crafted feature columns before combining them with the
# TF-IDF matrix.
# NOTE(review): the scaler is fitted on every row of `features_df`; if a
# held-out split is made later, refit on the training rows only — confirm.
scaler = StandardScaler()
X_struct = scaler.fit_transform(features_df)


In [16]:
# Column-wise concatenation of the TF-IDF matrix and the scaled feature block.
# CSR is requested explicitly: when no format is given scipy picks the result
# format itself (typically COO for mixed blocks), and COO does not support
# row indexing or slicing.
X_final = hstack([X_text, X_struct], format="csr")

# Target vector: the `problem_level` column of the loaded data.
y = df["problem_level"]


In [17]:
# Report the shape of each feature block and of the combined matrix.
for _label, _block in (
    ("TF-IDF shape:", X_text),
    ("Feature table shape:", X_struct),
    ("Final X shape:", X_final),
):
    print(_label, _block.shape)


TF-IDF shape: (3899, 5000)
Feature table shape: (3899, 10)
Final X shape: (3899, 5010)


In [18]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,problem_level,problem_score
0,unununium was the name of the chemical eleme...,2,9.7
1,a number of eccentrics from central new york h...,2,9.7
2,zofka is bending a copper wire she starts with...,2,9.6
3,your dog spot is let loose in the park well re...,2,9.6
4,three rival gangs of bandits the marauders the...,2,9.5


# **REMOVE STOP WORDS AND RE-APPLY TF-IDF**
DO NOT RUN THIS SUBSECTION.
IT ONLY CHECKS WHAT THE ACCURACY IS WHEN STOP WORDS ARE REMOVED.

In [19]:
# Setup commands kept inside a string literal so they are NOT executed.
# Remove the surrounding quotes to install spaCy and download its small
# English model.
'''# Install & download
!pip install spacy
!python -m spacy download en_core_web_sm'''

'# Install & download\n!pip install spacy\n!python -m spacy download en_core_web_sm'

In [None]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
#to remove stop words like: to for the

def preprocess(text, nlp_pipeline=None):
    """Return ``text`` with stop-word tokens removed.

    Parameters
    ----------
    text : str
        Raw text to clean.
    nlp_pipeline : callable, optional
        Callable that turns ``text`` into an iterable of tokens exposing
        ``.text`` and ``.is_stop``. When omitted, the module-level ``nlp``
        object is used; it must exist by the time this function is called.

    Returns
    -------
    str
        The texts of the remaining tokens, joined by single spaces.
    """
    if nlp_pipeline is None:
        # Resolved at call time, so the function may be defined before the
        # model is loaded.
        nlp_pipeline = nlp

    kept_tokens = [token.text for token in nlp_pipeline(text) if not token.is_stop]
    return " ".join(kept_tokens)

In [None]:
# Load spaCy's "en_core_web_sm" pipeline; preprocess() reads this
# module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [None]:
# `df_text` was read here without ever being defined, and the result was
# written to `df` although the following cells read `df_text["clean_text"]`.
# Work on a copy so the `df` used by the main pipeline stays untouched.
df_text = df.copy()
df_text["clean_text"] = df_text["text"].apply(preprocess)

In [None]:
df_text.head(1)
# Shown to compare the original `text` with the stop-word-free `clean_text`.

Unnamed: 0,text,problem_level,problem_score,clean_text
0,unununium was the name of the chemical eleme...,2,9.7,unununium chemical element atom number nu...


In [None]:
# Narrow df_text to the cleaned text plus problem_level and problem_score.
df_text = df_text.loc[:, ['clean_text', 'problem_level', 'problem_score']]

In [None]:
# TF-IDF after removing stop words, with the same settings as the vectorizer
# fitted on the raw text.
# NOTE: this rebinds `tfidf` and `X_text`, replacing the objects fitted on
# df["text"] earlier in the notebook.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=5
)

X_text = tfidf.fit_transform(df_text["clean_text"])


In [None]:
# Inspect the vocabulary learned from the stop-word-free text: its size and
# the first 30 terms.
feature_names = tfidf.get_feature_names_out()
print(len(feature_names))
print(feature_names[:30])

5000
['ab' 'ability' 'able' 'absolute' 'absolute error' 'absolute relative'
 'absolute value' 'absolutely' 'acceleration' 'accept' 'acceptable'
 'accepted' 'access' 'accidentally' 'accomplish' 'according'
 'according following' 'account' 'accurate' 'accurate absolute' 'achieve'
 'achieved' 'acm' 'acm icpc' 'act' 'action' 'actions' 'activated' 'active'
 'activities']


In [None]:
"""min_samples = df_text['problem_level'].value_counts().min()  # 766

# ALL need replace=True for safety
df_easy = df_text[df_text['problem_level']==0].sample(min_samples, replace=True, random_state=2022)
df_medium = df_text[df_text['problem_level']==1].sample(min_samples, replace=True, random_state=2022)
df_hard = df_text[df_text['problem_level']==2].sample(min_samples, replace=True, random_state=2022)

df_balanced = pd.concat([df_easy, df_medium, df_hard], ignore_index=True)
print("Balanced:", df_balanced.shape)
print(df_balanced['problem_level'].value_counts())"""

# Kept just in case we decide to reduce the data instead of adding weights.

'min_samples = df_text[\'problem_level\'].value_counts().min()  # 766\n\n# ALL need replace=True for safety\ndf_easy = df_text[df_text[\'problem_level\']==0].sample(min_samples, replace=True, random_state=2022)\ndf_medium = df_text[df_text[\'problem_level\']==1].sample(min_samples, replace=True, random_state=2022)\ndf_hard = df_text[df_text[\'problem_level\']==2].sample(min_samples, replace=True, random_state=2022)\n\ndf_balanced = pd.concat([df_easy, df_medium, df_hard], ignore_index=True)\nprint("Balanced:", df_balanced.shape)\nprint(df_balanced[\'problem_level\'].value_counts())'

# **CONTINUE**