In [2]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/train_data_With_features.csv")
test_data = pd.read_csv("data/test_data_with_features.csv")

In [3]:
train_students = pd.read_csv("processed_train_student.csv")
# Rename uid to id to match train_data
train_students = train_students.rename(columns={"UID": "id"})
# Merge train data with student data
merged_df = pd.merge(train_data, train_students, on="id", how="left")
train_data = merged_df.copy()

In [None]:
train_data

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # adjust max_features if needed
tfidf_embeddings = tfidf_vectorizer.fit_transform(train_data["answers"])

In [6]:
hand_features = ['char_count', 'word_count', 'java_keyword_count', 
                 'method_count', 'class_count', 
                 'NaN_count', 'comment_count']

X_struct = train_data[hand_features].fillna(0).values 

In [7]:
from scipy.sparse import hstack

X_tfidf = hstack([tfidf_embeddings, X_struct])

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

# Load pretrained Sentence-BERT
bert_model_name = "sentence-transformers/all-MiniLM-L6-v2"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

def get_bert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = bert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden_size)
            # Average pooling
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

bert_embeddings = get_bert_embedding(train_data["answers"].tolist())

bert_array = np.array(bert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct  # already dense

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Now safely combine them
X_combined_bert = np.hstack([bert_array, X_struct_scaled])

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 44/44 [00:04<00:00, 10.64it/s]


In [9]:
codebert_model_name = "microsoft/codebert-base"
codebert_tokenizer = AutoTokenizer.from_pretrained(codebert_model_name)
codebert_model = AutoModel.from_pretrained(codebert_model_name)

def get_codebert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = codebert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = codebert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

codebert_embeddings = get_codebert_embedding(train_data["answers"].tolist())

codebert_array = np.array(codebert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

X_combined_codebert = np.hstack([codebert_array, X_struct_scaled])

100%|██████████| 44/44 [00:10<00:00,  4.17it/s]


In [10]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

tokenized_texts = train_data["answers"].apply(simple_preprocess).tolist()
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4, sg=1)

def average_vector(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)
    
word2vec_array = np.array([average_vector(tokens, w2v_model, 100) for tokens in tokenized_texts])
    
X_struct_dense = X_struct.toarray() if hasattr(X_struct, "toarray") else X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Combine
X_combined_w2v = np.hstack([word2vec_array, X_struct_scaled])

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

embeddings = {
    "Word2Vec" : X_combined_w2v,
    "TF-IDF": X_tfidf,
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
}

y = train_data["FinalClass"].values  # Assuming 'score' is the target variable

for name, X in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} -> MSE: {mse:.4f}, R²: {r2:.4f}")

Word2Vec -> MSE: 45.5493, R²: -2.1642
TF-IDF -> MSE: 25.9406, R²: -0.8020
BERT -> MSE: 212.6893, R²: -13.7752
CodeBERT -> MSE: 191.9950, R²: -12.3376


In [14]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "TF-IDF": X_tfidf,
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
    "Word2Vec": X_combined_w2v
}

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["FinalClass"].values

# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)

        print(f"{model_name:<20} | MSE: {mse:.3f}")


=== Embedding: TF-IDF ===
LinearRegression     | MSE: 25.941
Ridge                | MSE: 12.774
Lasso                | MSE: 12.926
ElasticNet           | MSE: 13.666
DecisionTree         | MSE: 28.333


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


RandomForest         | MSE: 11.616
GradientBoosting     | MSE: 14.271
SVR                  | MSE: 13.326
KNN                  | MSE: 9.538
MLP                  | MSE: 5670594.852

=== Embedding: BERT ===
LinearRegression     | MSE: 212.689
Ridge                | MSE: 8.149
Lasso                | MSE: 9.798
ElasticNet           | MSE: 7.285
DecisionTree         | MSE: 13.444
RandomForest         | MSE: 6.208
GradientBoosting     | MSE: 8.148
SVR                  | MSE: 14.407


[WinError 2] Sistem belirtilen dosyayı bulamıyor
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\subprocess.py", line 1436, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


KNN                  | MSE: 9.693
MLP                  | MSE: 45.602

=== Embedding: CodeBERT ===
LinearRegression     | MSE: 191.995
Ridge                | MSE: 8.111
Lasso                | MSE: 9.798
ElasticNet           | MSE: 7.285
DecisionTree         | MSE: 31.111
RandomForest         | MSE: 6.806
GradientBoosting     | MSE: 9.776
SVR                  | MSE: 17.015
KNN                  | MSE: 9.093
MLP                  | MSE: 13.751

=== Embedding: Word2Vec ===
LinearRegression     | MSE: 45.549
Ridge                | MSE: 7.412
Lasso                | MSE: 9.798
ElasticNet           | MSE: 7.285
DecisionTree         | MSE: 29.000
RandomForest         | MSE: 6.331
GradientBoosting     | MSE: 6.947
SVR                  | MSE: 13.742
KNN                  | MSE: 9.618
MLP                  | MSE: 60.517


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

scaler = StandardScaler()

mlp = MLPRegressor(random_state=42)

embeddings = {
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
    "Word2Vec": X_combined_w2v
}


pipeline = Pipeline([
    ('scaler', scaler),
    ('mlp', mlp)
])

param_grid = {
    'mlp__hidden_layer_sizes': [(100,), (50, 50), (100, 50), (64, 64, 32)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.0001, 0.001, 0.01],  # L2 penalty
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__max_iter': [1000, 2000]
}

for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best parameters:", grid.best_params_)
    print("Best score (negative MSE):", grid.best_score_)

    best_model = grid.best_estimator_

    from sklearn.metrics import mean_squared_error
    y_pred = best_model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    print("Validation MSE:", mse)


=== Embedding: BERT ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -50.19595597593614
Validation MSE: 30.38662233154869

=== Embedding: CodeBERT ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -37.376400625931446
Validation MSE: 10.122734490111133

=== Embedding: Word2Vec ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (64, 64, 32), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -22.60937962132116
Validation MSE: 46.438678025654795


  arr = np.array(param_list)


In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
word2vec_array = scaler.fit_transform(X_combined_w2v)
bert_array = scaler.fit_transform(X_combined_bert)
codebert_array = scaler.fit_transform(X_combined_codebert)
#tfidf_array = scaler.fit_transform(X_tfidf)

# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "BERT": bert_array,
    "CodeBERT": codebert_array,
    "Word2Vec": word2vec_array
}

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["FinalClass"].values

# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)

        print(f"{model_name:<20} | MSE: {mse:.3f}")


=== Embedding: BERT ===
LinearRegression     | MSE: 248.131
Ridge                | MSE: 28.938
Lasso                | MSE: 9.821
ElasticNet           | MSE: 9.109
DecisionTree         | MSE: 22.222
RandomForest         | MSE: 6.364
GradientBoosting     | MSE: 7.903
SVR                  | MSE: 18.959
KNN                  | MSE: 11.938
MLP                  | MSE: 27.103

=== Embedding: CodeBERT ===
LinearRegression     | MSE: 197.692
Ridge                | MSE: 18.696
Lasso                | MSE: 10.109
ElasticNet           | MSE: 7.956
DecisionTree         | MSE: 31.111
RandomForest         | MSE: 6.288
GradientBoosting     | MSE: 8.655
SVR                  | MSE: 18.894
KNN                  | MSE: 11.818
MLP                  | MSE: 8.500

=== Embedding: Word2Vec ===
LinearRegression     | MSE: 38.895
Ridge                | MSE: 18.057
Lasso                | MSE: 5.087
ElasticNet           | MSE: 5.675
DecisionTree         | MSE: 20.889
RandomForest         | MSE: 6.913
GradientBoosting

Now we will get our data ready to test the test set.

In [19]:
test_data

Unnamed: 0,id,NaN_count,answers,char_count,word_count,java_keyword_count,method_count,class_count,comment_count
0,4380745,1,recursion 1) collatz problemi. bu kısımda amac...,212997,24761,4249,990,157,378
1,8190737,4,recursion 1) collatz problemi. bu kısımda amac...,103887,12667,2034,516,131,79
2,8964453,9,müzik çalar simülasyonu bil 211 - laboratuvar ...,105989,13683,2331,569,84,104
3,2384475,2,2384475 recursion 1) collatz problemi. bu kısı...,129912,16263,2756,684,113,139
4,4540765,10,4540765 recursion 1) collatz problemi. bu kısı...,117853,14921,2864,614,81,140
5,6418019,4,6418019 recursion 1) collatz problemi. bu kısı...,130178,15554,2626,598,110,109
6,4114373,2,4114373 recursion 1) collatz problemi. bu kısı...,146646,16779,3163,740,167,60
7,5281670,5,5281670 müzik çalar simülasyonu bil 211 - labo...,100169,12914,1944,527,98,228
8,1280995,1,recursion 1) collatz problemi. bu kısımda amac...,158583,19100,3370,736,189,157
9,2051651,8,recursion 1) collatz problemi. bu kısımda amac...,44223,5549,635,250,56,22


In [24]:
# Create embeddings for test data
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # adjust max_features if needed
tfidf_embeddings = tfidf_vectorizer.fit_transform(test_data["answers"])

hand_features = ['char_count', 'word_count', 'java_keyword_count', 
                 'method_count', 'class_count', 
                 'NaN_count', 'comment_count']

X_struct = test_data[hand_features].fillna(0).values 

Test_X_tfidf = hstack([tfidf_embeddings, X_struct])

#---------

bert_embeddings = get_bert_embedding(test_data["answers"].tolist())

bert_array = np.array(bert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct  # already dense

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Now safely combine them
Test_X_combined_bert = np.hstack([bert_array, X_struct_scaled])

#---------

codebert_embeddings = get_codebert_embedding(test_data["answers"].tolist())

codebert_array = np.array(codebert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

Test_X_combined_codebert = np.hstack([codebert_array, X_struct_scaled])

#---------
tokenized_texts = test_data["answers"].apply(simple_preprocess).tolist()

word2vec_array = np.array([average_vector(tokens, w2v_model, 100) for tokens in tokenized_texts])
    
X_struct_dense = X_struct.toarray() if hasattr(X_struct, "toarray") else X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Combine
Test_X_combined_w2v = np.hstack([word2vec_array, X_struct_scaled])

100%|██████████| 13/13 [00:01<00:00,  9.78it/s]
100%|██████████| 13/13 [00:03<00:00,  4.00it/s]


In [27]:
# Use same TF-IDF vectorizer trained on train_data
Test_tfidf_embeddings = tfidf_vectorizer.transform(test_data["answers"])
Test_X_struct = test_data[hand_features].fillna(0).values

# Only transform structured features (do not re-fit!)
Test_X_struct_dense = Test_X_struct.toarray() if hasattr(Test_X_struct, "toarray") else Test_X_struct
Test_X_struct_scaled = scaler.transform(Test_X_struct_dense)

Test_X_tfidf = hstack([Test_tfidf_embeddings, Test_X_struct_dense])  # No need to scale for sparse

# === BERT ===
test_bert_embeddings = get_bert_embedding(test_data["answers"].tolist())
test_bert_array = np.array(test_bert_embeddings)
Test_X_combined_bert = np.hstack([test_bert_array, Test_X_struct_scaled])

# === CodeBERT ===
test_codebert_embeddings = get_codebert_embedding(test_data["answers"].tolist())
test_codebert_array = np.array(test_codebert_embeddings)
Test_X_combined_codebert = np.hstack([test_codebert_array, Test_X_struct_scaled])

# === Word2Vec ===
def tokenize(text):
    return text.lower().split()
test_tokenized = [tokenize(text) for text in test_data["answers"]]  # replace with your tokenizer
test_word2vec_array = np.array([average_vector(tokens, w2v_model, 100) for tokens in test_tokenized])
Test_X_combined_w2v = np.hstack([test_word2vec_array, Test_X_struct_scaled])


100%|██████████| 13/13 [00:01<00:00,  8.97it/s]
100%|██████████| 13/13 [00:03<00:00,  4.06it/s]


In [28]:
test_embeddings = {
    "Word2Vec": Test_X_combined_w2v,
    "TF-IDF": Test_X_tfidf,
    "BERT": Test_X_combined_bert,
    "CodeBERT": Test_X_combined_codebert
}

# Now we will get our data ready to test the test set.
test_data = test_data.fillna(0)  # Fill NaN values in test data

for embed_name, X in test_embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    # Use the same scaler fitted on training data
    if embed_name == "TF-IDF":
        X_test = X  # TF-IDF is already sparse and doesn't need scaling
    else:
        X_test = scaler.transform(X)  # Transform using the scaler fitted on training data
    
    # Predict using the best model from previous grid search
    y_pred = best_model.predict(X_test)
    
    # Save predictions to a DataFrame
    predictions_df = pd.DataFrame({
        'id': test_data['id'],  # Assuming 'id' is the identifier in test_data
        'FinalClass': y_pred
    })
    
    predictions_df.to_csv(f"predictions/predictions_{embed_name}.csv", index=False)
    print(f"Predictions saved for {embed_name} embedding.")


=== Embedding: Word2Vec ===


ValueError: X has 107 features, but StandardScaler is expecting 7 features as input.