In [10]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/train_data_With_features.csv")
test_data = pd.read_csv("data/test_data_with_features.csv")

In [11]:
train_students = pd.read_csv("processed_train_student.csv")
# Rename uid to id to match train_data
train_students = train_students.rename(columns={"UID": "id"})
# Merge train data with student data
merged_df = pd.merge(train_data, train_students, on="id", how="left")
train_data = merged_df.copy()

In [12]:
train_data

Unnamed: 0,id,NaN_count,answers,char_count,word_count,java_keyword_count,method_count,class_count,comment_count,MidtermClass
0,7214465,2,7214465 recursion 1) collatz problemi. bu kısı...,33721,4434,622,229,47,21,7
1,9830856,2,9830856 recursion 1) collatz problemi. bu kısı...,36012,4558,738,241,53,16,9
2,8435496,0,8435496 recursion 1) collatz problemi. bu kısı...,38923,5188,757,243,73,27,16
3,8783086,0,8783086 recursion 1) collatz problemi. bu kısı...,35562,4803,658,224,63,16,10
4,4731282,0,4731282 recursion 1) collatz problemi. bu kısı...,40458,5389,754,239,69,18,10
5,6202193,0,6202193 recursion 1) collatz problemi. bu kısı...,39276,4854,696,231,70,26,18
6,7817361,1,7817361 recursion 1) collatz problemi. bu kısı...,30958,4020,575,205,65,18,11
7,6897869,0,6897869 recursion 1) collatz problemi. bu kısı...,40001,5427,767,232,80,24,17
8,1764665,0,1764665 recursion 1) collatz problemi. bu kısı...,40247,5302,716,241,73,27,11
9,6275288,2,6275288 müzik çalar simülasyonu bil 211 - labo...,34264,4439,619,203,56,25,11


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # adjust max_features if needed
tfidf_embeddings = tfidf_vectorizer.fit_transform(train_data["answers"])

In [17]:
hand_features = ['char_count', 'word_count', 'java_keyword_count', 
                 'method_count', 'class_count', 
                 'NaN_count', 'comment_count']

X_struct = train_data[hand_features].fillna(0).values 

In [15]:
from scipy.sparse import hstack

X_tfidf = hstack([tfidf_embeddings, X_struct])

In [24]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

# Load pretrained Sentence-BERT
bert_model_name = "sentence-transformers/all-MiniLM-L6-v2"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

def get_bert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = bert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden_size)
            # Average pooling
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

bert_embeddings = get_bert_embedding(train_data["answers"].tolist())

bert_array = np.array(bert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct  # already dense

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Now safely combine them
X_combined_bert = np.hstack([bert_array, X_struct_scaled])

100%|██████████| 44/44 [00:02<00:00, 19.70it/s]
100%|██████████| 44/44 [00:02<00:00, 19.70it/s]


In [23]:
codebert_model_name = "microsoft/codebert-base"
codebert_tokenizer = AutoTokenizer.from_pretrained(codebert_model_name)
codebert_model = AutoModel.from_pretrained(codebert_model_name)

def get_codebert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = codebert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = codebert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

codebert_embeddings = get_codebert_embedding(train_data["answers"].tolist())

codebert_array = np.array(codebert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

X_combined_codebert = np.hstack([codebert_array, X_struct_scaled])

100%|██████████| 44/44 [00:09<00:00,  4.79it/s]
100%|██████████| 44/44 [00:09<00:00,  4.79it/s]


In [22]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

tokenized_texts = train_data["answers"].apply(simple_preprocess).tolist()
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4, sg=1)

def average_vector(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)
    
word2vec_array = np.array([average_vector(tokens, w2v_model, 100) for tokens in tokenized_texts])
    
X_struct_dense = X_struct.toarray() if hasattr(X_struct, "toarray") else X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Combine
X_combined_w2v = np.hstack([word2vec_array, X_struct_scaled])

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

embeddings = {
    "Word2Vec" : X_combined_w2v,
    "TF-IDF": X_tfidf,
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
}

y = train_data["MidtermClass"].values  # Assuming 'score' is the target variable

for name, X in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} -> MSE: {mse:.4f}, R²: {r2:.4f}")

Word2Vec -> MSE: 26.0537, R²: -0.2774
TF-IDF -> MSE: 15.9148, R²: 0.2197
BERT -> MSE: 33.1417, R²: -0.6250
CodeBERT -> MSE: 39.5345, R²: -0.9384


In [27]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "TF-IDF": X_tfidf,
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
    "Word2Vec": X_combined_w2v
}

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["MidtermClass"].values

# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)

        print(f"{model_name:<20} | MSE: {mse:.3f}")


=== Embedding: TF-IDF ===
LinearRegression     | MSE: 15.915
Ridge                | MSE: 21.056
Lasso                | MSE: 20.376
ElasticNet           | MSE: 20.749
DecisionTree         | MSE: 42.667
RandomForest         | MSE: 18.773
RandomForest         | MSE: 18.773
GradientBoosting     | MSE: 29.275
SVR                  | MSE: 18.835
KNN                  | MSE: 14.311
MLP                  | MSE: 201.265

=== Embedding: BERT ===
LinearRegression     | MSE: 33.142
Ridge                | MSE: 19.586
Lasso                | MSE: 17.801
ElasticNet           | MSE: 17.311
DecisionTree         | MSE: 31.111
GradientBoosting     | MSE: 29.275
SVR                  | MSE: 18.835
KNN                  | MSE: 14.311
MLP                  | MSE: 201.265

=== Embedding: BERT ===
LinearRegression     | MSE: 33.142
Ridge                | MSE: 19.586
Lasso                | MSE: 17.801
ElasticNet           | MSE: 17.311
DecisionTree         | MSE: 31.111
RandomForest         | MSE: 19.025
RandomFores

[WinError 2] Sistem belirtilen dosyayı bulamıyor
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\subprocess.py", line 1436, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


RandomForest         | MSE: 19.764
GradientBoosting     | MSE: 15.920
SVR                  | MSE: 19.827
KNN                  | MSE: 13.511
MLP                  | MSE: 18.393

=== Embedding: Word2Vec ===
LinearRegression     | MSE: 26.054
Ridge                | MSE: 19.382
Lasso                | MSE: 17.801
ElasticNet           | MSE: 17.311
DecisionTree         | MSE: 12.333
RandomForest         | MSE: 13.361
GradientBoosting     | MSE: 15.920
SVR                  | MSE: 19.827
KNN                  | MSE: 13.511
MLP                  | MSE: 18.393

=== Embedding: Word2Vec ===
LinearRegression     | MSE: 26.054
Ridge                | MSE: 19.382
Lasso                | MSE: 17.801
ElasticNet           | MSE: 17.311
DecisionTree         | MSE: 12.333
RandomForest         | MSE: 13.361
GradientBoosting     | MSE: 16.147
SVR                  | MSE: 17.964
KNN                  | MSE: 11.507
MLP                  | MSE: 15.407
GradientBoosting     | MSE: 16.147
SVR                  | MSE: 17.9

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

scaler = StandardScaler()

mlp = MLPRegressor(random_state=42)

pipeline = Pipeline([
    ('scaler', scaler),
    ('mlp', mlp)
])

param_grid = {
    'mlp__hidden_layer_sizes': [(100,), (50, 50), (100, 50), (64, 64, 32)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.0001, 0.001, 0.01],  # L2 penalty
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__max_iter': [1000, 2000]
}

for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best parameters:", grid.best_params_)
    print("Best score (negative MSE):", grid.best_score_)

    best_model = grid.best_estimator_

    from sklearn.metrics import mean_squared_error
    y_pred = best_model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    print("Validation MSE:", mse)


=== Embedding: TF-IDF ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


ValueError: 
All the 480 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\pipeline.py", line 588, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\pipeline.py", line 1551, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\preprocessing\_data.py", line 894, in fit
    return self.partial_fit(X, y, sample_weight)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Talha\miniconda3\envs\bitirme\lib\site-packages\sklearn\preprocessing\_data.py", line 959, in partial_fit
    raise ValueError(
ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.


In [31]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
word2vec_array = scaler.fit_transform(X_combined_w2v)
bert_array = scaler.fit_transform(X_combined_bert)
codebert_array = scaler.fit_transform(X_combined_codebert)
#tfidf_array = scaler.fit_transform(X_tfidf)

# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "BERT": bert_array,
    "CodeBERT": codebert_array,
    "Word2Vec": word2vec_array
}

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["MidtermClass"].values

# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)

        print(f"{model_name:<20} | MSE: {mse:.3f}")


=== Embedding: BERT ===
LinearRegression     | MSE: 26.590
Ridge                | MSE: 23.606
Lasso                | MSE: 17.692
ElasticNet           | MSE: 17.006
DecisionTree         | MSE: 42.778
RandomForest         | MSE: 20.752
RandomForest         | MSE: 20.752
GradientBoosting     | MSE: 23.058
SVR                  | MSE: 19.766
KNN                  | MSE: 30.427
GradientBoosting     | MSE: 23.058
SVR                  | MSE: 19.766
KNN                  | MSE: 30.427
MLP                  | MSE: 27.186

=== Embedding: CodeBERT ===
LinearRegression     | MSE: 35.382
Ridge                | MSE: 34.055
Lasso                | MSE: 17.335
ElasticNet           | MSE: 19.361
DecisionTree         | MSE: 23.444
MLP                  | MSE: 27.186

=== Embedding: CodeBERT ===
LinearRegression     | MSE: 35.382
Ridge                | MSE: 34.055
Lasso                | MSE: 17.335
ElasticNet           | MSE: 19.361
DecisionTree         | MSE: 23.444
RandomForest         | MSE: 22.013
RandomF