In [33]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/train_dataset.csv")
test_data = pd.read_csv("data/test_dataset.csv")

In [34]:
train_students = pd.read_csv("processed_train_student.csv")
# Rename uid to id to match train_data
train_students = train_students.rename(columns={"UID": "id"})
# Merge train data with student data
merged_df = pd.merge(train_data, train_students, on="id", how="left")
train_data = merged_df.copy()

In [35]:
train_data

Unnamed: 0,id,answers,MidtermClass
0,7214465,recursion 1) collatz problemi. bu kısımda amac...,7
1,9830856,recursion 1) collatz problemi. bu kısımda amac...,9
2,8435496,recursion 1) collatz problemi. bu kısımda amac...,16
3,8783086,recursion 1) collatz problemi. bu kısımda amac...,10
4,4731282,recursion 1) collatz problemi. bu kısımda amac...,10
5,6202193,recursion 1) collatz problemi. bu kısımda amac...,18
6,7817361,recursion 1) collatz problemi. bu kısımda amac...,11
7,6897869,recursion 1) collatz problemi. bu kısımda amac...,17
8,1764665,recursion 1) collatz problemi. bu kısımda amac...,11
9,6275288,müzik çalar simülasyonu bil 211 - laboratuvar ...,11


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # adjust max_features if needed
tfidf_embeddings = tfidf_vectorizer.fit_transform(train_data["answers"])

In [37]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# Load pretrained Sentence-BERT
bert_model_name = "sentence-transformers/all-MiniLM-L6-v2"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

def get_bert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = bert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden_size)
            # Average pooling
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

bert_embeddings = get_bert_embedding(train_data["answers"].tolist())

100%|██████████| 44/44 [00:02<00:00, 19.82it/s]


In [38]:
codebert_model_name = "microsoft/codebert-base"
codebert_tokenizer = AutoTokenizer.from_pretrained(codebert_model_name)
codebert_model = AutoModel.from_pretrained(codebert_model_name)

def get_codebert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = codebert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = codebert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

codebert_embeddings = get_codebert_embedding(train_data["answers"].tolist())

100%|██████████| 44/44 [00:08<00:00,  4.90it/s]


In [39]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

tokenized_texts = train_data["answers"].apply(simple_preprocess).tolist()
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4, sg=1)

def average_vector(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

In [40]:
word2vec_array = np.array([average_vector(tokens, w2v_model, 100) for tokens in tokenized_texts])
tfidf_array = tfidf_embeddings.toarray()
bert_array = np.array(bert_embeddings)
codebert_array = np.array(codebert_embeddings)

In [41]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [42]:
# Example: using CodeBERT
X = codebert_array  # or bert_array or tfidf_array
y = train_data["MidtermClass"].values  # Replace 'grade' with your actual target column

In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [44]:
model = LinearRegression()
model.fit(X_train, y_train)

In [45]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

Mean Squared Error: 19.8578
R^2 Score: 0.0263


In [46]:
embeddings = {
    "Word2Vec" : word2vec_array,
    "TF-IDF": tfidf_array,
    "BERT": bert_array,
    "CodeBERT": codebert_array,
}

for name, X in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} -> MSE: {mse:.4f}, R²: {r2:.4f}")

Word2Vec -> MSE: 14.9960, R²: 0.2647
TF-IDF -> MSE: 11.0951, R²: 0.4560
BERT -> MSE: 19.8578, R²: 0.0263
CodeBERT -> MSE: 19.8578, R²: 0.0263


In [49]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "TF-IDF": tfidf_array,
    "BERT": bert_array,
    "CodeBERT": codebert_array,
    "Word2Vec": word2vec_array
}

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["MidtermClass"].values

# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)

        print(f"{model_name:<20} | MSE: {mse:.3f}")


=== Embedding: TF-IDF ===
LinearRegression     | MSE: 11.095
Ridge                | MSE: 19.675
Lasso                | MSE: 21.141
ElasticNet           | MSE: 21.141
DecisionTree         | MSE: 56.556
RandomForest         | MSE: 20.288
GradientBoosting     | MSE: 27.903
SVR                  | MSE: 20.847
KNN                  | MSE: 11.316
MLP                  | MSE: 11.274

=== Embedding: BERT ===
LinearRegression     | MSE: 19.858
Ridge                | MSE: 20.006
Lasso                | MSE: 21.141
ElasticNet           | MSE: 21.141
DecisionTree         | MSE: 19.858
RandomForest         | MSE: 19.840
GradientBoosting     | MSE: 19.858
SVR                  | MSE: 20.543
KNN                  | MSE: 21.236
MLP                  | MSE: 19.320

=== Embedding: CodeBERT ===
LinearRegression     | MSE: 19.858
Ridge                | MSE: 19.979
Lasso                | MSE: 21.141
ElasticNet           | MSE: 21.141
DecisionTree         | MSE: 19.858
RandomForest         | MSE: 19.982
GradientB

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

scaler = StandardScaler()

mlp = MLPRegressor(random_state=42)

pipeline = Pipeline([
    ('scaler', scaler),
    ('mlp', mlp)
])

param_grid = {
    'mlp__hidden_layer_sizes': [(100,), (50, 50), (100, 50), (64, 64, 32)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.0001, 0.001, 0.01],  # L2 penalty
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__max_iter': [1000, 2000]
}

for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best parameters:", grid.best_params_)
    print("Best score (negative MSE):", grid.best_score_)

    best_model = grid.best_estimator_

    from sklearn.metrics import mean_squared_error
    y_pred = best_model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    print("Validation MSE:", mse)


=== Embedding: TF-IDF ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -19.394350106782078
Validation MSE: 25.602772686098525

=== Embedding: BERT ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (64, 64, 32), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -18.555610752105714
Validation MSE: 19.710506439208984

=== Embedding: CodeBERT ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (64, 64, 32), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -18.015956783294676
Validation MSE: 19.4760684967041

=== Embedding: Word2Vec ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -10.842060995101928
Validation MSE: 21.618019104003906


In [53]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
word2vec_array = scaler.fit_transform(word2vec_array)
bert_array = scaler.fit_transform(bert_array)
codebert_array = scaler.fit_transform(codebert_array)
tfidf_array = scaler.fit_transform(tfidf_array)

# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "TF-IDF": tfidf_array,
    "BERT": bert_array,
    "CodeBERT": codebert_array,
    "Word2Vec": word2vec_array
}

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["MidtermClass"].values

# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)

        print(f"{model_name:<20} | MSE: {mse:.3f}")


=== Embedding: TF-IDF ===
LinearRegression     | MSE: 24.021
Ridge                | MSE: 23.981
Lasso                | MSE: 25.517
ElasticNet           | MSE: 22.880
DecisionTree         | MSE: 58.000
RandomForest         | MSE: 18.751
GradientBoosting     | MSE: 28.997
SVR                  | MSE: 20.711
KNN                  | MSE: 20.551
MLP                  | MSE: 26.009

=== Embedding: BERT ===
LinearRegression     | MSE: 19.858
Ridge                | MSE: 19.859
Lasso                | MSE: 21.141
ElasticNet           | MSE: 21.141
DecisionTree         | MSE: 19.858
RandomForest         | MSE: 19.897
GradientBoosting     | MSE: 19.858
SVR                  | MSE: 20.543
KNN                  | MSE: 21.213
MLP                  | MSE: 18.545

=== Embedding: CodeBERT ===
LinearRegression     | MSE: 19.858
Ridge                | MSE: 19.857
Lasso                | MSE: 21.141
ElasticNet           | MSE: 21.141
DecisionTree         | MSE: 19.858
RandomForest         | MSE: 19.712
GradientB