In [48]:
import pandas as pd
import numpy as np

train_data = pd.read_csv("data/train_data_With_features.csv")
test_data = pd.read_csv("data/test_data_with_features.csv")

In [49]:
train_students = pd.read_csv("processed_train_student.csv")
# Rename uid to id to match train_data
train_students = train_students.rename(columns={"UID": "id"})
# Merge train data with student data
merged_df = pd.merge(train_data, train_students, on="id", how="left")
train_data = merged_df.copy()

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # adjust max_features if needed
tfidf_embeddings = tfidf_vectorizer.fit_transform(train_data["answers"])

In [51]:
hand_features = ['char_count', 'word_count', 'java_keyword_count', 
                 'method_count', 'class_count', 
                 'NaN_count', 'comment_count']

X_struct = train_data[hand_features].fillna(0).values 

In [52]:
from sklearn.preprocessing import StandardScaler
ScoreScaler = StandardScaler()

In [53]:
from scipy.sparse import hstack

X_tfidf = hstack([tfidf_embeddings, X_struct])

In [54]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

# Load pretrained Sentence-BERT
bert_model_name = "sentence-transformers/all-MiniLM-L6-v2"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModel.from_pretrained(bert_model_name)

def get_bert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = bert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state  # (1, seq_len, hidden_size)
            # Average pooling
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

bert_embeddings = get_bert_embedding(train_data["answers"].tolist())

bert_array = np.array(bert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct  # already dense

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Now safely combine them
X_combined_bert = np.hstack([bert_array, X_struct_scaled])

100%|██████████| 44/44 [00:04<00:00, 10.62it/s]


In [55]:
codebert_model_name = "microsoft/codebert-base"
codebert_tokenizer = AutoTokenizer.from_pretrained(codebert_model_name)
codebert_model = AutoModel.from_pretrained(codebert_model_name)

def get_codebert_embedding(texts):
    embeddings = []
    for text in tqdm(texts):
        inputs = codebert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = codebert_model(**inputs)
            last_hidden_state = outputs.last_hidden_state
            embedding = last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings.append(embedding)
    return embeddings

codebert_embeddings = get_codebert_embedding(train_data["answers"].tolist())

codebert_array = np.array(codebert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

X_combined_codebert = np.hstack([codebert_array, X_struct_scaled])

100%|██████████| 44/44 [00:10<00:00,  4.13it/s]


In [56]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec

tokenized_texts = train_data["answers"].apply(simple_preprocess).tolist()
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4, sg=1)

def average_vector(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)
    
word2vec_array = np.array([average_vector(tokens, w2v_model, 100) for tokens in tokenized_texts])
    
X_struct_dense = X_struct.toarray() if hasattr(X_struct, "toarray") else X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Combine
X_combined_w2v = np.hstack([word2vec_array, X_struct_scaled])

In [57]:
# Create embeddings for test data
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # adjust max_features if needed
tfidf_embeddings = tfidf_vectorizer.fit_transform(test_data["answers"])

hand_features = ['char_count', 'word_count', 'java_keyword_count', 
                 'method_count', 'class_count', 
                 'NaN_count', 'comment_count']

X_struct = test_data[hand_features].fillna(0).values 

Test_X_tfidf = hstack([tfidf_embeddings, X_struct])

#---------

bert_embeddings = get_bert_embedding(test_data["answers"].tolist())

bert_array = np.array(bert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct  # already dense

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Now safely combine them
Test_X_combined_bert = np.hstack([bert_array, X_struct_scaled])

#---------

codebert_embeddings = get_codebert_embedding(test_data["answers"].tolist())

codebert_array = np.array(codebert_embeddings)

if hasattr(X_struct, "toarray"):
    X_struct_dense = X_struct.toarray()
else:
    X_struct_dense = X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

Test_X_combined_codebert = np.hstack([codebert_array, X_struct_scaled])

#---------
tokenized_texts = test_data["answers"].apply(simple_preprocess).tolist()

word2vec_array = np.array([average_vector(tokens, w2v_model, 100) for tokens in tokenized_texts])
    
X_struct_dense = X_struct.toarray() if hasattr(X_struct, "toarray") else X_struct

scaler = StandardScaler()
X_struct_scaled = scaler.fit_transform(X_struct_dense)

# Combine
Test_X_combined_w2v = np.hstack([word2vec_array, X_struct_scaled])

100%|██████████| 13/13 [00:01<00:00, 10.01it/s]
100%|██████████| 13/13 [00:03<00:00,  4.08it/s]


In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

X_train = X_combined_w2v
y_train = train_data["FinalClass"].values  # Assuming 'FinalClass' is the target variable
X_test = Test_X_combined_w2v
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

array([ 5.18351869, 41.01678813,  7.56765573, 13.01352715,  6.47608599,
       11.85045307, 17.59043877, 16.97506138, 23.07993855, 10.77043512,
       12.18307473,  8.02037458, 11.46845305])

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

embeddings = {
    "Word2Vec" : X_combined_w2v,
    "TF-IDF": X_tfidf,
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
}

y = train_data["FinalClass"].values  # Assuming 'score' is the target variable
#y = ScoreScaler.fit_transform(y.reshape(-1, 1)).flatten()  # Scale the target variable

for name, X in embeddings.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} -> MSE: {mse:.4f}, R²: {r2:.4f}")

Word2Vec -> MSE: 54.2164, R²: -2.7663
TF-IDF -> MSE: 25.9406, R²: -0.8020
BERT -> MSE: 212.6893, R²: -13.7752
CodeBERT -> MSE: 191.9950, R²: -12.3376


In [62]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

"""# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "TF-IDF": X_tfidf,
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
    "Word2Vec": X_combined_w2v
}"""

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["FinalClass"].values

word2vec_array = scaler.fit_transform(X_combined_w2v)
bert_array = scaler.fit_transform(X_combined_bert)
codebert_array = scaler.fit_transform(X_combined_codebert)

embeddings = {
    "BERT": bert_array,
    "CodeBERT": codebert_array,
    "Word2Vec": word2vec_array
}


# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train = X
    y_train = y

    if embed_name == "TF-IDF":
        X_test = Test_X_tfidf
    elif embed_name == "BERT":
        X_test = Test_X_combined_bert
    elif embed_name == "CodeBERT":
        X_test = Test_X_combined_codebert
    elif embed_name == "Word2Vec":
        X_test = Test_X_combined_w2v

    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        #y_pred = ScoreScaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

        # Combine with test data write to CSV
        test_data["FinalClass"] = y_pred
        test_data = test_data[["id","FinalClass"]]
        test_data.to_csv(f"predictions/{embed_name}_{model_name}_predictions_scaled.csv", index=False)




=== Embedding: BERT ===

=== Embedding: CodeBERT ===

=== Embedding: Word2Vec ===


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

scaler = StandardScaler()

mlp = MLPRegressor(random_state=42)

embeddings = {
    "BERT": X_combined_bert,
    "CodeBERT": X_combined_codebert,
    "Word2Vec": X_combined_w2v
}


pipeline = Pipeline([
    ('scaler', scaler),
    ('mlp', mlp)
])

param_grid = {
    'mlp__hidden_layer_sizes': [(100,), (50, 50), (100, 50), (64, 64, 32)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.0001, 0.001, 0.01],  # L2 penalty
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__max_iter': [1000, 2000]
}

for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best parameters:", grid.best_params_)
    print("Best score (negative MSE):", grid.best_score_)

    best_model = grid.best_estimator_

    from sklearn.metrics import mean_squared_error
    y_pred = best_model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    print("Validation MSE:", mse)


=== Embedding: BERT ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (50, 50), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -50.19595597593614
Validation MSE: 30.38662233154869

=== Embedding: CodeBERT ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  arr = np.array(param_list)


Best parameters: {'mlp__activation': 'tanh', 'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (100,), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -37.376400625931446
Validation MSE: 10.122734490111133

=== Embedding: Word2Vec ===
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (64, 64, 32), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 1000, 'mlp__solver': 'adam'}
Best score (negative MSE): -22.60937962132116
Validation MSE: 46.438678025654795


  arr = np.array(param_list)


In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
word2vec_array = scaler.fit_transform(X_combined_w2v)
bert_array = scaler.fit_transform(X_combined_bert)
codebert_array = scaler.fit_transform(X_combined_codebert)
#tfidf_array = scaler.fit_transform(X_tfidf)

# Embedding dictionaries: add your actual numpy arrays here
embeddings = {
    "BERT": bert_array,
    "CodeBERT": codebert_array,
    "Word2Vec": word2vec_array
}

# Regression models to try
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "DecisionTree": DecisionTreeRegressor(),
    "RandomForest": RandomForestRegressor(),
    "GradientBoosting": GradientBoostingRegressor(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "MLP": MLPRegressor(max_iter=10000)
}

# Target values
y = train_data["FinalClass"].values

# Run all combos
for embed_name, X in embeddings.items():
    print(f"\n=== Embedding: {embed_name} ===")
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    for model_name, model in regressors.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)

        print(f"{model_name:<20} | MSE: {mse:.3f}")


=== Embedding: BERT ===
LinearRegression     | MSE: 248.131
Ridge                | MSE: 28.938
Lasso                | MSE: 9.821
ElasticNet           | MSE: 9.109
DecisionTree         | MSE: 22.222
RandomForest         | MSE: 6.364
GradientBoosting     | MSE: 7.903
SVR                  | MSE: 18.959
KNN                  | MSE: 11.938
MLP                  | MSE: 27.103

=== Embedding: CodeBERT ===
LinearRegression     | MSE: 197.692
Ridge                | MSE: 18.696
Lasso                | MSE: 10.109
ElasticNet           | MSE: 7.956
DecisionTree         | MSE: 31.111
RandomForest         | MSE: 6.288
GradientBoosting     | MSE: 8.655
SVR                  | MSE: 18.894
KNN                  | MSE: 11.818
MLP                  | MSE: 8.500

=== Embedding: Word2Vec ===
LinearRegression     | MSE: 38.895
Ridge                | MSE: 18.057
Lasso                | MSE: 5.087
ElasticNet           | MSE: 5.675
DecisionTree         | MSE: 20.889
RandomForest         | MSE: 6.913
GradientBoosting