In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report

In [2]:
tf.keras.backend.clear_session()

In [3]:
df = pd.read_csv('/Users/yelderiny/Projects/Dissertation/Data/processed-data3.csv')
df.head()

Unnamed: 0,pull_requests,size,contributors,age,contributor_xp1,contributor_xp2,contributor_xp3,language_C++,language_Go,language_Java,language_JavaScript,language_Python,language_Swift,language_TypeScript,pr_points1,pr_points2,pr_points3
0,-0.299689,-0.126432,-0.605191,-1.390139,-0.721174,-0.710998,-0.684408,0,0,1,0,0,0,0,2.271,2.389,2.377
1,-0.276385,-0.23416,-0.649719,-1.466477,-0.322326,-0.345167,-0.59773,0,0,1,0,0,0,0,3.39,3.318,2.749
2,-0.254689,-0.124061,-0.320208,-0.155424,0.060942,0.036422,-0.041027,0,0,1,0,0,0,0,4.006,3.668,3.544
3,-0.323796,-0.239047,-0.756588,-0.782989,-0.652968,-0.668599,-0.811847,0,0,1,0,0,0,0,3.281,2.569,3.257
4,-0.311742,-0.194809,-0.551756,-0.751034,-0.753026,-0.761651,-0.831452,0,0,1,0,0,0,0,4.867,4.833,5.186


In [4]:
num_bins = int(np.ceil(np.log2(len(df['pr_points1'])) + 1))
print(num_bins)

12


In [5]:
df['pr_points_bucket'], bins = pd.cut(df['pr_points1'], bins=num_bins, retbins=True, labels=False)
df.head()

Unnamed: 0,pull_requests,size,contributors,age,contributor_xp1,contributor_xp2,contributor_xp3,language_C++,language_Go,language_Java,language_JavaScript,language_Python,language_Swift,language_TypeScript,pr_points1,pr_points2,pr_points3,pr_points_bucket
0,-0.299689,-0.126432,-0.605191,-1.390139,-0.721174,-0.710998,-0.684408,0,0,1,0,0,0,0,2.271,2.389,2.377,0
1,-0.276385,-0.23416,-0.649719,-1.466477,-0.322326,-0.345167,-0.59773,0,0,1,0,0,0,0,3.39,3.318,2.749,0
2,-0.254689,-0.124061,-0.320208,-0.155424,0.060942,0.036422,-0.041027,0,0,1,0,0,0,0,4.006,3.668,3.544,0
3,-0.323796,-0.239047,-0.756588,-0.782989,-0.652968,-0.668599,-0.811847,0,0,1,0,0,0,0,3.281,2.569,3.257,0
4,-0.311742,-0.194809,-0.551756,-0.751034,-0.753026,-0.761651,-0.831452,0,0,1,0,0,0,0,4.867,4.833,5.186,0


In [6]:
features = df.drop(columns=['pr_points1', 'pr_points2', 'pr_points3', 'contributor_xp1', 'contributor_xp3', 'pr_points_bucket'], axis=1)
target = df['pr_points_bucket']

In [7]:
def build_and_compile_model():
    model = keras.Sequential([
        keras.Input(shape=(features.shape[1],)),
        layers.Dropout(0.3),
        layers.Dense(128, activation='elu'),
        layers.Dense(num_bins, activation='softmax')
    ])
    
    model.compile(
        optimizer=keras.optimizers.RMSprop(0.001),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy', 'sparse_categorical_accuracy']
    )
    
    model.summary()
    
    return model

In [8]:
model = build_and_compile_model()

In [9]:
def extract_f1(report):
    lines = report.split('\n')
    for line in reversed(lines):
        if 'weighted avg' in line:
            parts = line.split()
            if len(parts) >= 4:
                return float(parts[4])
    return 0.0

In [10]:
reports = []
for i in range(1000):
    print(f"Random State: {i}")
    
    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=i)
    
    # Create datasets
    dataset_train = tf.data.Dataset.from_tensor_slices((x_train.values, y_train.values)).shuffle(buffer_size=1024).batch(64)
    dataset_val = tf.data.Dataset.from_tensor_slices((x_test.values, y_test.values)).batch(64)
    
    # Training
    training_history = model.fit(dataset_train, epochs=10, validation_data=dataset_val, verbose=0)
    
    # Evaluation
    y_pred = model.predict(x_test)
    y_pred_argmax = np.argmax(y_pred, axis=1)
    
    # Metrics calculation
    report = classification_report(y_test, y_pred_argmax, zero_division=0)
    train_loss = training_history.history['loss'][-1]
    test_loss = training_history.history['val_loss'][-1]
    
    # Append to reports
    reports.append((i, report, train_loss, test_loss))

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 306us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 310us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 292us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 293us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 314us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301us/step
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 296us/step
[1m13/13[0m [32m━━━━━━━

In [None]:
# Sort reports by F1-score
reports.sort(key=lambda x: extract_f1(x[1]), reverse=True

# Print the top 10 reports
for seed, report, train_loss, test_loss in reports[:50]:
    print(f"Random State: {seed} Train Loss: {train_loss} Test Loss: {test_loss}")
    print(report)