In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import classification_report

In [2]:
tf.keras.backend.clear_session()

In [3]:
df = pd.read_csv('/Users/yelderiny/Projects/Dissertation/Data/processed-data3-outliers-capped.csv')
df.head()

Unnamed: 0,pull_requests,size,contributors,age,contributor_xp1,contributor_xp2,contributor_xp3,language_C++,language_Go,language_Java,language_JavaScript,language_Python,language_Swift,language_TypeScript,pr_points1,pr_points2,pr_points3
0,-0.441351,-0.155899,-0.60745,-1.390139,-0.760476,-0.749159,-0.714005,0,0,1,0,0,0,0,2.271,2.389,2.377
1,-0.400284,-0.376812,-0.652329,-1.466477,-0.328626,-0.353298,-0.621405,0,0,1,0,0,0,0,3.39,3.318,2.749
2,-0.36205,-0.151038,-0.320222,-0.155424,0.086355,0.059615,-0.026675,0,0,1,0,0,0,0,4.006,3.668,3.544
3,-0.483834,-0.386834,-0.76004,-0.782989,-0.686626,-0.70328,-0.850148,0,0,1,0,0,0,0,3.281,2.569,3.257
4,-0.462592,-0.296117,-0.553595,-0.751034,-0.794964,-0.80397,-0.871093,0,0,1,0,0,0,0,4.867,4.833,5.186


In [4]:
num_bins = int(np.ceil(np.log2(len(df['pr_points1'])) + 1))
print(num_bins)

12


In [5]:
df['pr_points_bucket'], bins = pd.cut(df['pr_points1'], bins=num_bins, retbins=True, labels=False)
df.head()

Unnamed: 0,pull_requests,size,contributors,age,contributor_xp1,contributor_xp2,contributor_xp3,language_C++,language_Go,language_Java,language_JavaScript,language_Python,language_Swift,language_TypeScript,pr_points1,pr_points2,pr_points3,pr_points_bucket
0,-0.441351,-0.155899,-0.60745,-1.390139,-0.760476,-0.749159,-0.714005,0,0,1,0,0,0,0,2.271,2.389,2.377,0
1,-0.400284,-0.376812,-0.652329,-1.466477,-0.328626,-0.353298,-0.621405,0,0,1,0,0,0,0,3.39,3.318,2.749,1
2,-0.36205,-0.151038,-0.320222,-0.155424,0.086355,0.059615,-0.026675,0,0,1,0,0,0,0,4.006,3.668,3.544,1
3,-0.483834,-0.386834,-0.76004,-0.782989,-0.686626,-0.70328,-0.850148,0,0,1,0,0,0,0,3.281,2.569,3.257,1
4,-0.462592,-0.296117,-0.553595,-0.751034,-0.794964,-0.80397,-0.871093,0,0,1,0,0,0,0,4.867,4.833,5.186,1


In [6]:
features = df.drop(columns=['pr_points1', 'pr_points2', 'pr_points3', 'contributor_xp1', 'contributor_xp3', 'pr_points_bucket'], axis=1)
target = df['pr_points_bucket']

In [7]:
def build_and_compile_model():
    model = keras.Sequential([
        keras.Input(shape=(features.shape[1],)),
        layers.Dropout(0.3),
        layers.Dense(128, activation='elu'),
        layers.Dense(num_bins, activation='softmax')
    ])
    
    model.compile(
        optimizer=keras.optimizers.RMSprop(0.001),
        loss=keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy', 'sparse_categorical_accuracy']
    )
    
    model.summary()
    
    return model

In [8]:
model = build_and_compile_model()

In [9]:
def extract_f1(report):
    lines = report.split('\n')
    for line in reversed(lines):
        if 'weighted avg' in line:
            parts = line.split()
            if len(parts) >= 4:
                return float(parts[4])
    return 0.0

In [10]:
reports = []
for i in range(1000):
    print(f"Random State: {i}")
    
    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=i)
    
    # Create datasets
    dataset_train = tf.data.Dataset.from_tensor_slices((x_train.values, y_train.values)).shuffle(buffer_size=1024).batch(64)
    dataset_val = tf.data.Dataset.from_tensor_slices((x_test.values, y_test.values)).batch(64)
    
    # Training
    training_history = model.fit(dataset_train, epochs=10, validation_data=dataset_val, verbose=0)
    
    # Evaluation
    y_pred = model.predict(x_test)
    y_pred_argmax = np.argmax(y_pred, axis=1)
    
    # Metrics calculation
    report = classification_report(y_test, y_pred_argmax, zero_division=0)
    train_loss = training_history.history['loss'][-1]
    test_loss = training_history.history['val_loss'][-1]
    
    # Append to reports
    reports.append((i, report, train_loss, test_loss))

Random State: 0
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Random State: 1
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 396us/step
Random State: 2
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 363us/step
Random State: 3
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 348us/step
Random State: 4
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386us/step
Random State: 5
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 392us/step
Random State: 6
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 354us/step
Random State: 7
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 318us/step
Random State: 8
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452us/step
Random State: 9
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 426us/step
Random State: 10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 410us/step

In [12]:
# Sort reports by F1-score
reports.sort(key=lambda x: extract_f1(x[1]), reverse=True)

# Print the top 10 reports
for seed, report, train_loss, test_loss in reports[:50]:
    print(f"Random State: {seed} Train Loss: {train_loss} Test Loss: {test_loss}")
    print(report)

Random State: 113 Train Loss: 1.5279420614242554 Test Loss: 1.3928841352462769
              precision    recall  f1-score   support

           0       0.79      0.72      0.76       116
           1       0.46      0.73      0.56       109
           2       0.41      0.30      0.34        61
           3       0.36      0.22      0.27        41
           4       0.29      0.41      0.34        22
           5       0.50      0.18      0.27        11
           6       0.75      0.30      0.43        10
           7       0.12      0.11      0.12         9
           8       0.00      0.00      0.00         5
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         3
          11       1.00      0.29      0.44        14

    accuracy                           0.52       403
   macro avg       0.39      0.27      0.29       403
weighted avg       0.54      0.52      0.50       403

Random State: 130 Train Loss: 1.5099879503250122 Test 