In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv('/Users/yelderiny/Projects/Dissertation/Data/processed-data3-outliers-capped.csv')
df.head()

Unnamed: 0,pull_requests,size,contributors,age,contributor_xp1,contributor_xp2,contributor_xp3,language_C++,language_Go,language_Java,language_JavaScript,language_Python,language_Swift,language_TypeScript,pr_points1,pr_points2,pr_points3
0,-0.441351,-0.155899,-0.60745,-1.390139,-0.760476,-0.749159,-0.714005,0,0,1,0,0,0,0,2.271,2.389,2.377
1,-0.400284,-0.376812,-0.652329,-1.466477,-0.328626,-0.353298,-0.621405,0,0,1,0,0,0,0,3.39,3.318,2.749
2,-0.36205,-0.151038,-0.320222,-0.155424,0.086355,0.059615,-0.026675,0,0,1,0,0,0,0,4.006,3.668,3.544
3,-0.483834,-0.386834,-0.76004,-0.782989,-0.686626,-0.70328,-0.850148,0,0,1,0,0,0,0,3.281,2.569,3.257
4,-0.462592,-0.296117,-0.553595,-0.751034,-0.794964,-0.80397,-0.871093,0,0,1,0,0,0,0,4.867,4.833,5.186


In [3]:
num_bins = int(np.ceil(np.log2(len(df['pr_points1'])) + 1))
print(num_bins)

12


In [4]:
# Sturges' Rule 
df['pr_points_bucket'], bins = pd.cut(df['pr_points1'], bins=num_bins, retbins=True, labels=False)
print(bins)

[ 0.97474367  3.10469385  5.2093877   7.31408155  9.41877541 11.52346926
 13.62816311 15.73285696 17.83755081 19.94224466 22.04693851 24.15163236
 26.25632622]


In [5]:
features = df.drop(columns=['pr_points1', 'pr_points2', 'pr_points3', 'contributor_xp1', 'contributor_xp3', 'pr_points_bucket'], axis=1)
target = df['pr_points_bucket']

In [6]:
results = {}

In [7]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize=True)  # As a fraction
    num_acc = accuracy_score(y_test, y_pred, normalize=False)  # As a count
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    return {'accuracy': acc, 'precision': prec, 'recall': recall, 'f1': f1, 'accuracy_count': num_acc}

In [8]:
def build_model(classifier_fn, name_of_y_col, name_of_x_cols, dataset, test_frac=0.2):
    x_train, x_test, y_train, y_test = train_test_split(dataset[name_of_x_cols], dataset[name_of_y_col], test_size=test_frac, random_state=557)
        
    model = classifier_fn(x_train, y_train)
    
    y_pred_train, y_pred = model.predict(x_train), model.predict(x_test)
    
    train_summary, test_summary = summarize_classification(y_train, y_pred_train), summarize_classification(y_test, y_pred)
    
    model_crosstab = pd.crosstab(pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}).y_pred, y_test)
    
    return {'test': test_summary, 'confusion_matrix': model_crosstab}

In [9]:
def compare_results():
    for key in results:
        print('Classification: ', key)
        
        # print()
        # print('Training data')
        # for score in results[key]['training']:
        #     print(score, results[key]['training'][score])
            
        print()
        print('Test data')
        for score in results[key]['test']:
            print(score, results[key]['test'][score])
        
        print()

In [10]:
def logistic_fn(x_train, y_train):
    model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [11]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)
    
    return model

In [12]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=10000, tol=1e-3):
    model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    model.fit(x_train, y_train)
    
    return model

In [13]:
def radius_neighbors_fn(x_train, y_train, radius=40.0):
    model = RadiusNeighborsClassifier(radius=radius)
    model.fit(x_train, y_train)
    
    return model

In [14]:
def decision_tree_fn(x_train, y_train, max_depth=5, max_features=None):
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features, min_samples_leaf=4, min_samples_split=2, criterion='entropy')
    model.fit(x_train, y_train)
    
    return model

In [15]:
results['logistic'] = build_model(logistic_fn, 'pr_points_bucket', features.columns, df)
results['lda'] = build_model(linear_discriminant_fn, 'pr_points_bucket', features.columns, df)
results['svm'] = build_model(linear_svc_fn, 'pr_points_bucket', features.columns, df)
results['radius_neighbors'] = build_model(radius_neighbors_fn, 'pr_points_bucket', features.columns, df)
results['decision_tree'] = build_model(decision_tree_fn, 'pr_points_bucket', features.columns, df)

compare_results()

Classification:  logistic

Test data
accuracy 0.4119106699751861
precision 0.3385476012636035
recall 0.4119106699751861
f1 0.3433350664948463
accuracy_count 166.0

Classification:  lda

Test data
accuracy 0.4143920595533499
precision 0.32091281910525515
recall 0.4143920595533499
f1 0.3449744403705425
accuracy_count 167.0

Classification:  svm

Test data
accuracy 0.40198511166253104
precision 0.2839236997578307
recall 0.40198511166253104
f1 0.31321530332951847
accuracy_count 162.0

Classification:  radius_neighbors

Test data
accuracy 0.315136476426799
precision 0.09931099877469843
recall 0.315136476426799
f1 0.1510276698347301
accuracy_count 127.0

Classification:  decision_tree

Test data
accuracy 0.4665012406947891
precision 0.4136378758187792
recall 0.4665012406947891
f1 0.43322854520756743
accuracy_count 188.0



In [16]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=557)

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 4,
 'min_samples_split': 2}