In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge

In [2]:
df = pd.read_csv('/Users/yelderiny/Projects/Dissertation/Data/processed-data-swift.csv', index_col=[0])

In [3]:
features1 = df.drop(columns=['pr_points1', 'pr_points2', 'pr_points3', 'contributor_xp2', 'contributor_xp3'], axis=1)
features2 = df.drop(columns=['pr_points1', 'pr_points2', 'pr_points3', 'contributor_xp1', 'contributor_xp3'], axis=1)
features3 = df.drop(columns=['pr_points1', 'pr_points2', 'pr_points3', 'contributor_xp1', 'contributor_xp2'], axis=1)
target1 = df['pr_points1']
target2 = df['pr_points2']
target3 = df['pr_points3']

In [4]:
max_error_scoring = "max_error"
neg_mean_absolute_error_scoring = "neg_mean_absolute_error"
r2_scoring = "r2"
neg_mean_squared_error_scoring = "neg_mean_squared_error"

In [5]:
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor()),
    ('LASSO', Lasso()),
    ('Ridge', Ridge()),
    ('Elastic Net', ElasticNet()),
]

In [6]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features1, target1, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features1, target1, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features1, target1, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features1, target1, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -19.233571575426257 
 mean absolute error: 1.544744289142501 
 r2: -0.26465887284754364 
 mean squared error: 12.978266589934728 
 ------------------------------
Random Forest 
 max error: -15.777982 
 mean absolute error: 1.201259128161312 
 r2: 0.2895025012607019 
 mean squared error: 9.54489694516764 
 ------------------------------
LASSO 
 max error: -16.64762381122294 
 mean absolute error: 1.6590501445436736 
 r2: 0.14678423594060402 
 mean squared error: 10.99687145642513 
 ------------------------------
Ridge 
 max error: -19.19793036753559 
 mean absolute error: 1.5441378115889983 
 r2: -0.25860333285539955 
 mean squared error: 12.936794068425428 
 ------------------------------
Elastic Net 
 max error: -16.354732243094897 
 mean absolute error: 1.5780311313684474 
 r2: 0.20872907826757636 
 mean squared error: 10.268215094252676 
 ------------------------------


In [7]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features1, target2, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features1, target2, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features1, target2, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features1, target2, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -14.122710255287208 
 mean absolute error: 1.368090813657369 
 r2: -0.13993921809363713 
 mean squared error: 6.8871097575260745 
 ------------------------------
Random Forest 
 max error: -10.815537999999998 
 mean absolute error: 1.0771665625427205 
 r2: 0.39205503571918904 
 mean squared error: 3.9296295134267907 
 ------------------------------
LASSO 
 max error: -12.741571190936607 
 mean absolute error: 1.5286362215590423 
 r2: 0.06481102905448019 
 mean squared error: 6.120618850579074 
 ------------------------------
Ridge 
 max error: -14.101128993317536 
 mean absolute error: 1.3678371245181562 
 r2: -0.13527280515569032 
 mean squared error: 6.862377196638292 
 ------------------------------
Elastic Net 
 max error: -11.993632313783216 
 mean absolute error: 1.4322927707941515 
 r2: 0.15912062181822392 
 mean squared error: 5.50270213403307 
 ------------------------------


In [8]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features1, target3, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features1, target3, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features1, target3, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features1, target3, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -24.663835431299905 
 mean absolute error: 1.719897901757958 
 r2: -0.37179021247838867 
 mean squared error: 21.239261231350163 
 ------------------------------
Random Forest 
 max error: -25.335840999999995 
 mean absolute error: 1.4081118116883116 
 r2: 0.09972724299787912 
 mean squared error: 17.81745182756477 
 ------------------------------
LASSO 
 max error: -21.36451264422653 
 mean absolute error: 1.7695203541808275 
 r2: 0.16059714741852166 
 mean squared error: 18.40208851360247 
 ------------------------------
Ridge 
 max error: -24.61754829179568 
 mean absolute error: 1.718923801792115 
 r2: -0.3651698499795962 
 mean squared error: 21.185218083312353 
 ------------------------------
Elastic Net 
 max error: -21.25673632879634 
 mean absolute error: 1.7012733743507826 
 r2: 0.20527179185555103 
 mean squared error: 17.548245249864745 
 ------------------------------


In [9]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features2, target1, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features2, target1, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features2, target1, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features2, target1, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -19.24832467166093 
 mean absolute error: 1.5441862397441017 
 r2: -0.26555225744619076 
 mean squared error: 12.987920597877984 
 ------------------------------
Random Forest 
 max error: -16.179333 
 mean absolute error: 1.1874486748120299 
 r2: 0.2861391303315447 
 mean squared error: 8.806632894736257 
 ------------------------------
LASSO 
 max error: -16.64762381122294 
 mean absolute error: 1.6590501445436736 
 r2: 0.14678423594060402 
 mean squared error: 10.99687145642513 
 ------------------------------
Ridge 
 max error: -19.212578099742366 
 mean absolute error: 1.543579512586661 
 r2: -0.2594812283253622 
 mean squared error: 12.946318294628266 
 ------------------------------
Elastic Net 
 max error: -16.355477964496448 
 mean absolute error: 1.5776201931054918 
 r2: 0.20884669943407969 
 mean squared error: 10.266773048165078 
 ------------------------------


In [10]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features2, target2, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features2, target2, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features2, target2, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features2, target2, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -14.13919738927657 
 mean absolute error: 1.3677021194030767 
 r2: -0.14009490850282705 
 mean squared error: 6.889954357379817 
 ------------------------------
Random Forest 
 max error: -10.854612999999999 
 mean absolute error: 1.085605737696514 
 r2: 0.4216714464619316 
 mean squared error: 3.9654908055531743 
 ------------------------------
LASSO 
 max error: -12.741571190936607 
 mean absolute error: 1.5286362215590423 
 r2: 0.06481102905448019 
 mean squared error: 6.120618850579074 
 ------------------------------
Ridge 
 max error: -14.11756869060412 
 mean absolute error: 1.3674475691906385 
 r2: -0.13542859776395733 
 mean squared error: 6.865208520714196 
 ------------------------------
Elastic Net 
 max error: -11.996102901169028 
 mean absolute error: 1.4317693766715922 
 r2: 0.15927244693310816 
 mean squared error: 5.501924185618413 
 ------------------------------


In [11]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features2, target3, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features2, target3, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features2, target3, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features2, target3, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -24.684442729591403 
 mean absolute error: 1.7192586261907803 
 r2: -0.3734805916621974 
 mean squared error: 21.260680284408544 
 ------------------------------
Random Forest 
 max error: -24.443570999999995 
 mean absolute error: 1.4124895411825018 
 r2: 0.0905973209503308 
 mean squared error: 18.61399847280519 
 ------------------------------
LASSO 
 max error: -21.36451264422653 
 mean absolute error: 1.7695203541808275 
 r2: 0.16059714741852166 
 mean squared error: 18.40208851360247 
 ------------------------------
Ridge 
 max error: -24.637889948361327 
 mean absolute error: 1.7182757735383585 
 r2: -0.36682699762515114 
 mean squared error: 21.206301634838997 
 ------------------------------
Elastic Net 
 max error: -21.257492868934328 
 mean absolute error: 1.7008688590006287 
 r2: 0.20537844479439196 
 mean squared error: 17.547805036684572 
 ------------------------------


In [12]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features3, target1, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features3, target1, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features3, target1, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features3, target1, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -19.45140128108755 
 mean absolute error: 1.5439968625219833 
 r2: -0.2807581593184942 
 mean squared error: 13.136660981344615 
 ------------------------------
Random Forest 
 max error: -17.965876 
 mean absolute error: 1.1985889723171563 
 r2: 0.13486277453760706 
 mean squared error: 9.771950986746067 
 ------------------------------
LASSO 
 max error: -16.64762381122294 
 mean absolute error: 1.6590501445436736 
 r2: 0.14678423594060402 
 mean squared error: 10.99687145642513 
 ------------------------------
Ridge 
 max error: -19.413080676112287 
 mean absolute error: 1.5433181410607208 
 r2: -0.27453207199364715 
 mean squared error: 13.093830170482661 
 ------------------------------
Elastic Net 
 max error: -16.35998003829875 
 mean absolute error: 1.5798287215980562 
 r2: 0.20560050962848414 
 mean squared error: 10.29251535687192 
 ------------------------------


In [13]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features3, target2, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features3, target2, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features3, target2, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features3, target2, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -14.354234258389813 
 mean absolute error: 1.3694199736668153 
 r2: -0.1559646571322277 
 mean squared error: 6.989121634827015 
 ------------------------------
Random Forest 
 max error: -11.434987999999997 
 mean absolute error: 1.0922518807245385 
 r2: 0.35217835091360444 
 mean squared error: 4.088491142227208 
 ------------------------------
LASSO 
 max error: -12.741571190936607 
 mean absolute error: 1.5286362215590423 
 r2: 0.06481102905448019 
 mean squared error: 6.120618850579074 
 ------------------------------
Ridge 
 max error: -14.331851347786857 
 mean absolute error: 1.369146936741455 
 r2: -0.15116063602764424 
 mean squared error: 6.963622422129726 
 ------------------------------
Elastic Net 
 max error: -12.061360662217904 
 mean absolute error: 1.432715990448495 
 r2: 0.15645339919918838 
 mean squared error: 5.520804236922848 
 ------------------------------


In [14]:
for name, model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    cv_results1 = cross_val_score(model, features3, target3, cv=kfold, scoring=max_error_scoring)
    cv_results2 = cross_val_score(model, features3, target3, cv=kfold, scoring=neg_mean_absolute_error_scoring)
    cv_results3 = cross_val_score(model, features3, target3, cv=kfold, scoring=r2_scoring)
    cv_results4 = cross_val_score(model, features3, target3, cv=kfold, scoring=neg_mean_squared_error_scoring)
    msg = f"{name} \n max error: {cv_results1.mean()} \n mean absolute error: {-cv_results2.mean()} \n r2: {cv_results3.mean()} \n mean squared error: {-cv_results4.mean()}"
    print(msg, '\n', '-' * 30)

Linear Regression 
 max error: -24.91211130859351 
 mean absolute error: 1.7160896668934384 
 r2: -0.3882739476378915 
 mean squared error: 21.47218355665885 
 ------------------------------
Random Forest 
 max error: -25.452605000000005 
 mean absolute error: 1.4103544429254955 
 r2: -0.2980253890623238 
 mean squared error: 20.841186260219256 
 ------------------------------
LASSO 
 max error: -21.36451264422653 
 mean absolute error: 1.7695203541808275 
 r2: 0.16059714741852166 
 mean squared error: 18.40208851360247 
 ------------------------------
Ridge 
 max error: -24.86538093641748 
 mean absolute error: 1.7150452454378908 
 r2: -0.3814267011433234 
 mean squared error: 21.415878532101363 
 ------------------------------
Elastic Net 
 max error: -21.256879087533274 
 mean absolute error: 1.7045670626861493 
 r2: 0.20317407574629492 
 mean squared error: 17.577509548339755 
 ------------------------------
