In [None]:
import pandas as pd
import numpy as np
import os
import csv
import itertools
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm
from hyperopt import tpe,hp,Trials
from hyperopt.fmin import fmin
import math

In [None]:
# Gather the file names found in ./data_use/ (one CSV per repository).
repo_pool = [os.path.join(entry) for entry in os.listdir('./data_use/')]

# Peek at the first file to see which columns remain once 'dates' is dropped.
base_df = pd.read_csv('./data_use/' + repo_pool[0]).drop(columns=['dates'])
print(base_df.columns)

In [None]:
# File names found in ./data_use/ (one CSV per repository).
repo_pool = [os.path.join(entry) for entry in os.listdir('./data_use/')]

# Column layout: every column of the first file except 'dates', followed by
# the look-ahead target column.
base_df = pd.read_csv('./data_use/' + repo_pool[0]).drop(columns=['dates'])
target_col = 'monthly_closed_issues_12mo'
real_df_cols = list(base_df.columns) + [target_col]

lines = []          # rows of the project currently being processed
projects = []       # one [train_rows, test_rows] pair per project
train_lines = []    # pooled training rows across all projects
test_lines = []     # pooled test rows across all projects
p_names = os.listdir('./data_use/')
#print(repo_pool)
for index, repo in enumerate(repo_pool):
    print(repo)
    p_train_lines = []
    p_test_lines = []
    df = pd.read_csv('./data_use/' + repo).drop(columns=['dates'])

    matrix = df.to_numpy()
    n_rows, n_cols = matrix.shape
    for i in range(n_rows - 12):
        # Features are row i; the label is column 8 taken twelve rows later.
        # NOTE(review): index 8 is presumably the source column of
        # `target_col` -- confirm against the CSV schema.
        row = [matrix[i][j] for j in range(n_cols)]
        row.append(matrix[i+12][8])
        lines.append(row)

    # The first 70% of this project's rows (in file order) go to training,
    # the remainder to test.
    cutoff = len(lines)*0.7
    for i, line in enumerate(lines):
        in_train = i < cutoff
        (train_lines if in_train else test_lines).append(line)
        (p_train_lines if in_train else p_test_lines).append(line)
    print('./datasets/health_'+p_names[index][:-4])
    # Optional per-project CSV export (disabled):
    #     p_df = pd.DataFrame(p_train_lines, columns = real_df_cols)
    #     p_df.to_csv('./datasets/'+target_col+'/health_'+p_names[index][:-4]+'_train.csv', index=False)
    #     p_df_t = pd.DataFrame(p_test_lines, columns = real_df_cols)
    #     p_df_t.to_csv('./datasets/'+target_col+'/health_'+p_names[index][:-4]+'_test.csv', index=False)
    project = [p_train_lines, p_test_lines]
    projects.append(project)
    # Start the next project with an empty row buffer.
    lines = []

In [None]:
# Pooled training rows from every project, labelled with the feature columns
# plus the trailing target column.
train_df = pd.DataFrame(train_lines, columns = real_df_cols)

In [None]:
# Pooled test rows from every project, same column layout as train_df.
test_df = pd.DataFrame(test_lines, columns = real_df_cols)

In [None]:
# Quick visual check of the first pooled test rows.
test_df.head()

In [None]:
# Seed for the hyperopt search RNG (passed to fmin via `rstate` in optimize()).
seed=2

def objective(params):
    """Hyperopt objective: fit a random forest and return its median MRE.

    A ``RandomForestRegressor`` is trained on the module-level ``X_train`` /
    ``y_train`` with the sampled hyper-parameters and scored on the
    module-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    params : dict
        Sampled point with the keys ``n_estimators``, ``criterion``,
        ``max_depth``, ``min_samples_leaf`` and ``min_impurity_decrease``.

    Returns
    -------
    float
        Median magnitude of relative error over the test rows (the value
        hyperopt minimises).

    Side effects
    ------------
    Appends this evaluation's MdMRE, PRED40 fraction and standardized
    accuracy to the module-level lists ``mre_list``, ``pred40_list`` and
    ``acc_list``; they must exist before the first call.
    """
    global mre_list, pred40_list, acc_list

    model = RandomForestRegressor(
        n_estimators=int(params['n_estimators']),
        criterion=params['criterion'],
        max_depth=int(params['max_depth']),
        min_samples_leaf=int(params['min_samples_leaf']),
        # Keep the sampled float. Truncating it to int here meant the model
        # that was scored differed from the one later built from the best
        # params, which uses the un-truncated value.
        min_impurity_decrease=params['min_impurity_decrease'],
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Per-row magnitude of relative error, plus the count of rows within 40%.
    mmre_l = []
    pred40 = 0
    for y_t, y_p in zip(y_test, y_pred):
        num = np.abs(y_p - y_t)
        den = np.abs(y_t)
        if den != 0:
            mre = num / den
        elif num != 0:
            # Actual value is zero but the prediction is not: add-one smoothing
            # avoids dividing by zero.
            mre = (num + 1) / (den + 1)
        else:
            mre = 0
        mmre_l.append(mre)
        if mre <= 0.4:
            pred40 += 1
    MRE = np.median(np.array(mmre_l))

    # Standardized accuracy: model MAE relative to the MAE of a baseline that
    # predicts the running median of the preceding predictions (the empty
    # first slice gives NaN, mapped to 0 by nan_to_num).
    sa_num = mean_absolute_error(y_test, y_pred)
    y_predg = np.nan_to_num([(np.median(y_pred[:i])) for i in range(len(y_pred))])
    se_den = mean_absolute_error(y_test, y_predg)
    if se_den == 0:
        # The "both errors are zero" case used to be overwritten by the
        # smoothed formula because the `else` was missing.
        if sa_num == 0:
            acc = 1
        else:
            acc = 1 - ((sa_num + 1) / (se_den + 1))
    else:
        acc = 1 - (sa_num / se_den)

    # Per-evaluation history. PRED40 is stored as a fraction of the scored
    # rows (it was previously divided by a literal 100).
    mre_list.append(MRE)
    pred40_list.append(pred40 / len(mmre_l) if mmre_l else 0)
    acc_list.append(acc)

    # NOTE: only MdMRE is optimised. A combined MdMRE/PRED40/SA score used to
    # be computed here but was never returned, so it has been dropped.
    return MRE

def optimize(trial, max_evals=3500):
    """Run a TPE search over random-forest hyper-parameters.

    Parameters
    ----------
    trial : hyperopt.Trials
        Trials object that collects every evaluation of the search.
    max_evals : int, optional
        Number of objective evaluations to run (default 3500, the value that
        was previously hard-coded).

    Returns
    -------
    dict
        Whatever ``fmin`` returns for the best point found. generateClf()
        in this file treats the returned ``criterion`` entry as an index
        (0 or 1) into the choice list below.
    """
    # All numeric dimensions are sampled as continuous values; objective()
    # casts the integer-valued ones with int().
    space = {
        'n_estimators': hp.uniform('n_estimators', 10, 200),
        'max_depth': hp.uniform('max_depth', 5, 20),
        'criterion': hp.choice('criterion', ['squared_error', 'absolute_error']),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 1, 5),
        'min_impurity_decrease': hp.uniform('min_impurity_decrease', 0, 10),
    }
    best = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        trials=trial,
        max_evals=max_evals,
        # Seeded generator for the search itself.
        rstate=np.random.default_rng(seed),
    )
    return best



In [None]:
def generateClf(params):
    """Build an unfitted RandomForestRegressor from a parameter dict.

    Parameters
    ----------
    params : dict
        Must hold exactly five entries; the keys read are ``n_estimators``,
        ``criterion``, ``min_samples_leaf``, ``min_impurity_decrease`` and
        ``max_depth``. ``criterion`` is an index: 1 selects
        ``'absolute_error'``, anything else selects ``'squared_error'``.

    Returns
    -------
    RandomForestRegressor or None
        ``None`` (after printing "Missing params") when ``params`` does not
        have exactly five entries.
    """
    # Guard clause: bail out early on an incomplete parameter set.
    if len(params) != 5:
        print("Missing params")
        return None

    criterion_enum = params['criterion']
    criterion = 'absolute_error' if criterion_enum == 1 else 'squared_error'
    return RandomForestRegressor(
        n_estimators=int(params['n_estimators']),
        criterion=criterion,
        min_samples_leaf=int(params['min_samples_leaf']),
        min_impurity_decrease=params['min_impurity_decrease'],
        max_depth=int(params['max_depth']),
    )
    

In [None]:
import warnings
import time
warnings.filterwarnings("ignore")

evals = []          # one [MdMRE %, PRED40 %, SA %] row per project
model_data_l = []   # one row of model/optimizer metadata per project
for index, project in enumerate(projects):
    eval_row, data_row = [], []
    start_time = time.time()

    # objective() appends to these module-level histories; reset them for
    # every project.
    mre_list = [0]
    pred40_list = [1]
    acc_list = [1]

    # objective() reads X_train / y_train / X_test / y_test as globals.
    train_df = pd.DataFrame(project[0], columns = real_df_cols)
    test_df = pd.DataFrame(project[1], columns = real_df_cols)
    # Features exclude the first column and the trailing target column.
    X_train = train_df.iloc[:, 1:-1]
    X_test = test_df.iloc[:, 1:-1]
    y_train = train_df.iloc[:, -1:].values.flatten().tolist()
    y_test = np.array(test_df.iloc[:, -1:].values.flatten().tolist())

    # Search for hyper-parameters, then refit a model on the best point.
    trial = Trials()
    params = optimize(trial)
    clf = generateClf(params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Per-row magnitude of relative error, plus the count of rows within 40%.
    mmre_l = []
    pred40 = 0
    for y_t, y_p in zip(y_test, y_pred):
        num = np.abs(y_p - y_t)
        den = np.abs(y_t)
        if den != 0:
            mre = num / den
        elif num != 0:
            # Actual value is zero but the prediction is not: add-one smoothing
            # avoids dividing by zero.
            mre = (num + 1) / (den + 1)
        else:
            mre = 0
        mmre_l.append(mre)
        if mre <= 0.4:
            pred40 += 1
    MRE = np.median(np.array(mmre_l))

    # Model / optimizer metadata.
    # `criterion` comes back as an index; use the same mapping and fallback
    # as generateClf() so the recorded name matches the model actually built
    # (the fallback used to be recorded as 'poisson').
    criterion = 'absolute_error' if params['criterion'] == 1 else 'squared_error'
    n_estimators = int(params['n_estimators'])
    min_samples_leaf = int(params['min_samples_leaf'])
    min_impurity_decrease = params['min_impurity_decrease']
    max_depth = int(params['max_depth'])
    data_row.append(p_names[index])
    data_row.append('hyperopt')
    # Record the number of evaluations actually performed instead of a
    # hard-coded 5000 that did not match the search budget.
    data_row.append(len(trial.trials))
    data_row.append(n_estimators)
    data_row.append(criterion)
    data_row.append(min_samples_leaf)
    data_row.append(min_impurity_decrease)
    data_row.append(max_depth)

    # Standardized accuracy: model MAE relative to the MAE of a baseline that
    # predicts the running median of the preceding predictions (the empty
    # first slice gives NaN, mapped to 0 by nan_to_num).
    sa_num = mean_absolute_error(y_test, y_pred)
    y_predg = np.nan_to_num([(np.median(y_pred[:i])) for i in range(len(y_pred))])
    se_den = mean_absolute_error(y_test, y_predg)
    if se_den == 0:
        # The "both errors are zero" case used to be overwritten by the
        # smoothed formula because the `else` was missing.
        if sa_num == 0:
            acc = 1
        else:
            acc = 1 - ((sa_num + 1) / (se_den + 1))
    else:
        acc = 1 - (sa_num / se_den)

    # PRED40 as a percentage of test rows. The conversion used to run only
    # after the raw count had already been stored, so counts from projects
    # of different sizes were being reported and normalised together.
    pred40_pct = pred40 / len(y_pred) * 100

    eval_row.append(round(MRE * 100, 2))
    eval_row.append(round(pred40_pct, 2))
    eval_row.append(round(acc*100, 2))

    data_row.append(time.time() - start_time)
    evals.append(eval_row)
    model_data_l.append(data_row)


In [None]:
# Per-project evaluation metrics, one row per project; missing values become 0.
evals_data = pd.DataFrame(evals, columns = ['MdMRE', 'PRED40', 'SA']).fillna(0)

In [None]:
# Per-project model/optimizer metadata, one row per project; missing values become 0.
model_data = pd.DataFrame(model_data_l, columns = ['ds', 'Optimizer', 'Models built','N_estimators', 'Criterion', 'Min_samples_leaf','Min_impurity_decrease', 'Max_depth', 'Time']).fillna(0)

In [None]:
# Side-by-side view of metadata and metrics.
# NOTE(review): eval_data_full is reassigned in a later cell from
# normalized_eval_data and evals_data, so this value is not what gets saved.
eval_data_full = pd.concat([model_data, evals_data], axis=1)

In [None]:
# Replace any remaining missing values with 0.
# NOTE(review): eval_data_full is reassigned in a later cell, so this result
# does not reach the saved output.
eval_data_full = eval_data_full.fillna(0)

In [None]:
# Min-max scale every metric column to [0, 1] across projects.
# A column whose values are all equal (e.g. a single project) has a zero
# range; divide by 1 there so it normalises to 0 instead of NaN (0/0).
_eval_min = evals_data.min()
_eval_range = (evals_data.max() - _eval_min).replace(0, 1)
normalized_eval_data = (evals_data - _eval_min) / _eval_range


In [None]:
# Prefix the normalised columns so they can sit next to the raw metrics.
normalized_eval_data.columns = ['N_MdMRE', 'N_PRED40', 'N_SA']

In [None]:
# Normalised metrics followed by the raw metrics, aligned on the row index.
eval_data_full = pd.concat([normalized_eval_data, evals_data], axis=1)

In [None]:
# Quick visual check of the combined metrics table.
eval_data_full.head()

In [None]:
# Final table: normalised metrics, raw metrics, then model/optimizer metadata.
all_data_full = pd.concat([eval_data_full, model_data], axis=1)

In [None]:
# Persist the final table under ./evals/<target_col>/. Create the directory
# first so a fresh checkout does not fail on a missing output folder.
_evals_dir = './evals/'+target_col
os.makedirs(_evals_dir, exist_ok=True)
all_data_full.to_csv(_evals_dir+'/hyperopt_all_data_full_multi.csv', index=False)