Import the required libraries.

In [1]:
import itertools
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def split_partitions(DATA,TARGETS,IDS, folds):
    """Split parallel sequences into contiguous k-fold partitions.

    The samples are cut, in their existing order, into ``folds`` contiguous
    partitions whose sizes differ by at most one: the first
    ``len(DATA) % folds`` partitions receive one extra sample.  No shuffling
    is done here.

    Parameters
    ----------
    DATA, TARGETS, IDS : sliceable sequences of equal length
        Features, target values and identifiers.  They are sliced with
        ``seq[start:stop]`` and joined with ``np.concatenate``.
    folds : int
        Number of partitions to create.

    Returns
    -------
    one_fold : list
        ``folds`` entries of ``[data, targets, ids]`` holding the held-out
        partition of each split (plain slices of the inputs).
    nine_folds : list
        ``folds`` entries of ``[data, targets, ids]`` holding everything
        except the held-out partition (results of ``np.concatenate``).

    Notes
    -----
    If ``folds`` exceeds ``len(DATA)`` the trailing held-out partitions are
    necessarily empty.
    """
    # Balanced fold sizes: every fold gets `base` samples and the first
    # `extra` folds get one more, so no fold is left short or empty while
    # there are at least `folds` samples.
    base, extra = divmod(len(DATA), folds)

    one_fold = []
    nine_folds = []
    start = 0
    for i in range(folds):
        stop = start + base + (1 if i < extra else 0)

        # Held-out partition i.
        one_fold.append([DATA[start:stop], TARGETS[start:stop], IDS[start:stop]])

        # Training data: everything before and after the held-out partition.
        nine_folds.append([
            np.concatenate([DATA[:start], DATA[stop:]], axis=0),
            np.concatenate([TARGETS[:start], TARGETS[stop:]], axis=0),
            np.concatenate([IDS[:start], IDS[stop:]], axis=0),
        ])
        start = stop
    return one_fold, nine_folds

In [None]:
# Read the sample dataset from the data directory and randomise the row order.

directory = "Data/"
dataset_path = directory + "sample_dataset.csv"
df = shuffle(pd.read_csv(dataset_path, sep=','))
print(df)

In [None]:
# Feature table: every column except the 'Aff' target, with the first column
# of df removed as well (presumably the identifier column -- confirm).
is_feature = df.columns != 'Aff'
pre_all_data = df.loc[:, is_feature].drop(columns=df.columns[0])

print(pre_all_data)

In [None]:
# Preprocessing: fill missing values with a default SimpleImputer, then
# rescale the columns with a MinMaxScaler.
# NOTE(review): both transformers are fitted on the complete dataset, i.e.
# before it is partitioned into folds -- confirm this is intended.
imputer = SimpleImputer()
scaler = preprocessing.MinMaxScaler()
imputed_data = imputer.fit_transform(pre_all_data)
all_data = scaler.fit_transform(imputed_data)

print("all_data :", all_data.shape, "all_data_type: ", type(all_data))
print(all_data)

In [None]:
# Number of outer cross-validation folds.
outer_k = 10

# Partition the preprocessed features together with the 'Aff' targets and the
# 'ID' column; the first result holds the held-out folds, the second the
# matching training partitions.
test_fold, train_fold = split_partitions(
    DATA=all_data,
    TARGETS=df['Aff'],
    IDS=df['ID'],
    folds=outer_k,
)

In [None]:
# Outer cross-validation: fit a fresh random forest on each training
# partition and collect its predictions for the matching held-out fold.
outerCV__targets = []
outerCV__predictions = []
outerCV__IDs = []

# Filled in later with the combined IDs, targets and predictions.
cv_frame = pd.DataFrame()

for outer_test, outer_train in zip(test_fold, train_fold):

    # Each fold entry is a [data, targets, ids] triple.
    outerCV__test_data, outerCV__test_targets, outerCV__test_ids = outer_test
    outerCV__train_data, outerCV__train_targets, outerCV__train_ids = outer_train

    # max_features=1.0 (consider every feature) is what 'auto' meant for
    # regressors, and "absolute_error" is the current name of the former "mae"
    # criterion (requires scikit-learn >= 1.0).  The old spellings were
    # deprecated and later removed from scikit-learn.
    cv_rf = RandomForestRegressor(n_estimators=1600, max_depth=90,
                                  max_features=1.0, min_samples_leaf=1,
                                  min_samples_split=5, bootstrap=True,
                                  criterion="absolute_error", n_jobs=10)
    # Fit the forest on this fold's training partition.
    cv_rf.fit(outerCV__train_data, outerCV__train_targets)

    outerCV__test_predictions = cv_rf.predict(outerCV__test_data).tolist()
    outerCV__predictions.append(outerCV__test_predictions)
    outerCV__targets.append(outerCV__test_targets)
    outerCV__IDs.append(outerCV__test_ids)
    # Drop the fitted forest before the next fold is trained.
    del cv_rf, outerCV__test_predictions

In [None]:
# Flatten the per-fold results into single flat lists, keeping fold order.
outerCV__targets_combined = [value for fold in outerCV__targets for value in fold]
outerCV__predictions_combined = [value for fold in outerCV__predictions for value in fold]
outerCV__IDs_combined = [value for fold in outerCV__IDs for value in fold]

# Assemble the results table column by column and write it next to the input data.
cv_frame['IDs'] = outerCV__IDs_combined
cv_frame['ExperimentalAff'] = outerCV__targets_combined
cv_frame['PredictedAff'] = outerCV__predictions_combined

predictions_path = directory + 'RF_CV_BestModel_Predictions.csv'
cv_frame.to_csv(predictions_path, index=False)

In [None]:
# Root-mean-squared error over all held-out predictions.  The square root is
# taken explicitly because the `squared=False` argument of mean_squared_error
# was deprecated and later removed from scikit-learn; for these flat,
# single-output lists the value is the same.
rms = np.sqrt(mean_squared_error(outerCV__targets_combined, outerCV__predictions_combined))
print("rms error is: " + str(rms))


# Coefficient of determination over the same held-out predictions.
r2 = r2_score(outerCV__targets_combined, outerCV__predictions_combined)
print("r2 value is: " + str(r2))