In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train_1 = pd.read_csv('../Data/ACT1_train_450.csv', dtype={"MOLECULE": object, "Act": float})

In [3]:
from sklearn.model_selection import train_test_split
import numpy as np
y = train_1['Act'].values
train_1 = train_1.drop(['Act', 'Unnamed: 0'], axis = 1)
x = train_1.values
X_train, X_test, Y_train, Y_test = train_test_split(x, y, train_size = 0.80, random_state = 0)





In [4]:
print(np.shape(Y_train), np.shape(X_train), np.shape(Y_test), np.shape(X_test))
Y_train = np.reshape(Y_train,(len(Y_train),1))
Y_test = np.reshape(Y_test,(len(Y_test),1))


(23833,) (23833, 441) (5959,) (5959, 441)


In [5]:
def r_square(y, y_pred):
    """ r^2 value defined by the competition host, r^2 = 1 indicates 100% prediction accuracy
    """
    avx = np.mean(y)
    avy = np.mean(y_pred)
    sum1, sumx, sumy = 0, 0, 0
    for i in range(len(y)):
        sum1 += (y[i] - avx)*(y_pred[i] - avy)
        sumx += (y[i] - avx)*(y[i] - avx)
        sumy += (y_pred[i] - avy)*(y_pred[i] - avy)
#     print(len(y), sum1, sumx, sumy)
    return sum1*sum1/(sumx*sumy)

In [35]:
MAPE = []
def mean_ape(y_true, y_pred):
    return np.mean(np.abs((y_true-y_pred)/y_true))*100

In [36]:
def mean_absolute_error(y_true,y_pred):
    return np.mean(np.abs((y_true-y_pred)))

In [6]:
from sklearn.ensemble import RandomForestRegressor
#clean up output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import cross_val_score
model = RandomForestRegressor(n_estimators=500, bootstrap = True, max_features = 'sqrt')

In [7]:
#custom R_2 score function
from sklearn.metrics import make_scorer
R_2 = make_scorer(r_square)
#clean up y
Y_train = np.ravel(Y_train)
print(Y_train)

[4.3003 5.6414 4.3003 ... 4.7636 4.3003 4.3003]


## Baseline Model

In [8]:
#cross validation
scores = cross_val_score(model,X_train, Y_train, cv = 7, scoring = R_2)
print(scores)
print('The average accuracy is (R2):', scores.mean())

[0.67269189 0.66346275 0.62196777 0.66579094 0.66779277 0.64608175
 0.65903882]
The average accuracy is: 0.656689525679745


In [38]:
#Mean absolute percentage error with 450 features selected with highest positive and negative correlation
mean_ape = make_scorer(mean_ape)
scores_mean_ape = cross_val_score(model, X_train, Y_train, cv = 7, scoring = mean_ape)
print(scores_mean_ape)
print('The average accuracy is:', scores_mean_ape.mean())

[5.47174133 5.53031001 5.45231223 5.47641435 5.53473753 5.5974416
 5.46300367]
The average accuracy is: 5.503708673136942


In [40]:
# MAE
mae = make_scorer(mean_absolute_error)
scores = cross_val_score(model,X_train, Y_train, cv = 7, scoring = mae)
print(scores)
print('The average accuracy is:', scores.mean())

[0.26936905 0.27272948 0.26394915 0.26983211 0.27428022 0.27690475
 0.26862467]
The average accuracy is: 0.270812775694426


In [9]:
print(model.get_params())

{'bootstrap': True, 'criterion': 'mse', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


## Tuning

### Specify the domain for hyperparameters

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
# parameters for GridSearchCV
# specify parameters and distributions to sample from
param_dist = {
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(2, 11),
              "min_samples_leaf": sp_randint(1, 11)
             }

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
# run randomized search
n_iter_search = 150
random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=n_iter_search)

In [25]:
print(X_train.shape)
print(Y_train.shape)

(23833, 441)
(23833,)


In [26]:
random_search.fit(X_train, Y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=150, n_jobs=None,
          param_distributions={'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f4ae311ce80>, 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f4ae2e44e80>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f4ae2d21128>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [32]:
from operator import itemgetter
# Utility function to report best scores
def report(results, n_top=10):
    f= open("Tuning_RF.txt","w+")
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            f.write("Model with rank: {0}".format(i))
            f.write("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            f.write("Parameters: {0}".format(results['params'][candidate]))
            f.write("\n")
    f.close()

In [33]:
from operator import itemgetter
# Utility function to report best scores
def printReport(results, n_top=10):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("\n")

In [34]:
printReport(random_search.cv_results_)
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.617 (std: 0.004)
Parameters: {'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}


Model with rank: 2
Mean validation score: 0.612 (std: 0.004)
Parameters: {'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 3}


Model with rank: 3
Mean validation score: 0.612 (std: 0.004)
Parameters: {'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 3}


Model with rank: 4
Mean validation score: 0.609 (std: 0.004)
Parameters: {'max_features': 9, 'min_samples_leaf': 1, 'min_samples_split': 5}


Model with rank: 5
Mean validation score: 0.603 (std: 0.003)
Parameters: {'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 6}


Model with rank: 6
Mean validation score: 0.603 (std: 0.004)
Parameters: {'max_features': 10, 'min_samples_leaf': 1, 'min_samples_split': 8}


Model with rank: 7
Mean validation score: 0.603 (std: 0.005)
Parameters: {'max_features': 8, 'min_samples_leaf': 1, 'min_samples_split': 6}


Mode

In [31]:
#cross validation
tu_model = RandomForestRegressor(n_estimators=500, max_features = 10, min_samples_leaf = 1, min_samples_split = 2)
tu_scores = cross_val_score(tu_model, X_train, Y_train, cv = 7, scoring = R_2)
print(tu_scores)
print('The average accuracy is:', tu_scores.mean())

[0.66438658 0.65268734 0.61591508 0.65834815 0.65922775 0.6395571
 0.65154725]
The average accuracy is: 0.6488098913034642


In [41]:
tu_scores = cross_val_score(tu_model, X_train, Y_train, cv = 7, scoring = mean_ape)
print(tu_scores)
print('The average accuracy is:', tu_scores.mean())

[5.57058769 5.65798305 5.54688902 5.59008767 5.62644915 5.68347886
 5.58485112]
The average accuracy is: 5.608618080593089


In [42]:
tu_scores = cross_val_score(tu_model, X_train, Y_train, cv = 7, scoring = mae)
print(tu_scores)
print('The average accuracy is:', tu_scores.mean())

[0.27512282 0.27905111 0.26966349 0.27522263 0.27955628 0.28288828
 0.27451251]
The average accuracy is: 0.27657387423978175
