In [11]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import time, random
import multiprocessing
from multiprocessing import Pool
from contextlib import closing
from functools import partial
from sklearn.metrics import precision_score,recall_score,accuracy_score,roc_curve,auc,mean_squared_error
from scipy.stats import mode
import argparse
from sklearn.externals import joblib
import matplotlib.pyplot as plt

In [15]:
def rf_regressor(feat):
    """Fit, evaluate and persist a random-forest regressor on a balanced resample.

    Parameters
    ----------
    feat : sequence of length 3
        ``(max_features, n_estimators, cf)``. The first two are forwarded to
        ``RandomForestRegressor``. ``cf`` is a cutoff: validation predictions
        below it are replaced by 0 before the MSE is computed and plotted.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the training target has no positive rows or no zero rows, since the
        balanced resample cannot be built in that case.

    Side effects
    ------------
    * Reads ``X_train/y_train/X_valid/y_valid_after_classification.csv`` from
      the working directory.
    * Prints the elapsed seconds, then the validation ``score`` (computed on
      the un-thresholded model) together with the thresholded MSE.
    * Writes ``Random_forest_regressor_post_class_best.sav`` and
      ``Regression_rf_post_class.png``.

    Notes
    -----
    Targets are transformed with ``log(1 + y)``. Resampling uses the ``random``
    module and is not seeded here, so results vary between runs.
    """
    t = time.time()
    max_features, n_estimators, cf = feat

    X_train = pd.read_csv('X_train_after_classification.csv', sep=',', header=0).values
    # The target file is read without a header; it is flattened so boolean
    # masks can index rows directly (assumes a single column -- TODO confirm).
    y_train = pd.read_csv('y_train_after_classification.csv', sep=',', header=None).values.ravel()
    y_train = np.log(1 + y_train)

    # Split rows into positive-target and zero-target groups with boolean masks.
    pos_mask = y_train > 0
    zero_mask = y_train == 0
    X_pos, y_pos = X_train[pos_mask], y_train[pos_mask]
    X_neg, y_neg = X_train[zero_mask], y_train[zero_mask]
    n_pos_train = X_pos.shape[0]
    n_neg_train = X_neg.shape[0]
    if n_pos_train == 0 or n_neg_train == 0:
        raise ValueError(
            "balanced resampling needs both positive and zero targets "
            "(got %d positive, %d zero)" % (n_pos_train, n_neg_train)
        )

    # Draw one positive and one zero row (with replacement) per zero-target
    # row. The draws stay interleaved (positive first) so the sequence of
    # calls into `random` is the same as a row-by-row loop would make.
    over_samp_ct = n_neg_train
    draws = np.array(
        [
            (random.randint(0, n_pos_train - 1), random.randint(0, n_neg_train - 1))
            for _ in range(over_samp_ct)
        ],
        dtype=np.intp,
    )
    pos_idx, neg_idx = draws[:, 0], draws[:, 1]

    # Even slots hold positive samples, odd slots hold zero samples.
    d = X_train.shape[1]
    Over_under_X_train = np.empty((2 * over_samp_ct, d))
    Over_under_y_train = np.empty(2 * over_samp_ct)
    Over_under_X_train[0::2] = X_pos[pos_idx]
    Over_under_y_train[0::2] = y_pos[pos_idx]
    Over_under_X_train[1::2] = X_neg[neg_idx]
    Over_under_y_train[1::2] = y_neg[neg_idx]

    rf = RandomForestRegressor(
        n_estimators=n_estimators, max_features=max_features
    ).fit(Over_under_X_train, Over_under_y_train)

    # Release the large intermediates before loading the validation set.
    del X_pos, y_pos, X_neg, y_neg, Over_under_X_train, Over_under_y_train, draws

    X_valid = pd.read_csv('X_valid_after_classification.csv', sep=',', header=0).values
    y_valid = pd.read_csv('y_valid_after_classification.csv', sep=',', header=None).values.ravel()
    y_valid = np.log(1 + y_valid)
    valid_pred = rf.score(X_valid, y_valid)

    print(time.time() - t)

    y_pred_valid = rf.predict(X_valid)
    # Zero out predictions that fall under the cutoff.
    y_pred_valid[y_pred_valid < cf] = 0

    mse = mean_squared_error(y_valid, y_pred_valid)
    print(valid_pred, mse)
    # `joblib` is whatever the notebook's import cell bound to that name.
    joblib.dump(rf, 'Random_forest_regressor_post_class_best.sav')

    fig = plt.figure()
    plt.scatter(y_valid, y_pred_valid)
    plt.xlabel("True values")
    plt.ylabel("Predictions")
    plt.title("Prediction vs True, MSE: " + str(np.round(mse, 2)))
    plt.savefig('Regression_rf_post_class.png')
    # Close the figure so repeated calls in a worker do not accumulate figures.
    plt.close(fig)

In [16]:
def gbm_regressor(feat):
    """Fit, evaluate and persist a gradient-boosting regressor on a balanced resample.

    Parameters
    ----------
    feat : sequence of length 5
        ``(max_features, max_depth, learning_rate, n_estimators, cf)``. The
        first four are forwarded to ``GradientBoostingRegressor``.
        NOTE(review): ``cf`` is accepted but not applied to the predictions
        here, whereas ``rf_regressor`` zeroes predictions below its cutoff.
        That behaviour is kept as-is; confirm whether the cutoff was meant to
        be applied to this model too.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the training target has no positive rows or no zero rows, since the
        balanced resample cannot be built in that case.

    Side effects
    ------------
    * Reads ``X_train/y_train/X_valid/y_valid_after_classification.csv`` from
      the working directory.
    * Prints the elapsed seconds, then the validation ``score`` and MSE.
    * Writes ``GBM_regressor_post_class_best.sav`` and
      ``Regression_gbm_post_class.png``.

    Notes
    -----
    Targets are transformed with ``log(1 + y)``. Resampling uses the ``random``
    module and is not seeded here, so results vary between runs.
    """
    t = time.time()
    max_features, max_depth, learning_rate, n_estimators, cf = feat

    X_train = pd.read_csv('X_train_after_classification.csv', sep=',', header=0).values
    # The target file is read without a header; it is flattened so boolean
    # masks can index rows directly (assumes a single column -- TODO confirm).
    y_train = pd.read_csv('y_train_after_classification.csv', sep=',', header=None).values.ravel()
    y_train = np.log(1 + y_train)

    # Split rows into positive-target and zero-target groups with boolean masks.
    pos_mask = y_train > 0
    zero_mask = y_train == 0
    X_pos, y_pos = X_train[pos_mask], y_train[pos_mask]
    X_neg, y_neg = X_train[zero_mask], y_train[zero_mask]
    n_pos_train = X_pos.shape[0]
    n_neg_train = X_neg.shape[0]
    if n_pos_train == 0 or n_neg_train == 0:
        raise ValueError(
            "balanced resampling needs both positive and zero targets "
            "(got %d positive, %d zero)" % (n_pos_train, n_neg_train)
        )

    # Draw one positive and one zero row (with replacement) per zero-target
    # row. The draws stay interleaved (positive first) so the sequence of
    # calls into `random` is the same as a row-by-row loop would make.
    over_samp_ct = n_neg_train
    draws = np.array(
        [
            (random.randint(0, n_pos_train - 1), random.randint(0, n_neg_train - 1))
            for _ in range(over_samp_ct)
        ],
        dtype=np.intp,
    )
    pos_idx, neg_idx = draws[:, 0], draws[:, 1]

    # Even slots hold positive samples, odd slots hold zero samples.
    d = X_train.shape[1]
    Over_under_X_train = np.empty((2 * over_samp_ct, d))
    Over_under_y_train = np.empty(2 * over_samp_ct)
    Over_under_X_train[0::2] = X_pos[pos_idx]
    Over_under_y_train[0::2] = y_pos[pos_idx]
    Over_under_X_train[1::2] = X_neg[neg_idx]
    Over_under_y_train[1::2] = y_neg[neg_idx]

    gbm = GradientBoostingRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        learning_rate=learning_rate,
    ).fit(Over_under_X_train, Over_under_y_train)

    # Release the large intermediates before loading the validation set.
    del X_pos, y_pos, X_neg, y_neg, Over_under_X_train, Over_under_y_train, draws

    X_valid = pd.read_csv('X_valid_after_classification.csv', sep=',', header=0).values
    y_valid = pd.read_csv('y_valid_after_classification.csv', sep=',', header=None).values.ravel()
    y_valid = np.log(1 + y_valid)
    valid_pred = gbm.score(X_valid, y_valid)

    print(time.time() - t)

    y_pred_valid = gbm.predict(X_valid)

    mse = mean_squared_error(y_valid, y_pred_valid)
    print(valid_pred, mse)
    # `joblib` is whatever the notebook's import cell bound to that name.
    joblib.dump(gbm, 'GBM_regressor_post_class_best.sav')

    fig = plt.figure()
    plt.scatter(y_valid, y_pred_valid)
    plt.xlabel("True values")
    plt.ylabel("Predictions")
    plt.title("Prediction vs True, MSE: " + str(np.round(mse, 2)))
    plt.savefig('Regression_gbm_post_class.png')
    # Close the figure so repeated calls in a worker do not accumulate figures.
    plt.close(fig)

In [17]:
def which_class(ft):
    """Dispatch one job spec to the matching regressor routine.

    Parameters
    ----------
    ft : sequence
        ``[tag, *params]`` where ``tag`` is ``'rf'`` or ``'gbm'``. The
        remaining items are forwarded, as a list, to ``rf_regressor`` or
        ``gbm_regressor`` respectively.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``tag`` is not a known model tag. Without this check a typo in a
        job spec would be skipped silently and no model would be trained.
    """
    tag, *params = ft
    if tag == 'rf':
        rf_regressor(params)
    elif tag == 'gbm':
        gbm_regressor(params)
    else:
        raise ValueError(
            "unknown regressor tag %r (expected 'rf' or 'gbm')" % (tag,)
        )

In [18]:
# One job spec per model: [tag, *hyper-parameters].
# 'rf'  params are unpacked by rf_regressor as (max_features, n_estimators, cf).
# 'gbm' params are unpacked by gbm_regressor as
#       (max_features, max_depth, learning_rate, n_estimators, cf).
ft = [
    ['rf', 40, 600, 0.3759],
    ['gbm', 39, 9, 0.01, 900, 2.6314],
]

# Run both jobs in parallel, one worker process each; the pool is closed once
# the map call returns or raises.
pool = Pool(2)
try:
    pool.map(which_class, ft)
finally:
    pool.close()

133.23869442939758
0.045430868202191 65.97746174676877
556.3898601531982
0.057861863378204426 65.11826414665323
