In [None]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Import train & test data

In [None]:
# Read the labelled training table from disk.
b_train = pd.read_csv(filepath_or_buffer='data/b_train.csv')
# Sanity check: echo the (rows, columns) shape once loading has succeeded.
print('b_train: OK: {}'.format((len(b_train.index), len(b_train.columns))))
# For a closer look, run b_train.head() or b_train.describe() in a scratch cell.

In [None]:
# Read the table of rows that need predictions for the submission.
b_test = pd.read_csv(filepath_or_buffer='data/b_test.csv')
# Sanity check: echo the (rows, columns) shape once loading has succeeded.
print('b_test: OK: {}'.format((len(b_test.index), len(b_test.columns))))
# For a closer look, run b_test.head() or b_test.describe() in a scratch cell.

## Prepare & split data for visualisation

In [None]:
# Discard every row that has a missing value in any column, in both tables.
b_train, b_test = b_train.dropna(), b_test.dropna()

# Author's ranking of the features, most important first:
# 'total_bids', 'bids_per_auction', 'mean_time_diff', 'total_auctions',
# 'ip_entropy', 'url_entropy', 'min_response', 'mean_response'

# Every candidate feature column.
all_features = [
    'total_bids', 'total_auctions', 'bids_per_auction',
    'mean_time_diff', 'mean_response', 'min_response',
    'ip_entropy', 'url_entropy',
]

# Columns actually fed to the model: all candidates except the two
# response-time columns, in their original order.
features = [name for name in all_features
            if name not in ('mean_response', 'min_response')]
target = ['outcome']

# Feature matrix and flat label vector built from the labelled rows.
X = b_train[features].values
y = b_train[target].values.ravel()
print('X.shape = {}\ny.shape = {}'.format(X.shape, y.shape))

# Same feature columns for the rows to be predicted for the submission.
X_submission = b_test[features].values
print('X_submission.shape = {}'.format(X_submission.shape))

In [None]:
# Hold out 33% of the labelled rows; the fixed seed keeps this split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Two-column indicator form of y_test:
# column 0 holds (1 - label), column 1 holds the label itself.
y_test_transformed = np.column_stack((1 - y_test, y_test))

---
# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Keyword settings for a random forest.
# NOTE(review): no estimator in the cells shown here is built from this dict.
param = dict(bootstrap=True, class_weight='balanced')

In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes
# (as the cell's only expression, the string itself is what the cell evaluates to).
# The code inside sweeps max_depth over 1..9 on the train/test split, scores each
# model with weighted ROC-AUC against y_test_transformed, and reports the best depth.
'''
auc_list = []
for i in range(1,10):
    RF_classifier = RandomForestClassifier(max_depth=i, random_state=0)
    RF_classifier.fit(X_train, y_train)
    auc_list.append(roc_auc_score(y_test_transformed, RF_classifier.predict_proba(X_test), average='weighted'))
best_max_depth = np.array(auc_list).argmax() + 1
print('best max_depth = {}'.format(best_max_depth)) # 2
print('roc-auc for it = {}'.format(np.array(auc_list).max()))
'''

### Cross validation (just testing)

In [None]:
# Smoke test of the cross-validation setup: five stratified shuffle splits,
# each holding out 33% of the rows, scored with ROC-AUC.
# Neither the splitter nor the forest is seeded (random_state=None).
CV_SSS = StratifiedShuffleSplit(n_splits=5, test_size=0.33, random_state=None)

RF_classifier = RandomForestClassifier(max_depth=3, random_state=None)
RF_cross_val_scoring = cross_val_score(
    estimator=RF_classifier, X=X, y=y, scoring='roc_auc', cv=CV_SSS
)
print(RF_cross_val_scoring)

# One-line summary of the per-split scores.
print('mean: {:.4f}, max: {:.4f}, min: {:.4f}, std: {:.4f}'.format(
    np.mean(RF_cross_val_scoring),
    np.max(RF_cross_val_scoring),
    np.min(RF_cross_val_scoring),
    np.std(RF_cross_val_scoring),
))

### Hyperparameters

Choose the `max_depth` hyperparameter by repeating a cross-validated search and keeping the depth that wins most often.

In [None]:
# Repeat the max_depth search 20 times and record the winning depth of each
# repetition; the histogram at the end shows how stable that choice is.
candidate_depths = list(range(2, 8))
global_depth = []
for j in range(20):
    # The splitter is unseeded (random_state=None), so repetitions are not
    # expected to reuse the same splits.
    CV_SSS = StratifiedShuffleSplit(n_splits = 6, test_size = 0.33, random_state=None)
    auc_list = []
    for i in candidate_depths:
        RF_classifier = RandomForestClassifier(max_depth=i, random_state=None)
        RF_cross_val_scoring = cross_val_score(RF_classifier, X, y, scoring='roc_auc', cv=CV_SSS)
        # Mean ROC-AUC over the six splits for this depth.
        auc_list.append(RF_cross_val_scoring.mean())
    # Translate the argmax position back into a depth value. The grid starts
    # at 2, so `argmax() + 1` would report a depth one below the real winner
    # (and could report depth 1, which is never evaluated).
    best_max_depth = candidate_depths[int(np.argmax(auc_list))]
    global_depth.append(best_max_depth)
global_depth, pd.DataFrame(global_depth).hist()

In [None]:
# Depth selected most often across the repetitions (first entry returned by mode()).
best_max_depth = pd.Series(global_depth).mode().iloc[0]
best_max_depth

In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes
# (as the cell's only expression, the string itself is what the cell evaluates to).
# The code inside cross-validates a forest at best_max_depth, then repeats the
# cross-validation for forest seeds 0..19 and picks the seed with the best mean ROC-AUC.
# NOTE(review): the last line inside the string uses `rs`, which is not assigned
# anywhere in the code shown here; it would raise NameError if re-enabled as is.
'''
CV_SSS = StratifiedShuffleSplit(n_splits = 5, test_size = 0.33, random_state=42)
#for i in range(1,100):
RF_classifier = RandomForestClassifier(max_depth=best_max_depth, random_state=0)
RF_cross_val_scoring = cross_val_score(RF_classifier, X, y, scoring='roc_auc', cv=CV_SSS)
print(RF_cross_val_scoring)
print('mean: {:.4f}, max: {:.4f}, min: {:.4f}, std: {:.4f}'.format(RF_cross_val_scoring.mean(),
                                                               RF_cross_val_scoring.max(), 
                                                               RF_cross_val_scoring.min(),
                                                               RF_cross_val_scoring.std()))

global_aucs = []
for j in range(20):
    CV_SSS = StratifiedShuffleSplit(n_splits = 6, test_size = 0.33, random_state=None)
    auc_list = []
    for i in range(20):
        RF_classifier = RandomForestClassifier(max_depth=best_max_depth, random_state=i)
        RF_cross_val_scoring = cross_val_score(RF_classifier, X, y, scoring='roc_auc', cv=CV_SSS)
        auc_list.append(RF_cross_val_scoring.mean())
    #best_max_depth = np.array(auc_list).argmax() + 1
    global_aucs.append(auc_list)
#global_depth, pd.DataFrame(global_depth).hist()

best_random_state = np.array(global_aucs).mean(axis=0).argmax()
best_random_state
random_states = np.unravel_index(rs, np.array(global_aucs).shape)
'''

### Cross validation

In [None]:
# Cross-validate a forest at the selected depth: five stratified shuffle
# splits, each holding out 33% of the rows, scored with ROC-AUC.
# Neither the splitter nor the forest is seeded (random_state=None).
CV_SSS = StratifiedShuffleSplit(n_splits=5, test_size=0.33, random_state=None)

RF_classifier = RandomForestClassifier(max_depth=best_max_depth, random_state=None)
RF_cross_val_scoring = cross_val_score(
    estimator=RF_classifier, X=X, y=y, scoring='roc_auc', cv=CV_SSS
)
print(RF_cross_val_scoring)

# One-line summary of the per-split scores.
print('mean: {:.4f}, max: {:.4f}, min: {:.4f}, std: {:.4f}'.format(
    np.mean(RF_cross_val_scoring),
    np.max(RF_cross_val_scoring),
    np.min(RF_cross_val_scoring),
    np.std(RF_cross_val_scoring),
))

### ROC curve on test data (from train_test_split)

In [None]:
# Fit on the training part of the hold-out split and score the held-out rows;
# y_score has one probability column per class.
y_score = RF_classifier.fit(X_train, y_train).predict_proba(X_test)

# Relative weight the fitted forest gives each feature column.
print(RF_classifier.feature_importances_)

# Weighted ROC-AUC of the class probabilities against the two-column labels.
auc_score = roc_auc_score(y_test_transformed, y_score, average='weighted')
print('roc-auc = {}'.format(auc_score))

In [None]:
# ROC curve and area for each of the two label columns:
# index 0 scores the (1 - label) column, index 1 the label column.
fpr, tpr, roc_auc = {}, {}, {}
for i in (0, 1):
    fpr[i], tpr[i], _ = roc_curve(y_test_transformed[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot only the curve for column 1, next to the chance diagonal.
lw = 2
fig, ax = plt.subplots()
ax.plot(fpr[1], tpr[1], color='darkorange', lw=lw,
        label='ROC curve (area = %0.2f)' % roc_auc[1])
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
plt.show()

### Predict on the test data (for the submission)

In [None]:
# Refit on every labelled row, then score the submission rows;
# column 1 of predict_proba is kept as the prediction.
submission_prediction = RF_classifier.fit(X, y).predict_proba(X_submission)[:, 1]
print(submission_prediction)

In [None]:
# Store the predicted probabilities next to the rows they were computed from.
b_test.loc[:, 'prediction'] = submission_prediction
b_test.head(n=5)

In [None]:
# Load an existing submission file; it is used as the base table that the
# new predictions are written into.
b_test_RF = pd.read_csv(filepath_or_buffer='data/SubmissionRF2_3.csv')
b_test_RF.head(n=5)

In [None]:
# Overwrite the 'prediction' of every submission row whose bidder_id also
# appears in b_test; rows for other bidders keep the value they already had.
# A single lookup table replaces the per-bidder loop, which rebuilt
# list(b_test.bidder_id) and rescanned both frames on every iteration.

# One prediction per bidder_id; on duplicates the first row wins, which
# matches the `[0]` pick of the original loop.
prediction_by_bidder = (
    b_test.drop_duplicates(subset='bidder_id')
          .set_index('bidder_id')['prediction']
)

# Submission rows that have a fresh prediction available.
has_new_prediction = b_test_RF['bidder_id'].isin(prediction_by_bidder.index)

# Skip the assignment when nothing matches so the frame is left untouched.
if has_new_prediction.any():
    # .values makes the assignment positional, independent of index labels.
    b_test_RF.loc[has_new_prediction, 'prediction'] = (
        b_test_RF.loc[has_new_prediction, 'bidder_id']
                 .map(prediction_by_bidder)
                 .values
    )

In [None]:
# Write the updated submission as comma-separated text with a header row
# (both are the to_csv defaults) and without the index column.
b_test_RF.to_csv('data/SubmissionRF3_6.csv', index=False)