In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [2]:
def random_forest(x_train_filename, x_test_filename, y_train_filename, y_test_filename,
                  n_estimators=490, random_state=None):
    """Train a random forest on CSV data and print binary-classification metrics.

    All four files are read as header-less CSVs. The two label files are
    flattened to 1-D, so each is expected to hold a single label column.

    The evaluation assumes 0/1 labels: the score used for thresholding and
    for ROC AUC is column 1 of ``predict_proba``, and the hard predictions
    built from it are the literal values 0 and 1.
    NOTE(review): confirm the label files really are encoded as 0/1.

    Parameters
    ----------
    x_train_filename, x_test_filename : str or path-like
        CSV files with the training / test feature rows.
    y_train_filename, y_test_filename : str or path-like
        CSV files with the training / test labels.
    n_estimators : int, default=490
        Number of trees in the forest.
    random_state : int or None, default=None
        Seed forwarded to ``RandomForestClassifier``. ``None`` keeps the
        previous unseeded behaviour; pass an int to get repeatable metrics.

    Returns
    -------
    None
        The weighted F1 score, the ROC AUC and the confusion matrix are
        printed, in that order.
    """
    # Load features as 2-D arrays.
    X_train = pd.read_csv(x_train_filename, header=None).to_numpy()
    X_test = pd.read_csv(x_test_filename, header=None).to_numpy()

    # Load labels and flatten them: scikit-learn expects y of shape
    # (n_samples,), not an (n_samples, 1) column vector.
    Y_train = pd.read_csv(y_train_filename, header=None).to_numpy().ravel()
    Y_test = pd.read_csv(y_test_filename, header=None).to_numpy().ravel()

    # Create the model with some trees
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        bootstrap=True,
        max_features='sqrt',
        random_state=random_state,
    )

    # Fit on training data
    model.fit(X_train, Y_train)

    # Probability of the second class in model.classes_ (class 1 for 0/1 labels).
    rf_probs = model.predict_proba(X_test)[:, 1]

    # Convert probabilities into hard 0/1 predictions at the 0.5 threshold.
    y_hat = (rf_probs >= 0.5).astype(int)

    # f1: 2 tp / (2 tp + fp + fn)
    # y_hat can only contain 0 or 1, so only those labels are scored. With
    # support-weighted averaging a label that never occurs in Y_test has
    # weight zero, so this matches the earlier labels=range(5) result.
    f1 = f1_score(Y_test, y_hat, labels=[0, 1], average='weighted')
    f1_str = 'F1 score: %f' % f1
    print(f1_str)

    # ROC AUC is computed from the continuous scores, not the thresholded labels.
    roc = roc_auc_score(Y_test, rf_probs)
    roc_str = 'ROC: %f' % roc
    print(roc_str)

    # Confusion matrix of true labels against thresholded predictions.
    cm = confusion_matrix(Y_test, y_hat)
    print(cm)
    