In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
%matplotlib inline


### 1. Read Data ###

def read_data (filename, filetype):
    '''
    Read data from file to pandas DataFrame
        filename, filetype: string
    '''
    if filetype == 'csv':
        return pd.read_csv(filename, index_col=0)
    if filetype == 'xls':
        return pd.read_excel(filename)
    if filetype == 'json':
        return pd.read_json(filename)



### 2. Explore Data ###

def data_overview(df):
    print("_________ Data Types __________")
    print(df.dtypes)
    print("_________ Summary Statistics __________")
    print(df.describe().transpose())
    print("_________ Null Values _________")
    print(df.isnull().sum())
    print("_________ Correlation between Y and Xs ________")
    print(df.corr()['SeriousDlqin2yrs'])
    print("_________ Correlation Matrix _________")
    plot_correlations(df)
    

def plot_correlations(df):
    '''
    Plot correlation matrix
    '''
    corr = df.corr()
    sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
    
# removing outliers before making graphs    
def make_graph(df, col_name):
    '''
    Generate a simple graph for the desired column variable
        df: pandas DataFrame
        col_name: desired variable
    '''
    df = df[np.abs(df[col_name]-df[col_name].mean())<=(3*df[col_name].std())]
    print('Plotting ' + col_name)
    df[col_name].hist()
    plt.title(col_name)
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.show()



### 3. Pre-Process Data ###

def convert_col_type(df, col_name, to_type):
    '''
    Convert column types to desired ones
        to_type: string
    '''
    if to_type == 'int':
        df[col] = df[col].astype(int)
    elif to_type == 'float':
        df[col] = df[col].astype(float)
    elif to_type == 'string':
        df[col] = df[col].astype(str)
    elif to_type == 'boolean':
        df[col] = df[col].astype(np.bool)



def fill_null(df, cols, fill_method):
    '''
    Fill null values of the specified columnn in the dataframe
        cols: list of column name(s), string
        fill_method: 'mean' for float, 'median' for integer, 'mode' for string, 
                     'zero', 'drop' for special case         
    '''
    if fill_method == 'mean':
        for col in cols:
            df[col].fillna(value=df[col].mean(), inplace=True)  
    elif fill_method == 'median':
        for col in cols:
            df[col].fillna(value=df[col].median(), inplace=True)
    elif fill_method == 'mode':
        for col in cols:
            df[col].fillna(value=df[col].mode(), inplace=True)
    elif fill_method == 'zero':
        for col in cols:
            df[col].fillna(value=0, inplace=True)
    elif fill_method == 'drop':
        df.dropna(subset=cols, inplace=True)
    return df



### 4. Generate Features/ Predictors ###

def discretize_cont_var(df, col_name, num_bins, cut_type, labels):
    '''
    Discretize a continuous variable of the DataFrame
        df: pandas DataFrame
        col_name, cut_type: string
        nnum_bins: integer
        labels: list of strings
    '''
    if cut_type == 'quantile':
        df[col_name +'_discretize'] = pd.qcut(df[col_name], num_bins, labels=labels)
    if cut_type == 'uniform':
        df[col_name +'_discretize'] = pd.cut(df[col_name], num_bins, labels=labels)
    return df       


def binarize_categ_var(df, col_name):
    '''
    Take a categorical variable and create binary/dummy variables from it
        df: pandas DataFrame
        col_name: string, categorical variable to binarize
    '''
    dummies = pd.get_dummies(df[col_name])
    df = df.join(dummies)
    return df



### 5. Build Classifier ###

def split_data(df, X, y, test_size):
    '''
    Split data into training and test sets
        df: Pandas DataFrame
        X: array of string, features
        y: string, outcome variable
        test_size: proportion of the total data used as test dataset

    '''
    X_train, X_test, y_train, y_test  = train_test_split(df[X], df[y], test_size=test_size)
    return X_train, X_test, y_train, y_test    



def test_model(X_train, y_train, features, method):
    '''
    Build classifiers chosen by the user
        X_train, y_train: Pandas DataFrame
        features: list of strings, variables we care about
        method: LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier(),\
        RandomForestClassifier(), GradientBoostingClassifier()
    '''
    X = X_train[features]
    y = y_train
    return method.fit(X, y)



def predict_model(X_train, y_train, X_test, y, features, method):
    '''
    Predict outcomes for test data based on the chosen classifier, and write to csv file
        X_test: test pandas DataFrame
        y: string, outcome variable name, 'SeriousDlqin2yrs'
        features: list of strings, variables we care about
        method: LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier(),\
        RandomForestClassifier(), GradientBoostingClassifier()
    '''
    method.fit(X_train[features], y_train)
    X = X_test[features]
    y_pred = method.predict(X)
    df_pred = X_test
    df_pred[y] = y_pred
    df_pred.to_csv('predictions'+ str(method) + '.csv', header=True)
    


### 6. Evaluate Classifier ###

def eval_model(X_train, y_train, X_test, y_test, features, method):
    method.fit(X_train[features], y_train)
    X = X_test[features]
    y_pred = method.predict(X)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    print('Evaluation for ' + str(method))
    print('Accuracy score is: {}'.format(accuracy))
    print('Recall score is: {}'.format(recall))
    print('Precision score is: {}'.format(precision))
    print('________#############_______')


    


In [5]:
### code is modified from https://github.com/rayidghani/magicloops/blob/master/magicloops.py


from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing, cross_validation, svm, metrics, tree, decomposition
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier, OrthogonalMatchingPursuit, RandomizedLogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.grid_search import ParameterGrid, GridSearchCV
from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
import random
import pylab as pl
import matplotlib.pyplot as plt
from scipy import optimize
import time
import seaborn as sns
import sys

def define_clfs_params(grid_size):

    clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        'LR': LogisticRegression(penalty='l1', C=1e5),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'DT': DecisionTreeClassifier(),
        'KNN': KNeighborsClassifier(n_neighbors=3),
        'BAG': BaggingClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=5, max_samples=0.6, max_features=1)
            }

    large_grid = { 
    'RF':{'n_estimators': [1,10,100,1000,10000], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.0001,0.001,0.01,0.1,1,10]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']},
    'BAG': {'n_estimators': [5,10,20], 'max_samples':[0.4,0.5,0.6]}
           }
    
    small_grid = {
    'RF':{'n_estimators': [10,100], 'max_depth': [5,50], 'max_features': ['sqrt','log2'],'min_samples_split': [2,10]},
    'LR': { 'penalty': ['l1','l2'], 'C': [0.00001,0.001,0.1,1,10]},
    'AB': { 'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1,10,100,1000,10000]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1,5,10,20,50,100], 'max_features': ['sqrt','log2'],'min_samples_split': [2,5,10]},
    'SVM' :{'C' :[0.00001,0.0001,0.001,0.01,0.1,1,10],'kernel':['linear']},
    'KNN' :{'n_neighbors': [1,5,10,25,50,100],'weights': ['uniform','distance'],'algorithm': ['auto','ball_tree','kd_tree']},
    'BAG': {'n_estimators': [5,10], 'max_samples':[0.4,0.6]}
           }
    
    test_grid = { 
    'RF':{'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'LR': { 'penalty': ['l1'], 'C': [0.01]},
    'AB': { 'algorithm': ['SAMME'], 'n_estimators': [1]},
    'DT': {'criterion': ['gini'], 'max_depth': [1], 'max_features': ['sqrt'],'min_samples_split': [10]},
    'SVM' :{'C' :[0.01],'kernel':['linear']},
    'KNN' :{'n_neighbors': [5],'weights': ['uniform'],'algorithm': ['auto']},
    'BAG':{'n_estimators': [5]}
           }
    
    if grid_size == 'large':
        return clfs, large_grid
    elif grid_size == 'small':
        return clfs, small_grid
    elif grid_size == 'test':
        return clfs, test_grid
    else:
        return 0, 0

        
        
def generate_binary_at_k(y_pred_probs, k):
    '''
    Turn probability estimates into binary at level k
    '''

    cutoff_index = int(len(y_pred_probs) * (k / 100.0))
    y_pred_binary = [1 if x < cutoff_index else 0 for x in range(len(y_pred_probs))]
    return y_pred_binary

def precision_at_k(y_true, y_pred_probs, k):
    '''
    Calculate precision score for probability estimates at level k
    '''

    preds_at_k = generate_binary_at_k(y_pred_probs, k)
    precision = precision_score(y_true, preds_at_k)
    return precision

def recall_at_k(y_true, y_pred_probs, k):
    '''
    Calculate recall score for probability estimates at level k
    '''

    preds_at_k = generate_binary_at_k(y_pred_probs, k)
    recall = recall_score(y_true, preds_at_k)
    return recall

def plot_precision_recall_n(y_true, y_pred_probs, model_name):
    '''
    Plot precision recall curve
    '''

    from sklearn.metrics import precision_recall_curve
    y_score = y_pred_probs
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    plt.figure()
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score>=value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)
    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')

    plt.title(model_name)
    plt.savefig('Graphs from evaluation/'+model_name)
    plt.close()
        
        
def clf_loop(models_to_run, clfs, grid, X, y, test_size, output=True):
   '''
    Run over certain clfs and grid, and evaluate each classifier by certain metrics
    Inputs:
        models_to_run: list of strings, classifiers to test
        clfs: dictionary of classifiers 
        grid: dictionary of parameters grid 
        X: pandas DataFrame, features
        y: pandas DataFrame
        test_size: float [0.0, 1.0], proportion of the total data used as test dataset
        output: boolean, save graphs and save result dataframe to csv file
    Output:
        pandas DataFrame, including classifiers, parameters, runtime, and evaluation scores
    '''

    results_df =  pd.DataFrame(columns=('model_type','clf', 'parameters', 'train_time', 'test_time',
                                        'accuracy','f1_score', 'precision', 'recall', 'auc', 
                                        'p_at_5', 'p_at_10', 'p_at_20',
                                        'r_at_5', 'r_at_10', 'r_at_20'))
    for n in range(1, 2):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=0)
        for index,clf in enumerate([clfs[x] for x in models_to_run]):
            print (models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    train_start = time.time()
                    clf.fit(X_train, y_train)
                    train_end = time.time()
                    train_time = train_end - train_start

                    test_start = time.time()
                    y_pred = clf.predict(X_test)
                    test_end = time.time()
                    test_time = test_end - test_start

                    y_pred_probs = clf.predict_proba(X_test)[:,1]
                    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True))
                    results_df.loc[len(results_df)] = [models_to_run[index],clf, p, train_time, test_time,
                                                       accuracy_score(y_test, y_pred),
                                                       f1_score(y_test, y_pred),
                                                       precision_score(y_test, y_pred),
                                                       recall_score(y_test, y_pred),
                                                       roc_auc_score(y_test, y_pred),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,5.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,10.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,20.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted,5.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted,10.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted,20.0)]

                    if output:
                        model_name = models_to_run[index] + str(len(results_df))
                        plot_precision_recall_n(y_test, y_pred_probs,model_name)
                except IndexError as e:
                    print ('Error:',e)
                    continue
    if output:
        results_df.to_csv('clf_evaluations.csv')
    return results_df




In [6]:
X = np.array(['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'age', 'MonthlyIncome', 'NumberOfTimes90DaysLate'])
y = 'SeriousDlqin2yrs'
models_to_run=['RF','LR','DT','KNN','AB','BAG']
grid_size = 'test'

df = read_data('credit-data.csv', 'csv')
df['SeriousDlqin2yrs'] = df['SeriousDlqin2yrs'].astype(np.bool)
fill_null(df, ['MonthlyIncome'], 'mean')
clfs, grid = define_clfs_params(grid_size)
clf_loop(models_to_run, clfs, grid, df[X], df[y], 0.25)

RF


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


LR
DT


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


KNN
AB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


BAG


Unnamed: 0,model_type,clf,parameters,train_time,test_time,accuracy,f1_score,precision,recall,auc,p_at_5,p_at_10,p_at_20,r_at_5,r_at_10,r_at_20
0,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 1, 'max_features': 'sqrt', 'min_...",0.152596,0.004184,0.93208,0.0,0.0,0.0,0.5,1.0,0.529067,0.264533,0.73616,0.778956,0.778956
1,LR,"LogisticRegression(C=0.01, class_weight=None, ...","{'C': 0.01, 'penalty': 'l1'}",0.418475,0.003895,0.9324,0.028363,0.596774,0.014527,0.506906,0.1392,0.1296,0.116933,0.102473,0.190813,0.344327
2,DT,"DecisionTreeClassifier(class_weight=None, crit...","{'criterion': 'gini', 'max_depth': 1, 'max_fea...",0.037945,0.001697,0.93208,0.0,0.0,0.0,0.5,0.464,0.672533,0.3396,0.341578,0.990185,1.0
3,KNN,"KNeighborsClassifier(algorithm='auto', leaf_si...","{'algorithm': 'auto', 'n_neighbors': 5, 'weigh...",0.19189,2.289668,0.930107,0.051393,0.328704,0.027876,0.511864,0.397867,0.2536,0.1268,0.292894,0.37338,0.37338
4,AB,"(DecisionTreeClassifier(class_weight=None, cri...","{'algorithm': 'SAMME', 'n_estimators': 1}",0.082943,0.003267,0.93208,0.0,0.0,0.0,0.5,0.464,0.672533,0.3396,0.341578,0.990185,1.0
5,BAG,"(DecisionTreeClassifier(class_weight=None, cri...",{'n_estimators': 5},0.720283,0.02201,0.932053,0.001567,0.4,0.000785,0.50035,0.263467,0.2376,0.195733,0.193954,0.349823,0.576364


<matplotlib.figure.Figure at 0x101eec6a0>

<matplotlib.figure.Figure at 0x10e7d2cc0>

<matplotlib.figure.Figure at 0x108a1d320>

<matplotlib.figure.Figure at 0x10e5ebcf8>

<matplotlib.figure.Figure at 0x10e625cf8>

<matplotlib.figure.Figure at 0x10ee53e80>

In [7]:
X = np.array(['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio', 'age', 'MonthlyIncome', 'NumberOfTimes90DaysLate'])
y = 'SeriousDlqin2yrs'
models_to_run=['RF','LR','DT','KNN','AB','BAG']
grid_size = 'small'

df = read_data('credit-data.csv', 'csv')
df['SeriousDlqin2yrs'] = df['SeriousDlqin2yrs'].astype(np.bool)
fill_null(df, ['MonthlyIncome'], 'mean')
clfs, grid = define_clfs_params(grid_size)
clf_loop(models_to_run, clfs, grid, df[X], df[y], 0.25)

RF
LR


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


DT


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

KNN




AB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


BAG


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,model_type,clf,parameters,train_time,test_time,accuracy,f1_score,precision,recall,auc,p_at_5,p_at_10,p_at_20,r_at_5,r_at_10,r_at_20
0,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'sqrt', 'min_...",0.672108,0.107386,0.934187,0.193464,0.576998,0.116215,0.555003,0.442133,0.326933,0.221067,0.325481,0.481351,0.650962
1,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'sqrt', 'min_...",5.291078,0.227809,0.934560,0.174293,0.609412,0.101688,0.548470,0.442667,0.324800,0.222667,0.325874,0.478210,0.655673
2,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'sqrt', 'min_...",0.534235,0.107933,0.934560,0.203764,0.586916,0.123282,0.558480,0.446933,0.320800,0.221200,0.329015,0.472320,0.651355
3,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'sqrt', 'min_...",5.182057,0.224994,0.934453,0.179573,0.599109,0.105614,0.550232,0.440533,0.323733,0.223333,0.324303,0.476639,0.657636
4,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'log2', 'min_...",0.651213,0.108530,0.932960,0.071640,0.602484,0.038084,0.518126,0.441067,0.324000,0.221333,0.324696,0.477032,0.651747
5,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'log2', 'min_...",5.270168,0.325451,0.934587,0.176016,0.609302,0.102866,0.549030,0.444267,0.326400,0.222800,0.327051,0.480565,0.656066
6,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'log2', 'min_...",0.646718,0.110097,0.933840,0.167170,0.576389,0.097762,0.546263,0.445333,0.320533,0.222400,0.327837,0.471928,0.654888
7,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 5, 'max_features': 'log2', 'min_...",5.179630,0.231905,0.934773,0.155387,0.644699,0.088339,0.542396,0.444267,0.323733,0.223067,0.327051,0.476639,0.656851
8,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 50, 'max_features': 'sqrt', 'min...",1.584396,0.107834,0.930080,0.216378,0.453066,0.142128,0.564813,0.387200,0.338933,0.228400,0.285041,0.499018,0.672556
9,RF,"(DecisionTreeClassifier(class_weight=None, cri...","{'max_depth': 50, 'max_features': 'sqrt', 'min...",14.235851,0.536350,0.932293,0.244570,0.504914,0.161366,0.574918,0.411200,0.300800,0.204800,0.302709,0.442874,0.603062


<matplotlib.figure.Figure at 0x10eba8b38>

<matplotlib.figure.Figure at 0x10fd21278>

<matplotlib.figure.Figure at 0x101f01f98>

<matplotlib.figure.Figure at 0x10e7f30f0>

<matplotlib.figure.Figure at 0x10ec087b8>

<matplotlib.figure.Figure at 0x10e54e4a8>

<matplotlib.figure.Figure at 0x10fd231d0>

<matplotlib.figure.Figure at 0x10ee15e80>

<matplotlib.figure.Figure at 0x10f22cd68>

<matplotlib.figure.Figure at 0x10ede8cc0>

<matplotlib.figure.Figure at 0x101baaba8>

<matplotlib.figure.Figure at 0x1107d1e80>

<matplotlib.figure.Figure at 0x1120ba908>

<matplotlib.figure.Figure at 0x1126185f8>

<matplotlib.figure.Figure at 0x1173bed68>

<matplotlib.figure.Figure at 0x1109cc080>

<matplotlib.figure.Figure at 0x112236208>

<matplotlib.figure.Figure at 0x112286b38>

<matplotlib.figure.Figure at 0x1158ec668>

<matplotlib.figure.Figure at 0x1198f8eb8>

<matplotlib.figure.Figure at 0x11278d470>

<matplotlib.figure.Figure at 0x119a2e7f0>

<matplotlib.figure.Figure at 0x1180e38d0>

<matplotlib.figure.Figure at 0x119f9e320>

<matplotlib.figure.Figure at 0x11ac33f28>

<matplotlib.figure.Figure at 0x11a6bf978>

<matplotlib.figure.Figure at 0x11ac9ad30>

<matplotlib.figure.Figure at 0x11c699c88>

<matplotlib.figure.Figure at 0x11f9eb630>

<matplotlib.figure.Figure at 0x11c7815f8>

<matplotlib.figure.Figure at 0x11c05b128>

<matplotlib.figure.Figure at 0x11c781940>

<matplotlib.figure.Figure at 0x11b8964e0>

<matplotlib.figure.Figure at 0x11c5811d0>

<matplotlib.figure.Figure at 0x11bbd5b00>

<matplotlib.figure.Figure at 0x11fb36c88>

<matplotlib.figure.Figure at 0x1218deac8>

<matplotlib.figure.Figure at 0x1212f1ba8>

<matplotlib.figure.Figure at 0x11be69160>

<matplotlib.figure.Figure at 0x11be8b2b0>

<matplotlib.figure.Figure at 0x1234d19e8>

<matplotlib.figure.Figure at 0x12398f828>

<matplotlib.figure.Figure at 0x123522a90>

<matplotlib.figure.Figure at 0x11d71f9e8>

<matplotlib.figure.Figure at 0x1244282e8>

<matplotlib.figure.Figure at 0x1235c2780>

<matplotlib.figure.Figure at 0x1236dc828>

<matplotlib.figure.Figure at 0x12441e160>

<matplotlib.figure.Figure at 0x122cc9358>

<matplotlib.figure.Figure at 0x122d69a90>

<matplotlib.figure.Figure at 0x122d79f98>

<matplotlib.figure.Figure at 0x123089ba8>

<matplotlib.figure.Figure at 0x124953a58>

<matplotlib.figure.Figure at 0x12498c5f8>

<matplotlib.figure.Figure at 0x12580e518>

<matplotlib.figure.Figure at 0x12519c9b0>

<matplotlib.figure.Figure at 0x125d3ff28>

<matplotlib.figure.Figure at 0x124149898>

<matplotlib.figure.Figure at 0x125adaa58>

<matplotlib.figure.Figure at 0x12540f5f8>

<matplotlib.figure.Figure at 0x12638acf8>

<matplotlib.figure.Figure at 0x12639cc18>

<matplotlib.figure.Figure at 0x126491a20>

<matplotlib.figure.Figure at 0x126c4de80>

<matplotlib.figure.Figure at 0x124ffdc88>

<matplotlib.figure.Figure at 0x1269f7748>

<matplotlib.figure.Figure at 0x126afe630>

<matplotlib.figure.Figure at 0x1272eeba8>

<matplotlib.figure.Figure at 0x127341518>

<matplotlib.figure.Figure at 0x126a48d30>

<matplotlib.figure.Figure at 0x1274f4ac8>

<matplotlib.figure.Figure at 0x1270df7b8>

<matplotlib.figure.Figure at 0x1270d4ba8>

<matplotlib.figure.Figure at 0x1283d4da0>

<matplotlib.figure.Figure at 0x124c102e8>

<matplotlib.figure.Figure at 0x1283e5710>

<matplotlib.figure.Figure at 0x12808f828>

<matplotlib.figure.Figure at 0x1271029b0>

<matplotlib.figure.Figure at 0x128476550>

<matplotlib.figure.Figure at 0x128e51f28>

<matplotlib.figure.Figure at 0x128e20c18>

<matplotlib.figure.Figure at 0x128c62cf8>

<matplotlib.figure.Figure at 0x127bbce10>

<matplotlib.figure.Figure at 0x128e9bbe0>

<matplotlib.figure.Figure at 0x126198320>

<matplotlib.figure.Figure at 0x128ac2898>

<matplotlib.figure.Figure at 0x126c87c18>

<matplotlib.figure.Figure at 0x127a37208>

<matplotlib.figure.Figure at 0x1261c8710>

<matplotlib.figure.Figure at 0x127a4b908>

<matplotlib.figure.Figure at 0x1297de2e8>

<matplotlib.figure.Figure at 0x127a8ae10>

<matplotlib.figure.Figure at 0x127a8ad30>

<matplotlib.figure.Figure at 0x124dbf630>

<matplotlib.figure.Figure at 0x12a6da780>

<matplotlib.figure.Figure at 0x12a863898>

<matplotlib.figure.Figure at 0x12a859208>

<matplotlib.figure.Figure at 0x12b49c630>

<matplotlib.figure.Figure at 0x12af33518>

<matplotlib.figure.Figure at 0x128eec358>

<matplotlib.figure.Figure at 0x12a762cc0>

<matplotlib.figure.Figure at 0x128ed8ac8>

<matplotlib.figure.Figure at 0x12a264f60>

<matplotlib.figure.Figure at 0x12ce68908>

<matplotlib.figure.Figure at 0x12b4f1748>

<matplotlib.figure.Figure at 0x12c0e8208>

<matplotlib.figure.Figure at 0x1299e0a58>

<matplotlib.figure.Figure at 0x12cda1cc0>

<matplotlib.figure.Figure at 0x129e04b38>

<matplotlib.figure.Figure at 0x12be23048>

<matplotlib.figure.Figure at 0x12d0c0278>

<matplotlib.figure.Figure at 0x12be914a8>

<matplotlib.figure.Figure at 0x12d5eb048>

<matplotlib.figure.Figure at 0x12bca4438>

<matplotlib.figure.Figure at 0x12c994860>

<matplotlib.figure.Figure at 0x12d26db00>

<matplotlib.figure.Figure at 0x12d7374e0>

<matplotlib.figure.Figure at 0x12dd61c88>

<matplotlib.figure.Figure at 0x12d630a20>

<matplotlib.figure.Figure at 0x12f145470>

<matplotlib.figure.Figure at 0x12dda6438>

<matplotlib.figure.Figure at 0x12ee4def0>

<matplotlib.figure.Figure at 0x133625400>

<matplotlib.figure.Figure at 0x12ee66a90>

<matplotlib.figure.Figure at 0x12df76588>

<matplotlib.figure.Figure at 0x12c93ffd0>

<matplotlib.figure.Figure at 0x12ee9cf60>

<matplotlib.figure.Figure at 0x10e5a8cf8>

<matplotlib.figure.Figure at 0x12f440390>

<matplotlib.figure.Figure at 0x12df9fd30>

<matplotlib.figure.Figure at 0x12f62ef60>

<matplotlib.figure.Figure at 0x12f378a90>

<matplotlib.figure.Figure at 0x1304d9da0>

<matplotlib.figure.Figure at 0x12f693e48>

<matplotlib.figure.Figure at 0x1304eca58>

<matplotlib.figure.Figure at 0x1303dc9b0>

<matplotlib.figure.Figure at 0x12e60c7b8>

<matplotlib.figure.Figure at 0x12e604f98>

<matplotlib.figure.Figure at 0x1329d70f0>

<matplotlib.figure.Figure at 0x12fcb7f28>

<matplotlib.figure.Figure at 0x130df2cf8>

<matplotlib.figure.Figure at 0x13334add8>

<matplotlib.figure.Figure at 0x1329caf28>

<matplotlib.figure.Figure at 0x1335c16a0>

<matplotlib.figure.Figure at 0x13496b588>

<matplotlib.figure.Figure at 0x132d957f0>

<matplotlib.figure.Figure at 0x134851ef0>

<matplotlib.figure.Figure at 0x134174908>

In [14]:
result = pd.read_csv('clf_evaluations.csv', index_col=0)
METRICS = ['auc', 'f1_score','precision','recall','accuracy','train_time']

In [15]:
def select_best(result, metric):
    best_models_by_metric = result.sort(metric, ascending=False).head(10)
    return best_models_by_metric

In [22]:
for metric in METRICS:
    print(select_best(result, metric)[['model_type', metric]])


   model_type       auc
96         DT  0.590677
51         DT  0.589102
56         DT  0.588999
53         DT  0.588304
92         DT  0.588154
87         DT  0.587312
54         DT  0.587006
91         DT  0.586968
93         DT  0.586758
50         DT  0.586452
    model_type  f1_score
143         AB  0.255724
76          DT  0.255281
14          RF  0.253940
141         AB  0.253892
142         AB  0.253825
137         AB  0.253487
138         AB  0.251664
42          DT  0.250815
46          DT  0.249744
136         AB  0.249006
    model_type  precision
145        BAG   1.000000
128        KNN   0.695652
104        KNN   0.695652
116        KNN   0.680000
106        KNN   0.666667
130        KNN   0.666667
118        KNN   0.666667
32          DT   0.666667
7           RF   0.644699
114        KNN   0.634615
   model_type    recall
56         DT  0.233216
53         DT  0.231252
92         DT  0.230467
50         DT  0.230467
95         DT  0.226148
89         DT  0.225756
96     

  from ipykernel import kernelapp as app
