In [1]:
import numpy as np
import pandas as pd
from time import time
from functools import reduce
from collections import deque
from sklearn import tree
from metrix import accuracy, confusion_matrix
import scipy.stats as st

from DecisionTreeClassifier import DecisionTreeClassifier
from RandomForestClassifier import RandomForestClassifier

In [2]:
def train_test_split(X, ratio=0.7):
    X = X.sample(frac=1).reset_index(drop=True)
    return X[:int(len(X) * ratio)], X[int(len(X) * ratio):]

# Helper functions for Grid Search

def cartesian_product(arr1, arr2):
    product = []
    for i in arr1:
        for j in arr2:
            if isinstance(i, tuple):
                # (1, 2) and 5 -> (1, 2, 5)
                product.append((*i,j))
            else:
                # 1 and 2 -> (1, 2)
                product.append((i,j))
    return product

def all_possible_param_combinations(params):
    return reduce(cartesian_product, map(lambda param_name: params[param_name], params))

In [3]:
def grid_search_decision_tree(params_to_optimize, params, X_train, y_train, X_test, y_test):
    all_possibilities = all_possible_param_combinations(params_to_optimize)
    best_accuracy = 0
    best_model = None
    for index, possibility in enumerate(all_possibilities):
        model = DecisionTreeClassifier(
            min_members=possibility[0], 
            criterion=params[1],
            tol=params[0],
            max_depth=possibility[1], 
            split_method=params[2],
            max_features=possibility[2]
        )
        a=time()
        model.fit(X_train, y_train)
        b=time()
        print('model', index + 1)
        print('trained in', b-a, 'seconds')
        accuracy = model.score(X_test, y_test)
        print('accuracy: ', accuracy)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
    
    return best_accuracy, best_model

In [4]:
def grid_search_random_forest(params_to_optimize, params, X_train, y_train, X_test, y_test):
    all_possibilities = all_possible_param_combinations(params_to_optimize)
    best_accuracy = 0
    best_model = None
    for index, possibility in enumerate(all_possibilities):
        model = RandomForestClassifier(
            n_trees=possibility[3],
            min_members=possibility[0], 
            criterion=params[1],
            tol=params[0],
            max_depth=possibility[1], 
            split_method=params[2],
            max_features=possibility[2]
        )
        a=time()
        model.fit(X_train, y_train)
        b=time()
        print('model', index + 1)
        print('trained in', b-a, 'seconds')
        accuracy = model.score(X_test, y_test)
        print('accuracy: ', accuracy)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
    return best_accuracy, best_model

In [5]:
df = pd.read_csv('weatherAUS.csv')

In [6]:
df.dropna(inplace=True)
df.drop('Date', axis=1, inplace=True)
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
5939,Cobar,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,SW,...,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No,0.0,No
5940,Cobar,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,SSE,...,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No,0.0,No
5942,Cobar,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,NNW,...,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No,0.0,No
5943,Cobar,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,WSW,...,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No,0.0,No
5944,Cobar,24.2,41.0,0.0,11.2,8.4,WNW,35.0,NW,WNW,...,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No,0.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139108,Darwin,19.3,33.4,0.0,6.0,11.0,ENE,35.0,SE,NE,...,32.0,1013.9,1010.5,0.0,1.0,24.5,32.3,No,0.0,No
139109,Darwin,21.2,32.6,0.0,7.6,8.6,E,37.0,SE,SE,...,28.0,1014.6,1011.2,7.0,0.0,24.8,32.0,No,0.0,No
139110,Darwin,20.7,32.8,0.0,5.6,11.0,E,33.0,E,W,...,23.0,1015.3,1011.8,0.0,0.0,24.8,32.1,No,0.0,No
139111,Darwin,19.5,31.8,0.0,6.2,10.6,ESE,26.0,SE,NNW,...,58.0,1014.9,1010.7,1.0,1.0,24.8,29.2,No,0.0,No


In [7]:
for feature in ['Location', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']:
    df[feature] = df[feature].astype('category')
    df[feature] = df[feature].cat.codes

In [39]:
corr = df.corr()

In [40]:
df = df.loc[:, (corr.iloc[-1] < -0.2) | (corr.iloc[-1] > 0.2)]
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [41]:
train_data, test_data = train_test_split(df)

In [42]:
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

In [43]:
params_to_optimize = {
    'min_members': [5, 10, 15, 20, 30, 50],
    'max_depth': [2, 3, 4, 5],
    'max_features': [1, 2, 3]
}

In [44]:
best_accuracy, best_model = grid_search_decision_tree(params_to_optimize, (None, 'gini', 'binary'), X_train, y_train, X_test, y_test)

model 1
trained in 0.025403499603271484 seconds
accuracy:  0.824390243902439
model 2
trained in 0.0293121337890625 seconds
accuracy:  0.926829268292683
model 3
trained in 0.034722089767456055 seconds
accuracy:  0.9024390243902439
model 4
trained in 0.02681875228881836 seconds
accuracy:  0.9219512195121952
model 5
trained in 0.03939223289489746 seconds
accuracy:  0.9317073170731708
model 6
trained in 0.05873394012451172 seconds
accuracy:  0.9219512195121952
model 7
trained in 0.032219648361206055 seconds
accuracy:  0.9121951219512195
model 8
trained in 0.045658111572265625 seconds
accuracy:  0.8097560975609757
model 9
trained in 0.04843616485595703 seconds
accuracy:  0.9121951219512195
model 10
trained in 0.02725076675415039 seconds
accuracy:  0.9024390243902439
model 11
trained in 0.03738045692443848 seconds
accuracy:  0.926829268292683
model 12
trained in 0.04443240165710449 seconds
accuracy:  0.9121951219512195
model 13
trained in 0.016505002975463867 seconds
accuracy:  0.87804878048

In [46]:
best_accuracy

0.9463414634146341

In [45]:
best_model.score(X_train, y_train)

0.9623430962343096

In [15]:
pred = best_model.predict(X_test)
confusion_matrix(y_test, pred)

Unnamed: 0,Actual Positive,Actual Negative
Predicted Positive,3735,0
Predicted Negative,0,13191


In [None]:
params_to_optimize = {
    'min_members': [20, 50],
    'max_depth': [3],
    'max_features': [1, 2],
    'n_trees': [50, 100]
}

In [None]:
best_accuracy, best_model = grid_search_random_forest(params_to_optimize, (0.1, 'gini', 'binary'), X_train, y_train, X_test, y_test)

In [None]:
best_model.score(X_train, y_train)

In [None]:
pred_rf = best_model.predict(X_test)
confusion_matrix(y_test, pred_rf)

In [16]:
df = pd.read_csv('breast-cancer-wisconsin.data', header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [17]:
df = df.loc[(df != '?').all(axis=1), :]

  res_values = method(rvalues)


In [18]:
df = df.astype('int')

In [19]:
train_data, test_data = train_test_split(df.iloc[:, 1:])

In [20]:
X_train = train_data.iloc[:, :-1]
y_train = train_data.iloc[:, -1]

X_test = test_data.iloc[:, :-1]
y_test = test_data.iloc[:, -1]

In [21]:
params_to_optimize = {
    'min_members': [10, 20, 50],
    'max_depth': [2, 3, 4],
    'max_features': [1, 2]
}

In [28]:
best_accuracy, best_model = grid_search_decision_tree(params_to_optimize, (None, 'gini', 'binary'), X_train, y_train, X_test, y_test)

model 1
trained in 0.02569437026977539 seconds
accuracy:  0.9170731707317074
model 2
trained in 0.022536039352416992 seconds
accuracy:  0.9170731707317074
model 3
trained in 0.017319440841674805 seconds
accuracy:  0.8926829268292683
model 4
trained in 0.01638507843017578 seconds
accuracy:  0.8390243902439024
model 5
trained in 0.025835752487182617 seconds
accuracy:  0.8926829268292683
model 6
trained in 0.025656938552856445 seconds
accuracy:  0.8926829268292683
model 7
trained in 0.03014349937438965 seconds
accuracy:  0.9073170731707317
model 8
trained in 0.0338282585144043 seconds
accuracy:  0.9073170731707317
model 9
trained in 0.021901845932006836 seconds
accuracy:  0.9073170731707317
model 10
trained in 0.021515369415283203 seconds
accuracy:  0.9365853658536586
model 11
trained in 0.01799154281616211 seconds
accuracy:  0.9073170731707317
model 12
trained in 0.02104663848876953 seconds
accuracy:  0.8829268292682927
model 13
trained in 0.02681756019592285 seconds
accuracy:  0.9121951

In [29]:
best_accuracy

0.9463414634146341

In [24]:
params_to_optimize = {
    'min_members': [10, 20, 50],
    'max_depth': [2, 3, 4],
    'max_features': [1, 2],
    'n_trees': [50, 100, 150, 200]
}

In [25]:
best_accuracy, best_model = grid_search_random_forest(params_to_optimize, (None, 'gini', 'binary'), X_train, y_train, X_test, y_test)

model 1
trained in 0.8290102481842041 seconds
accuracy:  0.9463414634146341
model 2
trained in 1.7978174686431885 seconds
accuracy:  0.9463414634146341
model 3
trained in 2.6061887741088867 seconds
accuracy:  0.9609756097560975
model 4
trained in 3.46590256690979 seconds
accuracy:  0.9414634146341463
model 5
trained in 1.1942269802093506 seconds
accuracy:  0.9560975609756097
model 6
trained in 2.4093384742736816 seconds
accuracy:  0.975609756097561
model 7
trained in 3.780783176422119 seconds
accuracy:  0.9609756097560975
model 8
trained in 5.023874759674072 seconds
accuracy:  0.9609756097560975
model 9
trained in 0.9089550971984863 seconds
accuracy:  0.9658536585365853
model 10
trained in 1.7782471179962158 seconds
accuracy:  0.9658536585365853
model 11
trained in 2.7326273918151855 seconds
accuracy:  0.9658536585365853
model 12
trained in 3.676415205001831 seconds
accuracy:  0.9658536585365853
model 13
trained in 1.2329473495483398 seconds
accuracy:  0.9658536585365853
model 14
train

In [26]:
best_accuracy

0.975609756097561

In [27]:
best_model.n_trees, best_model.max_features, best_model.max_depth, best_model.min_members

(100, 2, 2, 10)