In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D

# Load the Dataset

In [None]:
# Shell escape: list the current working directory
# (presumably to check that the data file is present - confirm before keeping this cell).
!ls

In [3]:
# Read the space-separated data file. It has no header row, so pandas numbers
# the columns 0..6; the last column holds the class label.
data = pd.read_csv('column_3C.dat', header=None, sep=" ")
# Preview the first rows as a sanity check on the parse.
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,63.03,22.55,39.61,40.48,98.67,-0.25,DH
1,39.06,10.06,25.02,29.0,114.41,4.56,DH
2,68.83,22.22,50.09,46.61,105.99,-3.53,DH
3,69.3,24.65,44.31,44.64,101.87,11.21,DH
4,49.71,9.65,28.32,40.06,108.17,7.92,DH


In [6]:
# Every column except the last is a feature; the last column is the label.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [7]:
# Show the feature-matrix and label-vector shapes, one per line.
print(X.shape, y.shape, sep='\n')

(310, 6)
(310,)


# Preprocessing of Labels

In [8]:
# Fit a label encoder on the full label column so that every class string
# maps to a stable integer code (fit() returns the encoder itself).
le = preprocessing.LabelEncoder().fit(y)
le.classes_

array(['DH', 'NO', 'SL'], dtype=object)

In [10]:
def randomForest(data, n_estimator, test_percentage=0.2, random_state=42):
    """Train a random forest on a hold-out split of ``data`` and score it.

    The last column of ``data`` is treated as the class label and all other
    columns as features. Labels are encoded with the notebook-level
    ``LabelEncoder`` named ``le``, which must already be fitted.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns followed by one final label column.
    n_estimator : int
        Number of trees in the forest.
    test_percentage : float, default 0.2
        Fraction of the rows held out for testing.
    random_state : int or None, default 42
        Seed used for BOTH the train/test split and the forest, so repeated
        calls with the same arguments give the same forest and score.

    Returns
    -------
    forest : RandomForestClassifier
        The fitted classifier.
    score : float
        Accuracy of ``forest`` on the held-out rows.
    """
    train_set, test_set = train_test_split(
        data, test_size=test_percentage, random_state=random_state)
    train_X, train_y = train_set.iloc[:, :-1], train_set.iloc[:, -1]
    test_X, test_y = test_set.iloc[:, :-1], test_set.iloc[:, -1]
    train_y = le.transform(train_y)
    test_y = le.transform(test_y)

    # Seed the forest as well as the split: without it, bootstrap sampling and
    # feature sub-sampling differ on every call and scores are not reproducible.
    # No depth/leaf limits are set here; trees use the library defaults.
    forest = RandomForestClassifier(
        n_estimators=n_estimator, n_jobs=-1, random_state=random_state)

    forest.fit(train_X, train_y)
    prediction = forest.predict(test_X)
    # accuracy_score's documented argument order is (y_true, y_pred).
    score = accuracy_score(test_y, prediction)
    return forest, score

# Number of Tree Components (n_estimators)

In [None]:
# Sweep n_estimators over 1..501 (step 1) and record one
# "n_estimators,accuracy" row per forest in result_n_estimator.csv.
with open('result_n_estimator.csv', 'w') as f:  # write-only; read access ('w+') is not needed
    for i in range(1, 502, 1):
        forest, score = randomForest(data, i)
        # Append the newline AFTER joining the fields: putting '\n' inside the
        # join left a trailing comma, i.e. an empty third column on every row.
        f.write(','.join([str(i), str(score)]) + '\n')

In [11]:
# Try forests of 1..20 trees and keep the first one that reaches the highest
# test accuracy (ties keep the earlier, smaller forest).
max_score, best_forest = 0, None
for n_trees in range(1, 21):
    candidate, candidate_score = randomForest(data, n_trees)
    if candidate_score <= max_score:
        continue  # not a strict improvement
    best_forest, max_score = candidate, candidate_score

In [12]:
# Highest test accuracy found by the search over 1..20 trees.
max_score

0.8709677419354839

In [None]:
# Exploratory: list the attribute names available on the selected forest.
dir(best_forest)

In [14]:
# Number of individual trees inside the selected forest.
len(best_forest.estimators_)

8

# Compare the Performance of Individual Trees and the Forest

In [15]:
# Rebuild the same 80/20 split (seed 42) at notebook level so that the
# individual trees and the Naive Bayes model can be scored on the same rows.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Features: all columns but the last.
train_X = train_set.iloc[:, :-1]
test_X = test_set.iloc[:, :-1]

# Labels: last column, encoded to integer codes with the fitted encoder.
train_y = le.transform(train_set.iloc[:, -1])
test_y = le.transform(test_set.iloc[:, -1])

In [None]:
# Score every individual tree of the selected forest on the held-out rows and
# write one accuracy value per line to estimators_score.csv for later drawing.
with open('estimators_score.csv', 'w+') as f:
    tree_scores = (
        accuracy_score(tree.predict(test_X), test_y)
        for tree in best_forest.estimators_
    )
    f.writelines(str(tree_score) + '\n' for tree_score in tree_scores)

# Feature Importance and Contribution

In [None]:
# Feature importances of the selected forest as a whole (one value per input column).
best_forest.feature_importances_

In [None]:
# Build a matrix with one row of feature importances per decision tree,
# followed by a final row holding the importances of the whole forest,
# then draw it as a heatmap.
tree_rows = [np.array(tree.feature_importances_) for tree in best_forest.estimators_]
importances = np.vstack(tree_rows + [best_forest.feature_importances_])
ax = sns.heatmap(importances)
plt.show()

In [None]:
# Line plot of feature importances: one thin line per decision tree and one
# bold line for the whole forest. `importances` has one row per tree followed
# by a final row for the forest, so rows[:-1] are the trees and rows[-1] is
# the forest - no tree or feature count is hard-coded.
cmap = plt.cm.coolwarm  # must be bound before the plot calls below use it
feature_idx = list(range(importances.shape[1]))

plt.ylim((0, 1))
for tree_importances in importances[:-1]:
    plt.plot(feature_idx, tree_importances, '.-', color=cmap(0.))

plt.plot(feature_idx, importances[-1], 'o-', color=cmap(1.))

# Proxy artists so the legend shows one entry per group rather than per line.
custom_lines = [Line2D([0], [0], color=cmap(0.), lw=4),
                Line2D([0], [0], color=cmap(1.), lw=4)]
plt.legend(custom_lines, ['Decision Trees', 'Random Forest'])
plt.xlabel('Feature index')
plt.ylabel('Importance')
plt.show()

# Naive Bayes Classification Model

In [23]:
# Fit a Gaussian Naive Bayes classifier on the same training rows and measure
# its accuracy on the same held-out rows used for the forest.
nb = GaussianNB().fit(train_X, train_y)
prediction = nb.predict(test_X)
best_gaussian_score = accuracy_score(prediction, test_y)

In [24]:
# Test accuracy of the Gaussian Naive Bayes model.
best_gaussian_score

0.8709677419354839

In [26]:
# Best random-forest test accuracy, shown again to compare with the Naive Bayes score.
max_score

0.8709677419354839