Wei Lien Huang 40128391


In [287]:
from scipy.io import arff
import pandas as pd
import os                        # for os.path.exists
import json                      # for loading metadata
import urllib                    # for downloading remote files 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn
import sklearn.tree
import sklearn.metrics
import sklearn.ensemble
import sklearn.preprocessing

In [320]:
def separateData(X,y):
    X_trn, X_tst, y_trn, y_tst = sklearn.model_selection.train_test_split(X, y, test_size=0.3, random_state=0)
    return X_trn, X_tst, y_trn, y_tst

Extract the following databases
1) [Diabetic Retinopathy ](https://archive.ics.uci.edu/ml/datasets/Diabetic+Retinopathy+Debrecen+Data+Set)

2) [Default of credit card clients](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients)

3) [Breast Cancer Wisconsin](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic))

4) [Statlog (German credit data) (recommend german.doc for instructions and german-numeric for data.)](https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data))

5) [Adult](https://archive.ics.uci.edu/ml/datasets/adult)

6) [Yeast](https://archive.ics.uci.edu/ml/datasets/Yeast)

7) [Thoracic Surgery Data](https://archive.ics.uci.edu/ml/datasets/Thoracic+Surgery+Data)

8) [Seismic-Bumps](https://archive.ics.uci.edu/ml/datasets/seismic-bumps)


# 1)  Start with the Diabetic Retinopathy data

In [324]:
# loading the Diabetic Retinopathy data that is downloaded 
data = arff.loadarff('resources/messidor_features.arff')
df = pd.DataFrame(data[0])
data = data[0]
data = data[0:len(data)]
data_list = []
# extract the data and conver
for x in data:
    x_data = list(x)
    data_list.append(x_data)

X_list = []
y_list = []
# split the data in a way that it can be converted to numpy array
for i in data_list:
    X_list.append(i[0:19]) 
    y_list.append(i[19:])

X = np.array(X_list)
y = np.array(y_list)
X_trn_diabetic, X_tst_diabetic, y_trn_diabetic, y_tst_diabetic = separateData(X,y)

# 2) Default of credit card clients

In [325]:
data = pd.read_excel (r'resources/default of credit card clients.xls')
X = np.array(data)[1:,1:-1]
y = np.array(data)[1:,-1:]
X_trn_credit, X_tst_credit, y_trn_credit, y_tst_credit = separateData(X,y)

# 3) Breast Cancer Wisconsin 

- if y = 2 --> benign    we will change this value to 0

- if y = 4 --> malignant we will change this value to 1

In [327]:
data = pd.read_csv('resources/breast-cancer-wisconsin.data', header=None)
X = np.array(data)[:,0:-1]
y = np.array(data)[:,-1:]
y_lis = []
for ele in y:
    if ele == 2:
        y_lis.append(0)
    if ele == 4:
        y_lis.append(1)
y = np.array(y_lis).reshape(-1,1)

X_trn_breast, X_tst_breast, y_trn_breast, y_tst_breast = separateData(X,y)

# 4) Statlog (German credit data) (recommend german.doc for instructions and german-numeric for data.)

In [329]:
data = pd.read_csv('resources/german.data-numeric', header=None,  delim_whitespace=True)
X = np.array(data)[:,0:-1]
y = np.array(data)[:,-1:].reshape(-1,1)
X_trn_german, X_tst_german, y_trn_german, y_tst_german = separateData(X,y)

# 5) Adult

In [354]:
data = pd.read_csv('resources/adult.data', header=None)
X = np.array(data)[:,0:-1]
y = np.array(data)[:,-1:].reshape(-1,1)
y_lis = []
for ele in y:
    if ele == ' <=50K':
        y_lis.append(0)
    if ele == ' >50K':
        y_lis.append(1)
y = np.array(y_lis).reshape(-1,1)
X_trn_adult, X_tst_adult, y_trn_adult, y_tst_adult = separateData(X,y)

# 6) Yeast

In [358]:
data = pd.read_csv('resources/yeast.data', header=None, delim_whitespace=True)

X1 = np.array(data)[:,1:5]
X2 = np.array(data)[:,5:9]
X = np.hstack((X1,X2))
y = np.array(data)[:,6:7]
X_trn_yeast, X_tst_yeast, y_trn_yeast, y_tst_yeast = separateData(X,y)

In [219]:
# define the train_estimators to train the required estimators
def train_estimators(X, y, estimator_type, param_name, param_vals, **kwargs):
    trainned_estimators = []
    for i in range(0,len(param_vals)):
        clf = estimator_type(**kwargs)
        clf.set_params(**{param_name:param_vals[i]})
        clf.fit(X,y)
        trainned_estimators.append(clf)
        print('Training ', clf, '...')
    return trainned_estimators

def score_estimators(X, y, estimators):
    accuracy_scores = []
    for i in range(0, len(estimators)):
        accuracy_scores.append(roundVal(estimators[i].score(X,y)))
    return accuracy_scores
        
def plot_estimator_scores(estimators, param_name, param_vals):
    x_arange = np.arange(len(param_vals))
    
    # compute the score of each data and store them as separate list
    y_ax_trn = score_estimators(X_trn, y_trn, estimators)
    y_ax_tst = score_estimators(X_tst, y_tst, estimators)

    # plot each of the ata
    plt.plot(x_arange, y_ax_tst, color='black', linestyle = 'dashed', label = "test");
    plt.plot(x_arange, y_ax_trn, color='green', marker='o', label = "train");
    plt.xticks(x_arange,param_vals)

    plt.xlabel(param_name)
    plt.ylabel('score')
    plt.title(str(estimators[0].__class__.__name__) + ' vs ' + param_name)
    plt.legend()
    plt.text(3.5, 0.4, 'train = ' + str(max(y_ax_trn)) , fontsize=10, color = 'green')
    plt.text(3.5, 0.2, 'test = ' + str(max(y_ax_tst)), fontsize=10, color = 'black')
    plt.ylim([0,1.1])
    plt.show()
    
def roundVal(val):
    rounded = round(val,3)
    return rounded

In [220]:
# train the data with LogisticRegression
logistic_estimators = train_estimators(X_trn, y_trn.ravel(), sklearn.linear_model.LogisticRegression,
                                   'C', [0.00001, 0.0001, 0.001, 0.01, 0.1, 1], max_iter=10000,  random_state=0)
plot_estimator_scores(logistic_estimators, 'C', [0.00001, 0.0001, 0.001, 0.01, 0.1, 1])


NameError: name 'X_trn' is not defined

In [None]:
# train with svm classifier
svm_estimators = train_estimators(X_trn, y_trn.ravel(), sklearn.svm.SVC,
                                   'C', [0.01, 0.1, 1, 10.0, 100.0, 1000.0], gamma = 0.001, max_iter=10000,  random_state=0)

In [None]:
plot_estimator_scores(svm_estimators, 'C', [0.01, 0.1, 0, 10.0, 100.0, 1000.0])

In [None]:
tree_estimators = train_estimators(X_trn, y_trn, sklearn.tree.DecisionTreeClassifier,
                                   'max_depth', [1, 5, 10], splitter='random', random_state=0)

In [None]:
#plot_estimator_scores(tree_estimators, 'max_depth', [1, 5, 10, 20, 50, 100])

In [None]:
randomForest_estimators = train_estimators(X_trn, y_trn.ravel(), sklearn.ensemble.RandomForestClassifier,
                                   'max_depth', [1, 5, 10, 20, 50, 100],  random_state=0)

In [None]:
plot_estimator_scores(randomForest_estimators, 'max_depth', [1, 5, 10, 20, 50, 100])

In [None]:

neighbors_estimators = train_estimators(X_trn, y_trn.ravel(), sklearn.neighbors.KNeighborsRegressor,
                                   'n_neighbors', [5, 15, 20, 50, 100]) #[1, 5, 10, 20, 50, 100]

In [None]:
plot_estimator_scores(neighbors_estimators, 'n_neighbors', [5, 15, 20, 50, 100])
