In [1]:
import pandas as pd
import numpy as np
import csv

import requests
from io import StringIO

from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text

import matplotlib.pyplot as plt
%matplotlib notebook

import warnings
warnings.filterwarnings('ignore')

# Problem A

## Imputing missing values

In [2]:
# Download crx.data from the UCI "credit-screening" directory and parse it.
# Column names are passed through `names`, so pandas treats the first line of
# the file as data rather than as a header.
url_1 = ('https://archive.ics.uci.edu/ml/machine-learning-databases'
         '/credit-screening/crx.data')
attributes_1 = (
    'A1', 'A2', 'A3',
    'A4', 'A5', 'A6',
    'A7', 'A8', 'A9',
    'A10', 'A11', 'A12',
    'A13', 'A14', 'A15',
    'class')

# A timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() stops an HTTP error page from being parsed as CSV.
response_1 = requests.get(url_1, timeout=30)
response_1.raise_for_status()

df_1 = pd.read_csv(
    StringIO(response_1.content.decode('utf-8')), names=attributes_1)
df_1

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,f,g,00202,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,f,g,00043,560,+
2,a,24.50,0.500,u,g,q,h,1.50,t,f,0,f,g,00280,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,t,g,00100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,00120,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,00260,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,t,g,00200,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,t,g,00200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,00280,750,-


In [3]:
# Turn every '?' placeholder into NaN so pandas recognises it as missing.
df_1_1 = df_1.replace(to_replace='?', value=np.nan)

In [4]:
# A2 and A14 must be floating point before their means can be computed.
df_1_1 = df_1_1.astype({'A2': float, 'A14': float})

In [5]:
# Column means of the numeric attributes only. numeric_only must be explicit:
# pandas >= 2.0 no longer silently skips the object (categorical) columns and
# raises a TypeError instead.
df_1_1.mean(numeric_only=True)

A2       31.568171
A3        4.758725
A8        2.223406
A11       2.400000
A14     184.014771
A15    1017.385507
dtype: float64

In [6]:
# Number of missing entries in each column.
df_1_1.isna().sum()

A1       12
A2       12
A3        0
A4        6
A5        6
A6        9
A7        9
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14      13
A15       0
class     0
dtype: int64

In [7]:
# Fill value for every attribute (the trailing 'class' column is excluded):
# the column mean for the attributes listed in `mean_imputed`, and the first
# mode (most frequent value) for all the others.
mean_imputed = ('A2', 'A3', 'A8', 'A11', 'A14', 'A15')
values = {
    name: (df_1_1[name].mean() if name in mean_imputed
           else df_1_1[name].mode()[0])
    for name in df_1_1.columns[:-1]
}

In [8]:
# Replace the NaNs of each column with that column's entry in `values`.
df_1_1 = df_1_1.fillna(value=values)

In [9]:
# Re-count missing entries per column after imputation.
df_1_1.isna().sum()

A1       0
A2       0
A3       0
A4       0
A5       0
A6       0
A7       0
A8       0
A9       0
A10      0
A11      0
A12      0
A13      0
A14      0
A15      0
class    0
dtype: int64

In [10]:
# One-hot encode the categorical attributes and the class label.
# dtype is pinned to uint8, the default before pandas 2.0. Newer pandas emits
# boolean indicator columns, which make the frame convert to an object array
# when it is turned into NumPy alongside the float features.
df_1_final = pd.get_dummies(
    df_1_1,
    prefix=['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'class'],
    dtype=np.uint8,
)

In [11]:
# Divide each of the listed columns by its own maximum value.
scaled_cols = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
df_1_final[scaled_cols] = df_1_final[scaled_cols].div(df_1_final[scaled_cols].max())

In [12]:
# Shuffle the rows with a fixed seed, renumber the index, then remove one
# indicator column for each encoded attribute (and 'class_-' for the label).
# NOTE: re-running this cell on its own fails, because the dropped columns are
# already gone from df_1_final.
df_1_final = (
    df_1_final
    .sample(frac=1, random_state=7021)
    .reset_index(drop=True)
    .drop(columns=[
        'class_-', 'A1_a', 'A4_u', 'A5_g', 'A6_d',
        'A7_bb', 'A9_t', 'A10_f', 'A12_t', 'A13_g',
    ])
)

## Splitting into training and test sets

In [13]:
# Size of the training set: 75% of the rows, rounded to an integer.
N_train_1 = round(0.75 * len(df_1_final))
N_train_1

518

In [14]:
# Features are every column except the last; the last column is the target.
X_1, y_1 = df_1_final.iloc[:, :-1], df_1_final.iloc[:, -1]

# The first N_train_1 rows form the training set, the remainder the test set.
X_train_1, X_test_1 = X_1.iloc[:N_train_1, :], X_1.iloc[N_train_1:, :]
y_train_1, y_test_1 = y_1.iloc[:N_train_1], y_1.iloc[N_train_1:]

In [15]:
# Row count and column count of the training feature matrix.
N_1, P_1 = len(X_train_1.index), len(X_train_1.columns)