In [51]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [52]:
# Load dataset
# Breast cancer
breast_cancer = load_breast_cancer()

# Play tennis
df = pd.read_csv('play_tennis.csv')

# Transform
le = preprocessing.LabelEncoder()

# Data
play_tennis_data = {}
play_tennis_data['day'] = list(le.fit_transform(df['day']))
play_tennis_data['outlook'] = list(le.fit_transform(df['outlook']))
play_tennis_data['humidity'] = list(le.fit_transform(df['humidity']))
play_tennis_data['wind'] = list(le.fit_transform(df['wind']))
play_tennis_data['play'] = list(le.fit_transform(df['play']))

# Initialize label and target
play_tennis_label = []
play_tennis_target = []

# Inset label and target
for i in range(len(play_tennis_data['day'])):
    label = []
    label.append(play_tennis_data['outlook'][i])
    label.append(play_tennis_data['humidity'][i])
    label.append(play_tennis_data['wind'][i])
    
    play_tennis_label.append(label)
    play_tennis_target.append(play_tennis_data['play'][i])

# Feature names
play_tennis_data['feature_names'] = ['outlook', 'humidity', 'wind']

In [53]:
# Split training and test data
def split_data(label, target):
    x_train, x_test, y_train, y_test = train_test_split(label, target, test_size=0.2, train_size=0.8)
    return x_train, x_test, y_train, y_test

# Split breast cancer
bc_x_train, bc_x_test, bc_y_train, bc_y_test = split_data(breast_cancer.data, breast_cancer.target)

# Split play tennis
pt_x_train, pt_x_test, pt_y_train, pt_y_test = split_data(play_tennis_label, play_tennis_target)

In [54]:
import six
import sys
sys.modules['sklearn.externals.six'] = six 
from id3 import Id3Estimator, export_text

def id3Estimator(x_train, x_test, y_train, y_test, feature_names):
    # Fit
    estimator = Id3Estimator()
    estimator.fit(x_train, y_train)
    
    # Visualize
    r = export_text(estimator.tree_, feature_names)
    print(f"Tree{r}")
    
    # Predict
    pred = estimator.predict(x_test)
    
    # Score
    # Accuracy
    acc_score = accuracy_score(y_test, pred)
    # F1 Score
    f_score = f1_score(y_test, pred)
    print(f"Accuracy score : {acc_score}")
    print(f"F1 score       : {f_score}")

In [55]:
# Breast cancer
id3Estimator(bc_x_train, bc_x_test, bc_y_train, bc_y_test, breast_cancer.feature_names)

Tree
worst perimeter <=116.05
|   worst concave points <=0.11
|   |   mean area <=694.50: 1 (241) 
|   |   mean area >694.50
|   |   |   mean radius <=15.16: 0 (2) 
|   |   |   mean radius >15.16: 1 (2) 
|   worst concave points >0.11
|   |   worst texture <=27.62
|   |   |   mean concave points <=0.05: 1 (19) 
|   |   |   mean concave points >0.05
|   |   |   |   worst symmetry <=0.36
|   |   |   |   |   worst area <=808.00: 1 (12) 
|   |   |   |   |   worst area >808.00
|   |   |   |   |   |   mean radius <=15.54: 0 (4) 
|   |   |   |   |   |   mean radius >15.54: 1 (2) 
|   |   |   |   worst symmetry >0.36: 0 (4) 
|   |   worst texture >27.62
|   |   |   worst radius <=14.79
|   |   |   |   mean radius <=11.36: 0 (1) 
|   |   |   |   mean radius >11.36: 1 (5) 
|   |   |   worst radius >14.79
|   |   |   |   mean texture <=24.99: 0 (21) 
|   |   |   |   mean texture >24.99
|   |   |   |   |   mean radius <=13.41: 0 (1) 
|   |   |   |   |   mean radius >13.41: 1 (1) 
worst perimeter >

In [56]:
# Play tennis
id3Estimator(pt_x_train, pt_x_test, pt_y_train, pt_y_test, play_tennis_data['feature_names'])

Tree
outlook <=0.50: 1 (3) 
outlook >0.50
|   humidity <=0.50: 0 (4) 
|   humidity >0.50
|   |   wind <=0.50: 0 (1) 
|   |   wind >0.50: 1 (3) 

Accuracy score : 0.3333333333333333
F1 score       : 0.5


Logistic Regression

In [57]:
from sklearn.linear_model import LogisticRegression

In [58]:
def logisticRegression(x_train, x_test, y_train, y_test):
    # Fit
    clf = LogisticRegression(random_state=0, max_iter=10000)
    clf.fit(x_train, y_train)
    
    # Visualisasi model
    #
    
    # Predict
    pred = clf.predict(x_test)
    
    # Score
    # Accuracy
    acc_score = accuracy_score(y_test, pred)
    # F1 Score
    f_score = f1_score(y_test, pred)
    
    print(f"Accuracy score : {acc_score}")
    print(f"F1 score       : {f_score}")

In [59]:
# Breast cancer
logisticRegression(bc_x_train, bc_x_test, bc_y_train, bc_y_test)

Accuracy score : 0.9210526315789473
F1 score       : 0.9395973154362416


In [50]:
# Play tennis
logisticRegression(pt_x_train, pt_x_test, pt_y_train, pt_y_test)

Accuracy score : 0.6666666666666666
F1 score       : 0.8
