In [1]:
import pandas as pd 
import numpy as np 
import functools


## First Dataset

In [2]:
# Toy 14-row weather dataset. Each column is written as a string of one-letter
# codes (one character per row) together with the legend that expands them.
encoded_columns = {
    'Outlook': ('SSORRROSSRSOOR', {'S': 'Sunny', 'O': 'Overcast', 'R': 'Rain'}),
    'Temperature': ('HHHMCCCMCMMMHM', {'H': 'Hot', 'M': 'Mild', 'C': 'Cool'}),
    'Humidity': ('HHHHNNNHNNNHNH', {'H': 'High', 'N': 'Normal'}),
    'Wind': ('WSWWWSSWWWSSWS', {'S': 'Strong', 'W': 'Weak'}),
    'Badminton': ('NNYYYNYNYYYYYN', {'Y': 'Yes', 'N': 'No'}),
}

# Expand the codes column by column; dictionary order fixes the column order.
data_1 = pd.DataFrame({
    column: [legend[code] for code in codes]
    for column, (codes, legend) in encoded_columns.items()
})

# Persist without the row index, then display the frame as the cell output.
data_1.to_csv('data_1.csv', index=False)
data_1

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


## Second Dataset

### Generating tree

In [3]:
# Rebuild the 14-row weather frame for display.
# The single-letter aliases are rebound from column to column (H is 'Hot' for
# Temperature but 'High' for Humidity; N is 'Normal' for Humidity but 'No' for
# the target), so each alias line must stay directly above the column using it.
data_1 = pd.DataFrame()
S, O, R = 'Sunny', 'Overcast', 'Rain'
data_1[    'Outlook'] = [S, S, O, R, R, R, O, S, S, R, S, O, O, R]
H, M, C = 'Hot', 'Mild', 'Cool'
data_1['Temperature'] = [H, H, H, M, C, C, C, M, C, M, M, M, H, M]
H, N = 'High', 'Normal'
data_1[   'Humidity'] = [H, H, H, H, N, N, N, H, N, N, N, H, N, H]
S, W = 'Strong', 'Weak'
data_1[       'Wind'] = [W, S, W, W, W, S, S, W, W, W, S, S, W, S]
Y, N = 'Yes', 'No'
# Target column. Spelled 'Badminton' (not 'Bandminton') so the header matches
# the column name written to data_1.csv.
data_1['Badminton'] = [N, N, Y, Y, Y, N, Y, N, Y, Y, Y, Y, Y, N]
# data_1.to_csv('data_1.csv', index=False)
data_1

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [6]:
F, S = 'failure', 'success'

# Each node is a 3-tuple: (label, attribute to branch on, {attribute value: child node}).
# Leaf nodes carry None for both the attribute and the children.
# Attribute names are used verbatim as lookup keys, so their spelling
# (including 'Lucurative Market') must match the feature names they refer to.
tree = (
    F,
    'Founder Experience',
    dict(
        low=(
            F,
            'Second Opinion',
            dict(positive=(S, None, None), negative=(F, None, None)),
        ),
        moderate=(
            F,
            'Competitive Advantage',
            dict(yes=(S, None, None), no=(F, None, None)),
        ),
        high=(
            S,
            'Lucurative Market',
            dict(yes=(S, None, None), no=(F, None, None)),
        ),
    ),
)

### Sample features

In [30]:
# Fixed seed for the global NumPy RNG; printed so it is recorded in the output.
seed = 2596459172
# seed = np.random.randint(0, 1<<32)
print(seed)
np.random.seed(seed)

zodiac = [
    'aquarius', 'pisces', 'aries', 'taurus', 'gemini', 'cancer',
    'leo', 'virgo', 'libra', 'scorpio', 'sagittarius', 'capricorn',
]

# feature name -> (possible values, probability of each value)
feature_dist = {
    'Founder Zodiac': (zodiac, [1/12] * 12),
    'Founder Experience': (['low', 'moderate', 'high'], [0.3, 0.4, 0.3]),
    'Second Opinion': (['positive', 'negative'], [0.3, 0.7]),
    'Competitive Advantage': (['yes', 'no'], [0.3, 0.7]),
    'Lucurative Market': (['yes', 'no'], [0.3, 0.7]),
}

# Draw each feature column independently, in dictionary order, so the sequence
# of RNG calls (and therefore the sampled values) is fixed by the seed above.
X = pd.DataFrame(index=range(200))
n_rows = len(X)
for feature, (choices, weights) in feature_dist.items():
    X[feature] = np.random.choice(choices, size=n_rows, p=weights)
X.head()

2596459172


Unnamed: 0,Founder Zodiac,Founder Experience,Second Opinion,Competitive Advantage,Lucurative Market
0,cancer,moderate,negative,yes,no
1,cancer,high,positive,yes,no
2,scorpio,low,negative,no,no
3,cancer,low,negative,no,no
4,aquarius,low,positive,yes,yes


### Generate noisy labels

In [8]:
def generate_target(x, node=tree):
    """Draw a noisy label for one sample by walking down the tree.

    Starting at `node`, repeatedly move to the child keyed by the sample's
    value for the current node's attribute. The walk stops at a node that has
    no children, or that has no child for the sample's value. The returned
    label is then drawn at random from [stopping node's label, S, F] with
    probabilities [0.8, 0.1, 0.1].

    Parameters:
        x: mapping from attribute name to value for one sample.
        node: (label, attribute, children) tuple to start from; defaults to
            the full tree.

    Returns:
        The drawn label, as returned by np.random.choice.
    """
    label, attr, children = node
    # Iterative descent; `children` is None at a leaf, which ends the loop
    # before x[attr] is evaluated.
    while children and x[attr] in children:
        label, attr, children = children[x[attr]]
    return np.random.choice([label, S, F], p=[0.8, 0.1, 0.1])
# One noisy label per row of X: generate_target receives each row as a Series
# keyed by feature name, and the result is indexed like X.
y = X.apply(generate_target, axis=1)
y.value_counts()

failure    141
success     59
dtype: int64

In [46]:
import decision_tree_solution as dt 
def evaluate(X, y, **hparams):
    """Fit a dt.DecisionTree and print its accuracy on three row ranges.

    Rows are split by position: the first quarter is used for fitting
    ('Train'), the second quarter is 'Valid', and the remaining half is 'Test'.

    Parameters:
        X: feature table, sliced by row position.
        y: labels aligned with the rows of X.
        **hparams: keyword arguments forwarded to dt.DecisionTree.

    Prints one line per split with the value returned by dt.accuracy.
    """
    train_end, valid_end = len(X) // 4, len(X) // 2
    train_rows = slice(None, train_end)
    splits = (
        ('Train', train_rows),
        ('Valid', slice(train_end, valid_end)),
        ('Test', slice(valid_end, None)),
    )

    model = dt.DecisionTree(**hparams)
    model.fit(X[train_rows], y[train_rows])

    # Report in a fixed order: Train, Valid, Test.
    for split_name, rows in splits:
        print(f'{split_name}: {dt.accuracy(y[rows], model.predict(X[rows]))}')

# Hyperparameters forwarded to dt.DecisionTree by evaluate().
# Use dict() instead to run with the model's defaults.
hparams = dict(max_depth=2)

# 'Founder Zodiac' is sampled independently of the other features and is not
# referenced by the label-generating tree, so the two runs below compare the
# model with and without an irrelevant 12-valued feature.
print('\nWith Zodiac')
evaluate(X, y, **hparams)

print('\nWithout Zodiac')
evaluate(X.drop(columns=['Founder Zodiac']), y, **hparams)



With Zodiac
Train: 0.94
Valid: 0.58
Test: 0.58

Without Zodiac
Train: 0.88
Valid: 0.94
Test: 0.91


### Export

In [35]:
# Assemble the exported dataset: sampled features, noisy label, and a split
# marker for every row.
data_2 = X.copy()
data_2['Result'] = y

# Split boundaries are derived from the row count and match evaluate():
# first quarter 'train', second quarter 'valid', remaining half 'test'.
# Building the column positionally keeps the three ranges non-overlapping,
# with no reliance on label-inclusive .loc slices or on assignment order.
n_rows = len(data_2)
train_end, valid_end = n_rows // 4, n_rows // 2
data_2['Type'] = (
    ['train'] * train_end
    + ['valid'] * (valid_end - train_end)
    + ['test'] * (n_rows - valid_end)
)

# Persist without the row index, then display the frame as the cell output.
data_2.to_csv('data_2.csv', index=False)
data_2

Unnamed: 0,Founder Zodiac,Founder Experience,Second Opinion,Competitive Advantage,Lucurative Market,Outcome,Split
0,cancer,moderate,negative,yes,no,success,train
1,cancer,high,positive,yes,no,failure,train
2,scorpio,low,negative,no,no,failure,train
3,cancer,low,negative,no,no,failure,train
4,aquarius,low,positive,yes,yes,success,train
...,...,...,...,...,...,...,...
195,capricorn,moderate,positive,no,yes,failure,test
196,aquarius,low,negative,no,yes,failure,test
197,cancer,moderate,negative,no,yes,failure,test
198,virgo,moderate,negative,no,no,failure,test


In [31]:
# Preview the first 50 samples next to their labels. y is an unnamed Series,
# so it is given a name here; otherwise its column header renders as 0.
pd.concat([X, y.rename('Result')], axis=1).head(50)

Unnamed: 0,Founder Zodiac,Founder Experience,Second Opinion,Competitive Advantage,Lucurative Market,0
0,cancer,moderate,negative,yes,no,success
1,cancer,high,positive,yes,no,failure
2,scorpio,low,negative,no,no,failure
3,cancer,low,negative,no,no,failure
4,aquarius,low,positive,yes,yes,success
5,aquarius,moderate,positive,no,no,failure
6,scorpio,high,negative,no,yes,success
7,virgo,moderate,negative,no,yes,failure
8,cancer,high,positive,yes,no,failure
9,taurus,moderate,positive,no,yes,failure
