## Data Processing

In [1]:
import pandas as pd
import numpy as np
from numpy import log2
# Heart dataset CSV served from download.tensorflow.org; reading it requires network access.
file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv"
df = pd.read_csv(file_url)

# Row selections for the two splits, loaded from .npy files in the working directory.
train_idx = np.load('train_idx.npy')
test_idx = np.load('test_idx.npy')

# iloc is positional, so the split depends on the row order of `df` at this point.
train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

# drop some unwanted rows
# train_df = train_df.drop([248, 250, 251]) # cp = 0

In [2]:
train_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
136,54,1,2,192,283,0,2,195,0,0.0,1,1,reversible,0
232,58,0,4,170,225,1,2,146,1,2.8,2,2,fixed,1
233,56,1,2,130,221,0,2,163,0,0.0,1,0,reversible,0
184,46,1,4,120,249,0,2,144,0,0.8,1,0,reversible,0
84,55,0,2,135,250,0,2,161,0,1.4,2,0,normal,0


In [3]:
train_df.info()
# test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 201 entries, 136 to 15
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       201 non-null    int64  
 1   sex       201 non-null    int64  
 2   cp        201 non-null    int64  
 3   trestbps  201 non-null    int64  
 4   chol      201 non-null    int64  
 5   fbs       201 non-null    int64  
 6   restecg   201 non-null    int64  
 7   thalach   201 non-null    int64  
 8   exang     201 non-null    int64  
 9   oldpeak   201 non-null    float64
 10  slope     201 non-null    int64  
 11  ca        201 non-null    int64  
 12  thal      201 non-null    object 
 13  target    201 non-null    int64  
dtypes: float64(1), int64(12), object(1)
memory usage: 23.6+ KB


In [3]:
# Compare how often each value of one column occurs in the train and test splits.
col = 'thal'
print(train_df[col].value_counts(sort=False))
print(test_df[col].value_counts(sort=False))
# categorical: sex, cp, fbs, restecg, exang, thal
# continuous: age, trestbps, chol, thalach, oldpeak, slope, ca
# one-hot-encoding: cp, restecg, thal

reversible     79
normal        115
fixed           7
Name: thal, dtype: int64
reversible    36
normal        53
fixed         11
Name: thal, dtype: int64


In [9]:
# Columns whose integer/string codes are categories, expanded into 0/1 indicator columns.
encoding_list = ['cp', 'restecg', 'thal']

# One call encodes every listed column; the default prefix is the column name,
# so the generated names (cp_1, thal_fixed, ...) match the per-column loop.
train_df = pd.get_dummies(train_df, columns=encoding_list)

# Encoding each split on its own yields different columns whenever a category
# occurs in only one split. Align the test frame to the training columns:
# indicators missing from test are added as all-zero, and test-only indicators
# are dropped, so that selecting the training feature list from test_df works.
test_df = pd.get_dummies(test_df, columns=encoding_list).reindex(
    columns=train_df.columns, fill_value=0
)

In [10]:
train_df.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,...,cp_1,cp_2,cp_3,cp_4,restecg_0,restecg_1,restecg_2,thal_fixed,thal_normal,thal_reversible
136,54,1,192,283,0,195,0,0.0,1,1,...,0,1,0,0,0,0,1,0,0,1
232,58,0,170,225,1,146,1,2.8,2,2,...,0,0,0,1,0,0,1,1,0,0
233,56,1,130,221,0,163,0,0.0,1,0,...,0,1,0,0,0,0,1,0,0,1
184,46,1,120,249,0,144,0,0.8,1,0,...,0,0,0,1,0,0,1,0,0,1
84,55,0,135,250,0,161,0,1.4,2,0,...,0,1,0,0,0,0,1,0,1,0


In [197]:
# Every column except the label is used as a model input.
features = train_df.columns.tolist()
features.remove('target')

# Split each frame into its feature matrix and its label vector.
X_train_df = train_df[features]
y_train_df = train_df['target']
X_test_df = test_df[features]
y_test_df = test_df['target']

In [198]:
# X_train_df.dtypes.values
# y_train_df.dtypes

In [199]:
# Structured dtypes: one named field per feature column, each field using that
# column's dtype as reported by pandas.
X_train_dt = np.dtype({'names': features, 'formats': X_train_df.dtypes.values})
X_test_dt = np.dtype({'names': features, 'formats': X_test_df.dtypes.values})

In [194]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows. The fixed random_state makes the shuffle
# identical on every run, so results stay reproducible after a kernel restart.
X_train_tmp, X_test_tmp = train_test_split(X_train_df, test_size=0.3, random_state=2020)

pandas.core.frame.DataFrame

### Structured np array
- https://jakevdp.github.io/PythonDataScienceHandbook/02.09-structured-data-numpy.html

In [200]:
# A structured array is built from a list of tuples, one tuple per sample row.
tmp = np.array(list(map(tuple, X_train_df.to_numpy().tolist())), dtype=X_train_dt)
print(tmp['age'])            # one field across all records
print(tmp[0])                # one whole record
print(tmp[tmp['age'] < 30])  # records selected by a condition on a field

[54 58 56 46 55 44 61 59 62 54 64 64 62 66 60 55 44 54 63 46 58 64 41 62
 56 61 62 61 40 46 47 63 65 44 57 69 59 54 43 50 53 44 42 42 42 57 56 58
 41 58 48 59 59 34 62 48 66 62 38 74 62 53 63 67 41 52 61 40 45 37 66 43
 61 54 77 67 43 67 45 54 56 46 59 53 41 48 58 58 66 54 48 52 59 65 57 41
 60 34 56 52 71 50 60 44 51 52 58 42 58 57 35 54 64 65 65 54 41 65 64 57
 60 47 47 62 57 60 54 58 65 51 49 56 44 51 46 55 54 46 51 51 60 41 54 49
 37 54 46 39 49 63 55 62 58 56 67 44 60 49 59 29 54 62 35 59 51 60 42 45
 65 41 43 55 57 67 71 50 67 66 51 57 55 51 45 55 54 52 68 39 48 52 35 52
 60 50 51 64 44 57]
(54, 1, 192, 283, 0, 195, 0, 0., 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1)
[(29, 1, 130, 204, 0, 202, 0, 0., 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0)]


In [215]:
# Convert the pandas feature frames and label series to plain NumPy arrays.
X_train = X_train_df.to_numpy()
y_train = y_train_df.to_numpy()
X_test = X_test_df.to_numpy()
y_test = y_test_df.to_numpy()

In [216]:
# data = np.append(X_train, np.resize(y_train, (len(y_train),1)), axis=1)
# print(data)
X_train

array([[ 54.,   1., 192., ...,   0.,   0.,   1.],
       [ 58.,   0., 170., ...,   1.,   0.,   0.],
       [ 56.,   1., 130., ...,   0.,   0.,   1.],
       ...,
       [ 64.,   1., 170., ...,   0.,   0.,   1.],
       [ 44.,   1., 130., ...,   0.,   1.,   0.],
       [ 57.,   1., 150., ...,   0.,   1.,   0.]])

In [218]:
# Row order that sorts the samples by ascending value of feature column 2;
# the same permutation is applied to the labels so rows and targets stay paired.
idx = X_train[:, 2].argsort()
print(X_train[idx])
print(y_train[idx])

[[ 67.   1. 100. ...   0.   1.   0.]
 [ 58.   0. 100. ...   0.   1.   0.]
 [ 46.   1. 101. ...   0.   0.   1.]
 ...
 [ 55.   0. 180. ...   0.   1.   0.]
 [ 54.   1. 192. ...   0.   0.   1.]
 [ 56.   0. 200. ...   0.   0.   1.]]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0
 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0
 1 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 1 0 0 0 0
 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1
 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 1 0 1 0 1 1 0 0 1 1 1 0 1]


In [210]:
# Count the 0s and 1s in the third-from-last feature column.
# NOTE(review): presumably the thal_fixed indicator given the column order — confirm.
np.bincount(X_train[:, -3].astype('uint8'))

array([191,   7])

In [None]:
# `order=` only works on arrays with named fields, so sort the structured
# array built above by one of its real fields rather than an undefined `a`.
np.sort(tmp, order='age')

In [8]:
# 'thal' has been one-hot encoded by this point, so the raw column no longer
# exists; select the same rows through the thal_fixed indicator instead.
tmp = test_df[test_df['thal_fixed'] == 1]
# Label distribution among the test rows whose thal value is 'fixed'.
tmp['target'].value_counts()

1    7
0    4
Name: target, dtype: int64

In [359]:
print(11/3)
print(18/7)
print(102/13)

3.6666666666666665
2.5714285714285716
7.846153846153846


In [None]:
# test_df[(test_df['cp'] == 1)]

In [207]:
train_df.columns

Index(['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak',
       'slope', 'ca', 'target', 'cp_1', 'cp_2', 'cp_3', 'cp_4', 'restecg_0',
       'restecg_1', 'restecg_2', 'thal_fixed', 'thal_normal',
       'thal_reversible'],
      dtype='object')

In [208]:
train_df['thal_normal'].value_counts(sort=False)

0     86
1    112
Name: thal_normal, dtype: int64

In [211]:
test_df['thal_fixed'].value_counts(sort=False)

0    89
1    11
Name: thal_fixed, dtype: int64

In [None]:
# check null values
train_df.isnull().sum()

In [77]:
train_df.values.shape

(198, 14)

In [106]:
print(X_train)
# print(data[: ,0])

[[54 1 2 ... 1 1 'reversible']
 [58 0 4 ... 2 2 'fixed']
 [56 1 2 ... 1 0 'reversible']
 ...
 [64 1 1 ... 2 0 'reversible']
 [44 1 3 ... 1 0 'normal']
 [57 1 3 ... 1 0 'normal']]


### sort 2d array (numpy)
https://opensourceoptions.com/blog/sort-numpy-arrays-by-columns-or-rows/

In [34]:
a = np.random.randint(100, size=(5, 4))
print(a)

[[44 18 25 73]
 [17 84 64 47]
 [70 80 17  1]
 [31 27 62 28]
 [38 22 90 77]]


In [36]:
# Reorder whole rows so that the last column (index 3) is ascending.
a = a[np.argsort(a[:, 3])]
print(a)

[[70 80 17  1]
 [31 27 62 28]
 [17 84 64 47]
 [44 18 25 73]
 [38 22 90 77]]


### append a column to np arr

In [41]:
X = np.random.randint(100, size=(5, 4))
Y = np.random.randint(100, size=(5,))
print(X)
print(Y)

[[46 94 88 66]
 [ 6 21 68 81]
 [22 25 34 31]
 [60 61 46 71]
 [67 38 87 31]]
[89 70 86 54 54]


In [42]:
# Turn the 1-D vector into a single column. reshape returns a new view and,
# unlike in-place ndarray.resize, cannot fail because another object still
# references the array (common in notebooks).
Y = Y.reshape(-1, 1)
print(Y)

[[89]
 [70]
 [86]
 [54]
 [54]]


In [43]:
data = np.append(X, Y, axis=1)
print(data)

[[46 94 88 66 89]
 [ 6 21 68 81 70]
 [22 25 34 31 86]
 [60 61 46 71 54]
 [67 38 87 31 54]]


### numpy histogram

In [46]:
np.histogram([1, 2, 1], bins=[0, 1, 2])

(array([0, 3]), array([0, 1, 2]))

In [50]:
tmp = np.unique([1.4, 4.3, 4, 10, 0, 0.0])
len(tmp)

5

In [235]:
cls_list = np.bincount([0,0,0,1,1,1,1], minlength=2)
print(cls_list)
print(np.argmax(cls_list))

[3 4]
1


### SELECT condition numpy

In [219]:
X = np.random.randint(100, size=(5, 4))
print(X)

[[68 77 90 83]
 [52 67 66 18]
 [65  2 99 26]
 [32 48 40 97]
 [18 95 74 23]]


In [227]:
# Boolean mask over rows: True where the value in column 1 is below 10.
mask = X[:, 1] < 10
print(mask)
# Two ways of inverting the mask, printed one after the other for comparison.
print(np.logical_not(mask))
print(mask==False)

# Boolean indexing keeps only the rows where the mask is True.
filter_X = X[mask, :]
print(filter_X.shape)

[False False  True False False]
[ True  True False  True  True]
[ True  True False  True  True]
(1, 4)


## Pandas code snippets

In [None]:
df.info()

In [None]:
features = df.columns
print(features)
print(len(features))
print(df.index)

In [None]:
# Sort into a new frame rather than in place: `df` is shared by the other cells,
# and the later snippet that checks whether a function call reorders the
# caller's frame is only meaningful if `df` is still in its original order.
df_sorted = df.sort_values('thalach')
print(df_sorted.index)

In [None]:
# print(df. iloc[:, 17])
# df.loc[:, features]
df['sex']

In [None]:
print(df.loc[df['age'] < 50, :])
# print(len(df.loc[df['age'] >= 50, :]))
# print(df.loc[df['sex'] == 0 , :])

In [None]:
for index, row in df.iterrows():
    print(row['age'])
    break


In [None]:
def gini(sequence):
    """Return the Gini impurity of a 1-D collection of class labels.

    Parameters
    ----------
    sequence : array-like of non-negative ints
        Class label of every sample. Lists, tuples and pandas Series are
        accepted as well as NumPy arrays.

    Returns
    -------
    int or float
        ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of samples with
        label ``k``; ``0`` when the input is empty or holds a single class.
    """
    # Normalise the input once so plain Python sequences (which have no
    # ``.shape``) work the same way as arrays.
    labels = np.asarray(sequence)

    if len(labels) == 0:
        return 0

    if np.unique(labels).shape[0] == 1:  # pure
        return 0

    # bincount gives per-label counts; dividing by the sample count turns
    # them into class proportions.
    hist = np.bincount(labels) / labels.shape[0]
    return 1 - np.sum(hist ** 2)

print(gini(df['target'].values))
print(gini(np.array([1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0], dtype='int64')))

In [None]:
feature_val = df['ca'].value_counts(sort=False).index.values
print(feature_val)
np.array_equal(feature_val, [0, 1])

## Test code snippet

In [None]:
def func(df):
    """Print the identity of ``df`` before and after rebinding it to a sorted copy.

    ``sort_values`` returns a new DataFrame, so the second printed id belongs
    to a different object and the caller's frame keeps its row order.
    """
    id_on_entry = id(df)
    print(id_on_entry)
    df = df.sort_values('thalach')
    id_after_sort = id(df)
    print(id_after_sort) #copy
    
print(id(df))
func(df)
print(df['thalach'])

In [None]:
def func(arr):
    """Overwrite the element at index 3 of ``arr`` with 100, in place.

    Nothing is returned; the caller observes the change through its own
    reference to the same object.
    """
    position, replacement = 3, 100
    arr[position] = replacement  # change the original array
    
arr = np.array([1,4,3,2])
func(arr)
print(arr)

In [None]:
i = j = 1
print(id(i), id(j))

j = 10
print(i, j)
print(id(i), id(j))


In [244]:
def sample_weight_gini(Y: np.ndarray, sample_weight: np.ndarray):
    """Return the Gini impurity of labels ``Y`` with per-sample weights.

    Parameters
    ----------
    Y : np.ndarray
        1-D array of class labels. Any label values that ``np.unique`` can
        sort are accepted (e.g. ``{0, 1}``, ``{1, 2}`` or ``{-1, 1}``).
    sample_weight : np.ndarray
        1-D array with one weight per entry of ``Y``.

    Returns
    -------
    int or float
        ``1 - sum(p_k ** 2)`` where ``p_k`` is the share of the total weight
        carried by class ``k``; ``0`` when ``Y`` is empty or holds one class.
    """
    Y = np.asarray(Y)
    sample_weight = np.asarray(sample_weight, dtype=float)

    if len(Y) == 0:
        return 0

    # cls_pos maps every sample to the position of its label inside cls_list.
    # Accumulating by position rather than by the raw label value keeps the
    # result correct when labels are not exactly 0..k-1.
    cls_list, cls_pos = np.unique(Y, return_inverse=True)
    if len(cls_list) == 1:  # pure
        return 0

    # Total weight per class, in the same order as cls_list.
    cls_count = np.bincount(cls_pos.ravel(), weights=sample_weight, minlength=len(cls_list))

    hist = cls_count / sample_weight.sum()
    print(hist)  # diagnostic: weighted class proportions
    return 1 - np.sum(hist ** 2)

In [255]:
print(sample_weight_gini(np.array([0,0,1,1,1]), np.array([0.1, 1.3, 0.1, 0.1, 0.1])))

[0.82352941 0.17647059]
0.290657439446367


In [256]:
1.4/(1.4+0.3)

0.8235294117647058

In [257]:
def gini(Y, sample_weight):
    """Forward to ``sample_weight_gini`` under the shorter name ``gini``."""
    return sample_weight_gini(Y, sample_weight)
print(gini(np.array([0,0,1,1,1]), np.array([0.1, 1.3, 0.1, 0.1, 0.1])))

[0.82352941 0.17647059]
0.290657439446367


In [296]:
np.ones(10) / 10

array([0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])

In [318]:
np.random.choice(10, size=4, replace=True)

array([0, 9, 7, 7])

In [301]:
error=2.721471623028993e-100
np.log((1 - error) / error)

229.25733652752658

In [325]:
X_train[np.arange(len(X_train))]

array([[ 54.,   1., 192., ...,   0.,   0.,   1.],
       [ 58.,   0., 170., ...,   1.,   0.,   0.],
       [ 56.,   1., 130., ...,   0.,   0.,   1.],
       ...,
       [ 64.,   1., 170., ...,   0.,   0.,   1.],
       [ 44.,   1., 130., ...,   0.,   1.,   0.],
       [ 57.,   1., 150., ...,   0.,   1.,   0.]])

In [326]:
X_train

array([[ 54.,   1., 192., ...,   0.,   0.,   1.],
       [ 58.,   0., 170., ...,   1.,   0.,   0.],
       [ 56.,   1., 130., ...,   0.,   0.,   1.],
       ...,
       [ 64.,   1., 170., ...,   0.,   0.,   1.],
       [ 44.,   1., 130., ...,   0.,   1.,   0.],
       [ 57.,   1., 150., ...,   0.,   1.,   0.]])

In [13]:
# Reseed the global NumPy generator with consecutive seeds, starting at 2020,
# and show the four values (drawn with replacement from 0..9) that each seed
# produces.
seed = 2020
for i in range(15):
    np.random.seed(seed)
    draw = np.random.choice(10, size=4, replace=True)
    print(draw)
    seed += 1

[0 8 3 6]
[4 5 9 0]
[0 1 1 0]
[7 9 6 7]
[8 0 0 4]
[2 8 3 3]
[1 6 8 4]
[9 3 8 8]
[2 6 3 3]
[6 2 3 0]
[5 3 3 9]
[6 2 0 0]
[6 6 2 8]
[3 1 4 2]
[5 7 5 6]
