# Imports

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd 
import time
from math import log

------------------------------

# Helper Function

In [2]:
def set_array(inps):
    '''
    @param inps: sliced data frame
    Converts dataframe into numpy array for easy work out
    '''
    inps = np.asarray(inps)
    #squeeze to remove indexing column from pandas
    return np.squeeze(inps)

def entropy(target_col, n_class):
    elements,counts = np.unique(target_col,return_counts = True)
    ent = 0
    for i in range(len(elements)):
        ent += (-counts[i]/np.sum(counts))*log(counts[i]/np.sum(counts), n_class)
    return ent
    

# Data Processing

In [3]:
input_path = 'car.csv'
input_ds = pd.read_csv(input_path, header = None)
n_cols = len(input_ds.columns)
print("Number of columns:", n_cols)

Number of columns: 7


## Naming columns 

In [4]:
#initiate empty list for column name
col_name = []

for i in range(n_cols):
    #last column is the target value
    if (i == n_cols-1):
        col_name.append('class')
    else:
        col_name.append('att{}'.format(i))
        
print(col_name)

['att0', 'att1', 'att2', 'att3', 'att4', 'att5', 'class']


In [5]:
#assigning column names to data set
input_ds.columns = col_name
print (input_ds.head())

    att0   att1 att2 att3   att4  att5  class
0  vhigh  vhigh    2    2  small   low  unacc
1  vhigh  vhigh    2    2  small   med  unacc
2  vhigh  vhigh    2    2  small  high  unacc
3  vhigh  vhigh    2    2    med   low  unacc
4  vhigh  vhigh    2    2    med   med  unacc


In [6]:
arr = input_ds.iloc[:,0:n_cols-1]
value = input_ds.iloc[:,n_cols-1]

att = set_array(att)
value = set_array(value)

In [7]:
for i in range(att.shape[1]):
    print("Attributes in att{} is: {}".format(i, set(att[:,i])))

n_class = len(set(value))
print("Attributes in value is:", set(value))
print("Number of class is: ", n_class)

Attributes in att0 is: {'vhigh', 'low', 'high', 'med'}
Attributes in att1 is: {'vhigh', 'low', 'high', 'med'}
Attributes in att2 is: {'4', '5more', '3', '2'}
Attributes in att3 is: {'more', '4', '2'}
Attributes in att4 is: {'small', 'med', 'big'}
Attributes in att5 is: {'med', 'low', 'high'}
Attributes in value is: {'vgood', 'acc', 'good', 'unacc'}
Number of class is:  4


In [8]:
entropy(value, n_class)

0.6028704850060875

In [9]:
vals,counts= np.unique(att[:,5],return_counts=True)

In [22]:
test_02 = np.stack((att[:,5], value)).reshape(-1,2)
print(test_02)

[['low' 'med']
 ['high' 'low']
 ['med' 'high']
 ...
 ['unacc' 'good']
 ['vgood' 'unacc']
 ['good' 'vgood']]


In [24]:
tmp_02 = np.where(test_02[:,0] == vals[0], test_02[:,1], 0)
tmp_02 = tmp_02[tmp_02!=0]
print(tmp_02)

['low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low'
 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'low' 'l

In [10]:
w_entropy = 0
for i in vals:
    split_att = np.where(att[:,5]== i, att[:,5], 0)
    split_att = split_att[split_att!=0]
    w_entropy += entropy(split_att,n_class)

In [11]:
test = np.stack((att[:,5], value))
tmp = np.where(test[0]== vals[0], test, 0)
tmp = tmp[tmp!=0]
print(tmp)

['high' 'high' 'high' ... 'good' 'vgood' 'vgood']


In [12]:
tmp = np.where(att[:,5]== vals[0], att[:,5], 0)
tmp = tmp[tmp!=0]

-----------------------------------------------

In [18]:
input_ds.where(input_ds['att5']==vals[0]).dropna()['class']

2       unacc
5       unacc
8       unacc
11      unacc
14      unacc
17      unacc
20      unacc
23      unacc
26      unacc
29      unacc
32      unacc
35      unacc
38      unacc
41      unacc
44      unacc
47      unacc
50      unacc
53      unacc
56      unacc
59      unacc
62      unacc
65      unacc
68      unacc
71      unacc
74      unacc
77      unacc
80      unacc
83      unacc
86      unacc
89      unacc
        ...  
1640    unacc
1643     good
1646    vgood
1649    unacc
1652    unacc
1655    unacc
1658     good
1661     good
1664    vgood
1667     good
1670    vgood
1673    vgood
1676    unacc
1679    unacc
1682    unacc
1685     good
1688    vgood
1691    vgood
1694     good
1697    vgood
1700    vgood
1703    unacc
1706    unacc
1709    unacc
1712     good
1715    vgood
1718    vgood
1721     good
1724    vgood
1727    vgood
Name: class, Length: 576, dtype: object