## Introduction to Decision Trees

### Explore the dataset
We'll be looking at individual income in the United States. The data is from the 1994 census, and contains information on an individual's marital status, age, type of work, and more. The target column, or what we want to predict, is whether individuals make less than or equal to 50k a year, or more than 50k a year.

In [2]:
import pandas as pd
import numpy as np
import math

In [3]:
import pandas

# Set index_col to False to avoid pandas thinking that the first column is row indexes (it's age)
income = pandas.read_csv("income.csv", index_col=False)

In [4]:
print(income.head(5))

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0          2174             0              40   United-States   

In [5]:
income.shape

(32561, 15)

In [6]:
income.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'high_income'],
      dtype='object')

### Converting the categorical variables to numeric variables

In [7]:
categorical_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 
                    'race', 'sex', 'native_country', 'high_income']


In [8]:
# Convert a single column from text categories to numbers
col = pandas.Categorical(income["workclass"])
income["workclass"] = col.codes
print(income["workclass"].head(5))

0    7
1    6
2    4
3    4
4    4
Name: workclass, dtype: int8


In [9]:
for cat in categorical_cols:
    income[cat] = pd.Categorical(income[cat]).codes

In [10]:
print(income['sex'].head())

0    1
1    1
2    1
3    1
4    0
Name: sex, dtype: int8


### Creating Splits in Data

In [11]:
income_filter = income['workclass'] ==4
private_incomes = income[income_filter]
private_incomes.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23,0


In [12]:
private_incomes.shape

(22696, 15)

In [13]:
public_incomes = income[income_filter == False]
public_incomes.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39,1
11,30,7,141297,9,13,2,10,0,1,1,0,0,40,19,1
16,25,6,176756,11,9,4,5,3,4,1,0,0,35,39,0


In [14]:
public_incomes.shape

(9865, 15)

### Entropy Calculation

In [15]:
low_income_prob = income[income["high_income"] == 0].shape[0] / income.shape[0]
high_income_prob = income[income["high_income"] == 1].shape[0] / income.shape[0]
low_income_prob

0.7591904425539756

In [16]:
income_entropy = -(low_income_prob * math.log(low_income_prob, 2) + high_income_prob * math.log(high_income_prob, 2))
income_entropy

0.7963839552022132

### Information Gain

In [29]:
# A function to calculate entropy

def calc_entropy(column):
    """
    Calculate entropy given a pandas series, list, or numpy array.
    """
    # Compute the counts of each unique value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)
    
    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities, and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            entropy += prob * math.log(prob, 2)
    
    return -entropy

In [30]:
import numpy as np
# compute the median of age
mean_age = income['age'].median()

#Then, assign anything less than or equal to the median to the left branch, and anything greater than the median to the right branch.
income['split_age'] = np.where((income['age'] > mean_age),1,0) 

In [31]:
#probability for left and right split
left_split_prob = np.sum(income['split_age']==0)/income.shape[0]
right_split_prob = np.sum(income['split_age']==1)/income.shape[0]

#left = income[income["age"] <= income["age"].median()]
#right = income[income["age"] > income["age"].median()]

In [32]:
#first calculate entropy of high_income i.e the target
income_entropy = calc_entropy(income['high_income'])
income_entropy

0.7963839552022132

In [38]:
age_information_gain = income_entropy 
- (left_split_prob*calc_entropy(income[income['split_age']==0]['high_income']) 
+ right_split_prob*calc_entropy(income[income['split_age']==1]['high_income']))

-0.7493552938975212

In [39]:
age_information_gain

0.7963839552022132

### Finding the best variable to split a node 

In [40]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    
    # Find the median of the column we're splitting
    column = data[split_name]
    median = column.median()
    
    # Make two subsets of the data, based on the median
    left_split = data[column <= median]
    right_split = data[column > median]
    
    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    
    # Return information gain
    return original_entropy - to_subtract

# Verify that our answer is the same as on the last screen
print(calc_information_gain(income, "age", "high_income"))


0.047028661304691965


In [42]:
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

#list to contain information gain from different column with the objective to find best variable(which is having higets information gain) to split the data
information_gains =[]

for col in columns:
    information_gain = calc_information_gain(income, col, 'high_income')
    
    #fill the list
    information_gains.append(information_gain)
    #information_gains[col] = information_gain
    
information_gains   

[0.047028661304691965,
 0.006810984054396618,
 0.06501298413277423,
 0.1114272573715438,
 0.0015822303843424645,
 0.04736241665026941,
 0.0,
 0.0,
 0.04062246867123487,
 0.00013457344495848567]

In [43]:
#find the column with higest information_gain
max_gain_index = information_gains.index(max(information_gains))
max_gain_index

3

In [44]:
#if information_gains is a dictionary
#highest_gain = max(information_gains, key = information_gains.get)
highest_gain = columns[max_gain_index]
highest_gain

'marital_status'