In [5]:
import pandas

# Column labels for the income file, listed in file order. The file carries no
# header row of its own (hence header=None below).
names = [
    "age", "workclass", "fnlwgt", "education", "education_num",
    "marital_status", "occupation", "relationship", "race", "sex",
    "capital_gain", "capital_loss", "hours_per_week", "native_country",
    "high_income",
]
# index_col=False tells pandas not to use the first column as the row index.
income = pandas.read_csv("income.csv", names=names, header=None, index_col=False)
print(income.head())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0          2174             0              40   United-States   

In [6]:
# Convert the categorical variables in our dataset to numeric variables.
# Building a pandas.Categorical from a column assigns every distinct value an
# integer code, exposed through `.codes`. The constructor is used directly
# because the older Categorical.from_array helper was deprecated and then
# removed from pandas.
convert_list = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 
                'race', 'sex', 'native_country', 'high_income']
for column in convert_list:
    col = pandas.Categorical(income[column])
    # Overwrite the text column with its integer codes.
    income[column] = col.codes
    print(income[column].head())

0    7
1    6
2    4
3    4
4    4
Name: workclass, dtype: int8
0     9
1     9
2    11
3     1
4     9
Name: education, dtype: int8
0    4
1    2
2    0
3    2
4    2
Name: marital_status, dtype: int8
0     1
1     4
2     6
3     6
4    10
Name: occupation, dtype: int8
0    1
1    0
2    1
3    0
4    5
Name: relationship, dtype: int8
0    4
1    4
2    4
3    2
4    2
Name: race, dtype: int8
0    1
1    1
2    1
3    1
4    0
Name: sex, dtype: int8
0    39
1    39
2    39
3    39
4     5
Name: native_country, dtype: int8
0    0
1    0
2    0
3    0
4    0
Name: high_income, dtype: int8


In [8]:
# Compute the entropy of the high_income column in the income dataframe
import math

# Count the rows whose high_income code is 1 and turn that into a share of all
# rows; the remaining share belongs to every other row.
high_income = sum(income['high_income'] == 1)
total = len(income)
high_ratio = high_income / total
low_ratio = 1 - high_ratio

# Two-class entropy in bits: -(p_high * log2(p_high) + p_low * log2(p_low)).
income_entropy = -sum(ratio * math.log(ratio, 2) for ratio in (high_ratio, low_ratio))

print(income_entropy)

0.796383955202


In [9]:
# Compute the information gain for splitting on the age column of income
import numpy

def calc_entropy(column):
    """Return the Shannon entropy, in bits, of a column of integer labels.

    Parameters
    ----------
    column : pandas Series, list, or numpy array
        Non-negative integer class labels (anything numpy.bincount accepts).

    Returns
    -------
    float
        -sum(p * log2(p)) over the labels that actually occur. Returns 0.0 for
        an empty column and for a column holding a single distinct label.
    """
    total = len(column)
    if total == 0:
        # Nothing to measure; also avoids dividing by zero below.
        return 0.0

    counts = numpy.bincount(column)
    # bincount reports a zero count for every integer below the maximum label
    # that never occurs (e.g. [1, 1, 1] -> [0, 3]). Those bins contribute
    # nothing to the entropy, and math.log(0, 2) would raise ValueError, so
    # they are dropped before taking logarithms.
    probabilities = counts[counts > 0] / total

    entropy = 0.0
    for prob in probabilities:
        entropy -= prob * math.log(prob, 2)
    return entropy

# Split the rows at the median age: at-or-below goes left, strictly above goes right.
median_age = numpy.median(income['age'])
left_split = income.loc[income['age'] <= median_age]
right_split = income.loc[income['age'] > median_age]

# Entropy of the target inside each side of the split and over the whole frame.
left_entropy = calc_entropy(left_split['high_income'])
right_entropy = calc_entropy(right_split['high_income'])
total_entropy = calc_entropy(income['high_income'])

# Information gain = overall entropy minus the row-share-weighted entropy of the two sides.
age_information_gain = total_entropy - (
    len(left_split) / len(income) * left_entropy
    + len(right_split) / len(income) * right_entropy
)
print(age_information_gain)

0.0470286613047


Make a list called information_gains. It should contain, in order, the information gain from splitting on these columns: age, workclass, education_num, marital_status, occupation, relationship, race, sex, hours_per_week, native_country.

Find the highest value in the information_gains list. Assign the name of the column with the highest information gain to highest_gain.

In [11]:
def calc_information_gain(data_set, split_name, target_name):
    """Information gain from splitting ``data_set`` at the median of one column.

    Rows whose ``split_name`` value is at or below the column median form one
    group and rows strictly above it form the other. The result is the entropy
    of ``target_name`` over all rows minus the entropies of the two groups,
    each weighted by the fraction of rows it holds.
    """
    split_values = data_set[split_name]
    threshold = numpy.median(split_values)
    at_or_below = data_set[split_values <= threshold]
    above = data_set[split_values > threshold]

    row_count = len(data_set)
    weighted_child_entropy = (
        len(at_or_below) / row_count * calc_entropy(at_or_below[target_name])
        + len(above) / row_count * calc_entropy(above[target_name])
    )
    return calc_entropy(data_set[target_name]) - weighted_child_entropy

# Candidate columns to split on, in the order their gains are recorded.
columns = [
    "age", "workclass", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]
# One information-gain value per candidate, position-aligned with `columns`.
information_gains = [
    calc_information_gain(income, name, 'high_income') for name in columns
]

# Position of the (first) largest gain, then the column name at that position.
index = information_gains.index(max(information_gains))
highest_gain = columns[index]
print(highest_gain)

marital_status
