In [7]:
import pandas as pd
import sys

sys.path.append("../")

from src.information_gain import (
    calculate_information_gain,
    calculate_information_gain_ratio,
)

In [2]:
# Load the case-study CSV into a DataFrame and preview its first three rows.
casestudy_csv = "../data/casestudy_data.csv"
df = pd.read_csv(casestudy_csv)
df.head(n=3)


Unnamed: 0,visit_id,date_time,experience,success,zipcode,pro,repeat_visit,days_since_last_visit,new_movers,year_home_built,home_market_value,length_of_residence,net_worth,income,mkt_organic_product_purchasers_value,mkt_trend_env_focused_hh_value,high_end_shoppers_value,do_it_yourselfer_value,montrd_home_security_sys_own_value,mkt_green_product_purchasers_value
0,14167,5/31/19 10:36,version1,0,30080,0,0,,0,1987.0,320.0,8.0,250000.0,100000.0,29.0,25.0,25.0,55.0,28.0,39.0
1,195581,5/31/19 10:37,version1,0,32081,1,0,,0,2006.0,366.0,4.0,250000.0,125000.0,40.0,28.0,23.0,60.0,18.0,46.0
2,451142,5/31/19 10:37,version2,0,91124,0,0,,0,1988.0,504.0,13.0,500000.0,125000.0,21.0,12.0,16.0,76.0,32.0,31.0


In [26]:
# Check the column names: display the Index that holds the label of every
# column in `df`.
df.columns


Index(['visit_id', 'date_time', 'experience', 'success', 'zipcode', 'pro',
       'repeat_visit', 'days_since_last_visit', 'new_movers',
       'year_home_built', 'home_market_value', 'length_of_residence',
       'net_worth', 'income', 'mkt_organic_product_purchasers_value',
       'mkt_trend_env_focused_hh_value', 'high_end_shoppers_value',
       'do_it_yourselfer_value', 'montrd_home_security_sys_own_value',
       'mkt_green_product_purchasers_value'],
      dtype='object')

In [4]:
# Compare the frame's shape before and after removing fully identical rows;
# equal shapes mean there are no exact duplicate rows.
# NOTE(review): the value_counts output recorded in this notebook shows every
# visit_id occurring exactly once, so whole-row duplicates cannot occur;
# consider repeating the check with visit_id excluded.
shape_before = df.shape
shape_after = df.drop_duplicates().shape
print("size before dropping duplicates:", shape_before)
print("size after dropping duplicates:", shape_after)


size before dropping duplicates: (513863, 20)
size after dropping duplicates: (513863, 20)


In [24]:
# Check the null value ratio of each column.
# The missing-value counts are computed for all columns at once with
# vectorised pandas operations; the builtin `sum()` over a Series would loop
# over every element in Python, once per column.
null_fraction_by_column = df.isna().sum() / len(df)

# Keep the (ratio, column) tuple layout so the list sorts by ratio first and
# by column name second; `float()` keeps plain Python floats in the display.
null_value_ratio = [
    (float(ratio), column) for column, ratio in null_fraction_by_column.items()
]
null_value_ratio.sort(reverse=True)
null_value_ratio

[(0.20985165306706263, 'days_since_last_visit'),
 (0.023278578142423175, 'year_home_built'),
 (0.023278578142423175, 'net_worth'),
 (0.023278578142423175, 'montrd_home_security_sys_own_value'),
 (0.023278578142423175, 'mkt_trend_env_focused_hh_value'),
 (0.023278578142423175, 'mkt_organic_product_purchasers_value'),
 (0.023278578142423175, 'mkt_green_product_purchasers_value'),
 (0.023278578142423175, 'length_of_residence'),
 (0.023278578142423175, 'income'),
 (0.023278578142423175, 'home_market_value'),
 (0.023278578142423175, 'high_end_shoppers_value'),
 (0.023278578142423175, 'do_it_yourselfer_value'),
 (0.0014926157361008674, 'zipcode'),
 (0.0, 'visit_id'),
 (0.0, 'success'),
 (0.0, 'repeat_visit'),
 (0.0, 'pro'),
 (0.0, 'new_movers'),
 (0.0, 'experience'),
 (0.0, 'date_time')]

In [28]:
# Print the frequency table of every column, followed by the column's name.
for column_name in df.columns:
    frequencies = df[column_name].value_counts()
    print(frequencies, f"{column_name}\n")


14167     1
191702    1
331337    1
148651    1
458868    1
         ..
254749    1
513800    1
318450    1
257197    1
454425    1
Name: split, Length: 513863, dtype: int64 visit_id

6/24/19 13:23    51
6/27/19 12:41    47
6/27/19 9:29     47
6/28/19 11:20    47
6/27/19 13:53    47
                 ..
6/16/19 1:23      1
6/11/19 1:00      1
6/16/19 1:16      1
6/16/19 1:15      1
5/31/19 10:36     1
Name: split, Length: 43182, dtype: int64 date_time

version2    257284
version1    256579
Name: split, dtype: int64 experience

0    488312
1     25551
Name: members, dtype: int64 success

30080      17902
97229      10764
30339       6014
7205        4921
94043       4178
           ...  
12828          1
w1j 0aa        1
56331          1
33190          1
42408          1
Name: split, Length: 15038, dtype: int64 zipcode

0    421771
1     92092
Name: split, dtype: int64 pro

1    416487
0     97376
Name: split, dtype: int64 repeat_visit

less than 1 day      230358
less than 7 days      9

In [21]:
# Rank every other column by its information gain with respect to `success`.
col_y = "success"
# Walk the columns in the DataFrame's own order instead of going through a
# set difference, so the evaluation order is the same on every run.
cols_x = [c for c in df.columns if c != col_y]

# Pass copies to the helper: the value_counts output recorded in this notebook
# shows Series named "split" / "members", which suggests the helper renames the
# Series it receives. Copies keep that side effect away from `df`.
# NOTE(review): identifier-like columns (visit_id, date_time) are included and
#   rank high; confirm whether they should be scored at all.
# NOTE(review): the recorded output contains negative gains, which a correct
#   information gain cannot produce. Presumably related to how the helper
#   treats missing values; confirm in src/information_gain.
information_gain = [
    (
        round(
            calculate_information_gain(
                members=df[col_y].copy(), split=df[c].copy()
            ),
            4,
        ),
        c,
    )
    for c in cols_x
]
# Highest gain first; ties are ordered by column name (descending).
information_gain.sort(reverse=True)
information_gain


[(0.1977, 'visit_id'),
 (0.11, 'income'),
 (0.0739, 'date_time'),
 (0.0619, 'zipcode'),
 (0.058, 'year_home_built'),
 (0.0336, 'days_since_last_visit'),
 (0.0233, 'home_market_value'),
 (0.0129, 'length_of_residence'),
 (0.0078, 'net_worth'),
 (0.0065, 'mkt_organic_product_purchasers_value'),
 (0.0041, 'mkt_green_product_purchasers_value'),
 (0.001, 'pro'),
 (0.0006, 'repeat_visit'),
 (0.0, 'new_movers'),
 (0.0, 'experience'),
 (-0.0008, 'montrd_home_security_sys_own_value'),
 (-0.0056, 'mkt_trend_env_focused_hh_value'),
 (-0.0073, 'do_it_yourselfer_value'),
 (-0.0123, 'high_end_shoppers_value')]

In [22]:
# Rank every other column by its information gain ratio with respect to
# `success`.
col_y = "success"
# Walk the columns in the DataFrame's own order instead of going through a
# set difference, so the evaluation order is the same on every run.
cols_x = [c for c in df.columns if c != col_y]

# Pass copies to the helper: the value_counts output recorded in this notebook
# shows Series named "split" / "members", which suggests the helper renames the
# Series it receives. Copies keep that side effect away from `df`.
# NOTE(review): the recorded output contains negative ratios; presumably
#   related to how the helper treats missing values. Confirm in
#   src/information_gain.
information_gain_ratio = [
    (
        round(
            calculate_information_gain_ratio(
                members=df[col_y].copy(), split=df[c].copy()
            ),
            4,
        ),
        c,
    )
    for c in cols_x
]
# Highest ratio first; ties are ordered by column name (descending).
information_gain_ratio.sort(reverse=True)
information_gain_ratio


[(0.0742, 'income'),
 (0.0301, 'days_since_last_visit'),
 (0.0151, 'year_home_built'),
 (0.015, 'visit_id'),
 (0.0145, 'length_of_residence'),
 (0.0074, 'zipcode'),
 (0.0071, 'date_time'),
 (0.0059, 'net_worth'),
 (0.0039, 'home_market_value'),
 (0.0022, 'pro'),
 (0.0017, 'mkt_organic_product_purchasers_value'),
 (0.0012, 'repeat_visit'),
 (0.0011, 'mkt_green_product_purchasers_value'),
 (0.0, 'new_movers'),
 (0.0, 'experience'),
 (-0.0002, 'montrd_home_security_sys_own_value'),
 (-0.0014, 'mkt_trend_env_focused_hh_value'),
 (-0.0019, 'do_it_yourselfer_value'),
 (-0.0034, 'high_end_shoppers_value')]

In [12]:
# Show how often each `experience` value occurs among the rows with
# success == 0 and among the rows with success == 1.
for outcome in (0, 1):
    experience_in_subset = df.loc[df["success"] == outcome, "experience"]
    print(experience_in_subset.value_counts(), f"success={outcome}\n")

version2    244505
version1    243807
Name: experience, dtype: int64 success=0

version2    12779
version1    12772
Name: experience, dtype: int64 success=1

