In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [2]:
filename = 'cleaned_fraudTrain.csv'

In [3]:
# import Dataset
path = '../dataset/midterm_dataset/'+filename
df = pd.read_csv(path)

row_num = df.shape[0]
feature_num = df.shape[1]

print(df.shape)
df.head()

(1296675, 14)


Unnamed: 0,category,amt,is_male,lat,long,city_pop,job,unix_time,is_fraud,state_city,age,trans_time,trans_year,trans_month
0,misc_net,4.97,0,36.0788,-81.1781,3495,"Psychologist, counselling",1325376018,0,"28654, Moravian Falls, NC",31,Late Night,2019,1
1,grocery_pos,107.23,0,48.8878,-118.2105,149,Special educational needs teacher,1325376044,0,"99160, Orient, WA",41,Late Night,2019,1
2,entertainment,220.11,1,42.1808,-112.262,4154,Nature conservation officer,1325376051,0,"83252, Malad City, ID",57,Late Night,2019,1
3,gas_transport,45.0,1,46.2306,-112.1138,1939,Patent attorney,1325376076,0,"59632, Boulder, MT",52,Late Night,2019,1
4,misc_pos,41.96,1,38.4207,-79.4629,99,Dance movement psychotherapist,1325376186,0,"24433, Doe Hill, VA",33,Late Night,2019,1


In [4]:
label = 'is_fraud'
label

'is_fraud'

In [5]:
features = df.loc[:, df.columns != label].columns.tolist()
print(features)

['category', 'amt', 'is_male', 'lat', 'long', 'city_pop', 'job', 'unix_time', 'state_city', 'age', 'trans_time', 'trans_year', 'trans_month']


<h1> Decision Tree

<h2> Intial Entropy 'is_fraud'

In [532]:
#find intial Entropy 'is_fraud'
print(df[label].value_counts())
p_1_fraud = df[label].value_counts()[1]/len(df[label])
print('Probability Yes :', p_1_fraud)
p_0_fraud = df[label].value_counts()[0]/len(df[label])
print('Probability No :', p_0_fraud)

0    1289169
1       7506
Name: is_fraud, dtype: int64
Probability Yes : 0.005788651743883394
Probability No : 0.9942113482561166


In [533]:
intial_entropy = -p_1_fraud*np.log2(p_1_fraud)-p_0_fraud*np.log2(p_0_fraud)
intial_entropy

0.05135152470435041

In [6]:
def find_prob_each(label, feature, df=df):
    """
    -helper function for group_entropy function
    -and also give probability often its occures compare to the label
    
    return dict of probability of feature
    """

    temp_df = df.groupby([feature, label])
    prob_list = []
    if len(temp_df.size())< 2000 and feature not in ['age', 'lat', 'long', 'city_pop']:  
        for r in range(len(temp_df.size())):
                feature_prob = temp_df.size().values[r]/ \
                    len(df[feature][df[feature] == temp_df.size().index[r][0]])
                prob_feature = {}
                prob_feature[temp_df.size().index[r]] = feature_prob
                featu = ''
                if len(prob_list) == 0:
                    featu = temp_df.size().index[r][0]
                    prob_list.append({featu : [0.0, 0.0]})
                    if temp_df.size().index[r][1] == 1:
                        prob_list[-1][featu][1] = feature_prob
                    else:
                        prob_list[-1][featu][0] = feature_prob
                elif temp_df.size().index[r][0] in prob_list[-1]: #append 2nd value
#                     print(df.groupby([feature, label]).size().index[r][0], r,feature_prob, temp_df.size().index[r][1])
#                     print(prob_list[-1][temp_df.size().index[r][0]])
                    if temp_df.size().index[r][1] == 1:
                        prob_list[-1][temp_df.size().index[r][0]][1] = feature_prob
                    else:
                        prob_list[-1][temp_df.size().index[r][0]][0] = feature_prob
                else:
                    featu = temp_df.size().index[r][0]
                    prob_list.append({featu : [0.0, 0.0]})
                    if temp_df.size().index[r][1] == 1:
                        prob_list[-1][featu][1] = feature_prob
                    else:
                        prob_list[-1][featu][0] = feature_prob
                    
    else: #numeric feature
        greater_than_eq_df = df[[label, feature]][df[feature] >=df[feature].mean()]
        less_than_df = df[[label, feature]][df[feature] < df[feature].mean()]
        condition_dfs = [greater_than_eq_df, less_than_df]
    
        prob_list = [{'greater_than_eq_mean': []}, {'less_than_mean': []}]
    
        for d in condition_dfs:
            for r in range(len(d[label].value_counts())):
                feature_prob = d[label].value_counts()[r]/len(d)
#                 print(feature_prob)
                if  d.equals(greater_than_eq_df):
                    prob_list[0]['greater_than_eq_mean'].append(feature_prob)

                else:
                    prob_list[1]['less_than_mean'].append(feature_prob)
    p_dict = {}
    [p_dict.update(i) for i in prob_list]
    
    return p_dict

In [536]:
import math
def group_entropy(feature, df=df):
    """
    give most pure feature in sorted_feature part
    note: the most pure value = 0
    """
    an_entropy = {}
    prob_dict = find_prob_each(label, feature, df)
    
    for key in prob_dict:
        index0 = prob_dict[key][0]*np.log2(prob_dict[key][0])
        index1 = prob_dict[key][1]*np.log2(prob_dict[key][1])
        if math.isnan(index0): index0 = 0
        if math.isnan(index1): index1 = 0
        an_entropy[key] = -index0 -index1

    output = {
        feature: an_entropy,
        'sorted_feature' : []
    }
    t = [(k, v) for k, v in output[feature].items()]
    t.sort(key=lambda x : x[1])
    output['sorted_feature'] = t

    return output

In [538]:
def info_gain(intial_entropy, feature, df=df):
    """
    float, float, list
    find information gain
    
    output: float
    """
    g_entropy = group_entropy(feature, df)
    total_entropy = 0
    length_df = len(df)
    
    for i in g_entropy[feature]:
        if math.isnan((len(df[feature][df[feature] == i])/length_df) * g_entropy[feature][i]):
            total_entropy += 0
        else:
            total_entropy += (len(df[feature][df[feature] == i])/length_df) * g_entropy[feature][i]
#         print(i)
    return intial_entropy - (total_entropy)

In [540]:
import time
start_time = time.time()
df_ig = {'information_gain': []}

for feature in features:
    df_ig['information_gain'].append(info_gain(intial_entropy, feature))
    print(feature)
df_ig = pd.DataFrame.from_dict(df_ig)
df_ig.index = features
print("--- %s seconds ---" % (time.time() - start_time))
df_ig

category
amt
is_male
lat
long
city_pop


  index0 = prob_dict[key][0]*np.log2(prob_dict[key][0])
  index0 = prob_dict[key][0]*np.log2(prob_dict[key][0])
  index1 = prob_dict[key][1]*np.log2(prob_dict[key][1])
  index1 = prob_dict[key][1]*np.log2(prob_dict[key][1])


job
unix_time


  index0 = prob_dict[key][0]*np.log2(prob_dict[key][0])
  index0 = prob_dict[key][0]*np.log2(prob_dict[key][0])
  index1 = prob_dict[key][1]*np.log2(prob_dict[key][1])
  index1 = prob_dict[key][1]*np.log2(prob_dict[key][1])


state_city
age
trans_time
trans_year
trans_month
--- 441.23811388015747 seconds ---


Unnamed: 0,information_gain
category,0.003107
amt,0.051352
is_male,4.2e-05
lat,0.051352
long,0.051352
city_pop,0.051352
job,0.003134
unix_time,0.051352
state_city,0.007758
age,0.051352


<h2> Information Gain table

In [543]:
df_ig.sort_values(by=['information_gain'], inplace=True, ascending=False)
df_ig

Unnamed: 0,information_gain
amt,0.051352
lat,0.051352
long,0.051352
city_pop,0.051352
unix_time,0.051352
age,0.051352
state_city,0.007758
trans_time,0.005617
job,0.003134
category,0.003107


In [544]:
df_ig.to_csv('info_gain_all_feature.csv')

In [7]:
df_ig = pd.read_csv('info_gain_all_feature.csv')
df_ig = df_ig.rename(columns={'Unnamed: 0' : 'features'})
df_ig.set_index("features", inplace = True)
df_ig

Unnamed: 0_level_0,information_gain
features,Unnamed: 1_level_1
amt,0.051352
lat,0.051352
long,0.051352
city_pop,0.051352
unix_time,0.051352
age,0.051352
state_city,0.007758
trans_time,0.005617
job,0.003134
category,0.003107


<h2> Training

In [8]:
X = df[features]

In [9]:
y = df[label]

In [12]:
#containing dict to decrease workload of model
store_prob_dict = {}

In [13]:
def extract_feature_point(feature, value, df=df, store_prob_dict=store_prob_dict):
    #check type
    if type(value) != str and feature not in ['is_male', 'trans_year', 'trans_month']:
        mean_val = df[feature].mean()
        if value >= mean_val:
            value = 'greater_than_eq_mean'
        else:
            value = 'less_than_mean'
    
    print('value', value)
        
    if feature not in store_prob_dict:
        prob_dict = find_prob_each(label, feature, df)
        store_prob_dict.update({feature : prob_dict})
        print(feature)
        print('store_dict length:', len(store_prob_dict))
        
    
    #append value
    if value not in store_prob_dict[feature]:
        #return 0 when hv no data
        return [0, 0]
    elif value in store_prob_dict[feature]:
        #return point
        return [
            100*store_prob_dict[feature][value][0]*float(df_ig.loc[feature,:]),
            100*store_prob_dict[feature][value][1]*float(df_ig.loc[feature,:])
        ]

In [15]:
def row_desicion_tree_predictor(an_row=[], features=[], df=df, store_prob_dict=store_prob_dict):
    not_fraud = []
    fraud = []
#     an_row = an_df_row.values.tolist()[0]
    for i, feature in enumerate(features):
        point = extract_feature_point(feature, an_row[i], df, store_prob_dict)
        not_fraud.append(point[0])
        fraud.append(point[1])
#         print(feature,':',point,'done')
    print('predicting..')
#     print(1)
    #predicting
    nfra_mean = np.mean(not_fraud)
    fra_mean = np.mean(fraud)
    if nfra_mean >= fra_mean:
        return 0
    else:
        return 1

In [16]:
def desicion_tree_predictor(X, features):
    """
    return list of predicting point
    """
    x_list = X.values.tolist()
    return [row_desicion_tree_predictor(row, features) for row in x_list]

In [None]:
start_time = time.time()
predicted_list = desicion_tree_predictor(X, features)
print("--- %s seconds ---" % (time.time() - start_time))
predicted_list

In [None]:
predicted_list

<H2> Evaluation

In [None]:
# import Dataset
path = '../dataset/midterm_dataset/cleaned_fraudTest.csv'
df_test = pd.read_csv(path)

row_num = df_test.shape[0]
feature_num = df_test.shape[1]

print(df_test.shape)
df_test.head()

In [None]:
X_test[features]
X_test.head()

In [None]:
y_test[label]
y_test.head()

In [None]:
start_time = time.time()
predicted_test_list = desicion_tree_predictor(X_test, features)
print("--- %s seconds ---" % (time.time() - start_time))
predicted_test_list