In [1]:
import csv
from zipfile import ZipFile
from io import TextIOWrapper
import pandas as pd
import random

# for tree building
import os, math
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [2]:
# NOTE(review): star import kept as-is because other cells may rely on names it
# provides; this cell only needs ZippedCSVReader and Bank. The import cell has
# already bound `tree` to `sklearn.tree` -- confirm the local `tree` module does
# not export a conflicting `tree` name.
from tree import *

reader = ZippedCSVReader("loans.zip")
b = Bank(None, reader)

# Race indicator fields copied from each loan record.
race_columns = ["American Indian or Alaska Native", "Asian",
                "Black or African American",
                "Native Hawaiian or Other Pacific Islander", "White"]

# Fields kept for every loan: numeric/categorical features, race indicators,
# and the target ("decision").
columns = ["amount", "purpose", "income"] + race_columns + ["decision"]

rows = []
missing_race = set()
for loan in b.loan_iter():
    rows.append({c: loan[c] for c in columns})
    # Record race values that have no matching indicator column. Checking
    # against race_columns only (rather than every feature name) avoids
    # treating a value such as "amount" as a known race.
    if loan["race"] not in race_columns:
        missing_race.add(loan["race"])
missing_race

{'Information not provided by applicant in mail, Internet, or telephone application',
 'Not applicable'}

# Tree Building

In [16]:
def build_tree(df, outfile, depth=7, random_state=None):
    """Fit a decision tree on loan data and write its text form to ``outfile``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'decision' column ('approve' maps to class 1, anything
        else to class 0). Optional columns handled specially:
        'amount' / 'income' (coerced to int, unparsable values mean-imputed)
        and 'purpose' (one-hot encoded). Any other column is passed to the
        classifier unchanged -- presumably already numeric; verify at caller.
        The input frame is not modified.
    outfile : binary file-like object
        Open, writable byte stream (e.g. from ``ZipFile.open(name, "w")``).
        The tree text is written UTF-8 encoded; the stream is left open for
        the caller to close.
    depth : int, default 7
        ``max_depth`` of the decision tree.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split`` and ``DecisionTreeClassifier``.
        ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    None
    """
    df = df.copy()

    # convert 'decision' to int
    df['decision'] = (df['decision'] == 'approve').astype(int)

    def convert(x):
        # None, non-numeric strings, NaN and inf all become NaN so they can be
        # imputed later instead of aborting the whole build.
        try:
            return int(x)
        except (ValueError, TypeError, OverflowError):
            return np.nan

    # convert amount and income to int; imputation happens after the split
    for col in ("amount", "income"):
        if col in df.columns:
            df[col] = df[col].apply(convert)

    # OneHot encode 'purpose'
    # could also remove this step and add 'Home improvement', 'Home purchase',
    #       and 'Refinancing' to column list in place of 'purpose'
    if "purpose" in df.columns:
        enc = OneHotEncoder(handle_unknown='ignore')
        encoded = enc.fit_transform(df[['purpose']]).toarray()
        # Reuse df's index so the join lines rows up even when df does not
        # carry a default 0..n-1 index; categories_[0] gives flat column names.
        one_hot = pd.DataFrame(encoded, columns=enc.categories_[0], index=df.index)
        df = df.join(one_hot)
        df = df.drop(['purpose'], axis=1)

    # split into X, y; only the training portion (80%) is used to fit the tree,
    # the held-out portion is discarded
    X = df.drop(columns=['decision'])
    y = df['decision']
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # own copy, so the column assignments below never write into a view
    X_train = X_train.copy()

    # mean-impute the values that could not be parsed as integers
    for col in ("income", "amount"):
        if col in X_train.columns:
            imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            # fit_transform returns an (n, 1) array; flatten before assigning
            X_train[col] = imp.fit_transform(X_train[[col]]).ravel()

    dct1 = tree.DecisionTreeClassifier(
        max_depth=depth, class_weight="balanced", random_state=random_state
    )
    dct1.fit(X_train, y_train)

    # should look normal once y isn't all the same class
    dct_text = tree.export_text(dct1, feature_names=X_train.columns.tolist())

    # Write bytes directly: a temporary TextIOWrapper is only flushed when it
    # is garbage-collected, and it closes the caller's stream when that happens.
    outfile.write(dct_text.encode("utf-8"))

In [17]:
# Hand-written reference tree in the same layout as sklearn's export_text output.
simple = """
|--- amount <= 200
|   |--- income <= 35
|   |   |--- class: 0
|   |--- income >  35
|   |   |--- class: 1
|--- amount >  200
|   |--- income <= 70
|   |   |--- class: 0
|   |--- income >  70
|   |   |--- class: 1
""".strip()

# Seed both RNGs so the sampled rows, the chosen column pairs and the fitted
# trees are the same on every Restart & Run All (scikit-learn falls back to
# numpy's global RNG when no random_state is given).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

with ZipFile("trees-new.zip", "w") as zf:
    with zf.open("simple.txt", "w") as f:
        # Write bytes directly; a throw-away TextIOWrapper is only flushed
        # when it happens to be garbage-collected.
        f.write(simple.encode("utf-8"))

    df = pd.DataFrame(random.sample(rows, 10000))

    # "good" tree: financial features only
    with zf.open("good.txt", "w") as f:
        build_tree(df[["amount", "purpose", "income", "decision"]], f, 5)

    # "bad" tree: every collected column, including the race indicators
    with zf.open("bad.txt", "w") as f:
        build_tree(df, f, 5)

    # seven shallow trees, each on a fresh 1000-row sample and a random pair
    # of the three financial features
    for i in range(7):
        df = pd.DataFrame(random.sample(rows, 1000))
        with zf.open(f"tree{i+1}.txt", "w") as f:
            cols = random.sample(["amount", "purpose", "income"], 2) + ["decision"]
            print(cols)
            build_tree(df[cols], f, 3)

['income', 'purpose', 'decision']
['purpose', 'income', 'decision']
['amount', 'income', 'decision']
['amount', 'purpose', 'decision']
['purpose', 'income', 'decision']
['amount', 'purpose', 'decision']
['amount', 'purpose', 'decision']


In [18]:
# Preview the first rows of `df`. At this point `df` is the last frame assigned
# by the previous cell, i.e. the final 1000-row sample drawn in its tree loop,
# not the 10000-row sample used for the "good"/"bad" trees.
df.head()

Unnamed: 0,amount,purpose,income,American Indian or Alaska Native,Asian,Black or African American,Native Hawaiian or Other Pacific Islander,White,decision
0,241,Home purchase,48,0,0,0,0,1,approve
1,140,Refinancing,35,0,0,0,0,1,approve
2,46,Refinancing,33,0,0,0,0,1,approve
3,134,Home improvement,75,0,0,0,0,1,deny
4,22,Home purchase,0,0,0,0,0,0,approve
