In [12]:
### Chicago crime analysis
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [5]:
# np.random.seed() returns None, so assigning its result would leave `seeds`
# as None. Keep the integer itself in `seeds` (it is handed to
# train_test_split as random_state) and seed the global NumPy RNG with it.
seeds = 1234
np.random.seed(seeds)

def split_data(train_data, test_data, num):
    """
    Split features and target into train and test partitions.

    Despite the parameter names, ``train_data`` is used as the feature set and
    ``test_data`` as the target: they are forwarded, in that order, as the two
    arrays given to ``train_test_split`` (``decision_tree`` calls this with
    ``x`` and ``y``).

    Parameters
    ----------
    train_data : features to split.
    test_data : target values to split alongside the features.
    num : forwarded as ``test_size`` to ``train_test_split``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    # The module-level `seeds` value is passed as random_state.
    x_train, x_test, y_train, y_test = train_test_split(
        train_data, test_data, test_size=num, random_state=seeds
    )
    return x_train, x_test, y_train, y_test

In [6]:
## kfold cross validation

def kfold_cv(x_train, y_train, depths=range(1, 20), folds=5):
    """
    Choose a decision-tree depth by k-fold cross validation.

    For every candidate depth a ``DecisionTreeClassifier(max_depth=depth)`` is
    scored with ``cross_val_score`` and the depth with the highest mean score
    is returned.

    Parameters
    ----------
    x_train : training features.
    y_train : training target.
    depths : iterable of candidate ``max_depth`` values (default 1..19).
    folds : forwarded as ``cv`` to ``cross_val_score`` (default 5).

    Returns
    -------
    The element of ``depths`` with the best mean cross-validation score; on a
    tie the earliest candidate wins (``np.argmax`` returns the first maximum).

    Raises
    ------
    ValueError
        If ``depths`` is empty.
    """
    depths = list(depths)
    if not depths:
        raise ValueError("depths must contain at least one candidate max_depth")

    # cross_val_score fits its own copies of the estimator on each fold, so
    # fitting the model on the full training set beforehand is wasted work.
    mean_scores = [
        np.mean(
            cross_val_score(
                tree.DecisionTreeClassifier(max_depth=depth),
                x_train,
                y_train,
                cv=folds,
            )
        )
        for depth in depths
    ]
    return depths[int(np.argmax(mean_scores))]

In [9]:
def cv(num, model, x_test, y_test):
    """
    Cross-validate ``model`` on the given data and tabulate the fold scores.

    Parameters
    ----------
    num : forwarded as ``cv`` to ``cross_val_score`` (the number of folds).
    model : estimator handed to ``cross_val_score``.
    x_test : features to cross-validate on.
    y_test : target values matching ``x_test``.

    Returns
    -------
    pandas.DataFrame
        One row per fold with columns ``"index"`` (1-based fold number) and
        ``"cross validation score"`` (that fold's score).
    """
    fold_scores = cross_val_score(model, x_test, y_test, cv=num)
    # Keep the per-fold scores: averaging them first would broadcast one
    # number down every row of the table. Size the fold index from the scores
    # themselves rather than from an unrelated name.
    return pd.DataFrame({
        "index": list(range(1, len(fold_scores) + 1)),
        "cross validation score": fold_scores,
    })

In [11]:
### build decision tree

def decision_tree(feature_columns, df, max_depth=4):
    """
    Fit a decision tree on ``df`` and report its predictions on a held-out split.

    The target is ``df.Arrest``; the features are the columns named in
    ``feature_columns``. The data is split with ``split_data(x, y, 0.1)``, a
    ``DecisionTreeClassifier`` is fitted on the training part and used to
    predict the test part.

    Parameters
    ----------
    feature_columns : list-like of column labels in ``df`` to use as features.
    df : DataFrame holding the feature columns and an ``Arrest`` column.
    max_depth : maximum tree depth (default 4). No depth search is run here;
        a depth chosen elsewhere (e.g. by ``kfold_cv``) can be passed in.

    Returns
    -------
    tuple
        ``(pred_dict, model, cv_df)`` where ``pred_dict`` is a copy of the test
        features with added ``'target'`` and ``'prediction'`` columns,
        ``model`` is the fitted classifier and ``cv_df`` is the result of
        ``cv(10, model, x_test, y_test)``.
    """
    # `.loc[:, cols]` selects columns; `.loc[: cols]` would be a row slice.
    x = df.loc[:, feature_columns]
    y = df.Arrest
    x_train, x_test, y_train, y_test = split_data(x, y, 0.1)

    # fit decision tree
    model = tree.DecisionTreeClassifier(max_depth=max_depth)
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    ## prediction summary
    # DataFrame.copy() is deep by default, so the split data stays untouched.
    pred_dict = x_test.copy()
    pred_dict['target'] = y_test
    pred_dict['prediction'] = pred

    cv_df = cv(10, model, x_test, y_test)
    return pred_dict, model, cv_df


In [15]:
## feature importance

# https://datascience.stackexchange.com/questions/6683/feature-selection-using-feature-importances-in-random-forests-with-scikit-learn?rq=1

def feature_importance(model, feature_columns):
    """
    Tabulate a tree model's un-normalised feature importances.

    Parameters
    ----------
    model : object exposing ``tree_.compute_feature_importances``
        (presumably a fitted scikit-learn decision tree).
    feature_columns : column labels, one per importance value.

    Returns
    -------
    pandas.DataFrame
        A single-row frame whose columns are ``feature_columns`` and whose
        values are the importances computed with ``normalize=False``.
    """
    raw_importances = model.tree_.compute_feature_importances(normalize=False)
    single_row = [raw_importances]
    return pd.DataFrame(data=single_row, columns=feature_columns)

"""
another way to select important features

def selectKImportance(model, X, k=5):
     return X[:,model.feature_importances_.argsort()[::-1][:k]]

"""

'\nanother way to select importance features\n\ndef selectKImportance(model, X, k=5):\n     return X[:,model.feature_importances_.argsort()[::-1][:k]]\n\n'