# Machine Learning: Decision Tree

In [28]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
import string
import re # helps you filter urls
from IPython.display import display, Latex, Markdown

In [16]:
# gun violence cases dataset cleaned
cases = pd.read_csv('stage3.csv')
cases = cases.loc[:, ['date', 'state', 'n_killed', 'n_injured']]

cases['date'] = pd.to_datetime(cases['date'])
cases['year'] = cases['date'].dt.year
cases['harmed'] = cases['n_killed'] + cases['n_injured']
cases = cases.drop(columns=['date', 'n_killed', 'n_injured'])
cases = cases[(cases['year'] > 2013) & (cases['year'] < 2018)]
# groupby sorts its keys, so each state's rows end up in chronological order
cases = cases.groupby(['state', 'year']).agg('sum')
cases = cases.reset_index()

# Check if gun cases were reduced based on previous year values (only valid for 2015-2017).
# The previous row is looked up *within each state*, so a state with a missing
# year can neither be compared against a different state nor shift the rows
# that get dropped (a fixed stride of four rows per state would do both).
previous = cases.groupby('state')[['year', 'harmed']].shift(1)

# True = Reduce, False = Increase (or unchanged)
cases['change'] = previous['harmed'] > cases['harmed']

# Keep only rows that have a directly preceding year to compare against.
# This removes every 2014 row (no earlier year survives the filter above) and
# any row whose previous year is missing from the data.
has_previous_year = previous['year'] == cases['year'] - 1
cases = cases[has_previous_year]
cases = cases[(cases['state'] != 'District of Columbia')]
cases.head(5)

Unnamed: 0,state,year,harmed,change
1,Alabama,2015,947,False
2,Alabama,2016,1249,False
3,Alabama,2017,1400,False
5,Alaska,2015,154,False
6,Alaska,2016,191,False


In [26]:
# gun laws dataset cleaned
# Load the law records and keep only years strictly between 2014 and 2018.
laws = (
    pd.read_csv('DATABASE_0.csv')
    .loc[lambda frame: frame['year'].gt(2014) & frame['year'].lt(2018)]
)
laws.head(5)

Unnamed: 0,state,year,felony,invcommitment,invoutpatient,danger,drugmisdemeanor,alctreatment,alcoholism,relinquishment,...,expartedating,dvrosurrender,dvrosurrendernoconditions,dvrosurrenderdating,expartesurrender,expartesurrendernoconditions,expartesurrenderdating,dvroremoval,stalking,lawtotal
24,Alabama,2015,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,10
25,Alabama,2016,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,10
26,Alabama,2017,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,10
54,Alaska,2015,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
55,Alaska,2016,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [27]:
# Combined dataset
# Inner join: keep only (state, year) pairs present in BOTH tables. An outer
# join would also keep unmatched rows, filling the missing side with NaN and
# leaving rows without a label or without features in the training data.
data = cases.merge(laws, on=['state', 'year'], how='inner')
# Drop the join key 'year' plus the raw count and the law total, leaving
# 'state', the 'change' label and the individual law columns.
data = data.drop(columns=['year', 'harmed', 'lawtotal'])
data.head(5)

Unnamed: 0,state,change,felony,invcommitment,invoutpatient,danger,drugmisdemeanor,alctreatment,alcoholism,relinquishment,...,exparte,expartedating,dvrosurrender,dvrosurrendernoconditions,dvrosurrenderdating,expartesurrender,expartesurrendernoconditions,expartesurrenderdating,dvroremoval,stalking
0,Alabama,False,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Alabama,False,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Alabama,False,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Alaska,False,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alaska,False,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Create the features and labels for classification.

In [35]:
def create_features_labels(dataset):
    """Split a merged dataset into a feature matrix and a label vector.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a 'state' column, a boolean 'change' column and one
        column per gun-law category.

    Returns
    -------
    X : pandas.DataFrame
        Every column of ``dataset`` except 'state' and 'change'.
    y : pandas.Series
        The 'change' column as integers (True -> 1, False -> 0).

    Raises
    ------
    KeyError
        If 'state' or 'change' is missing from ``dataset``.
    ValueError
        If 'change' contains missing values, which cannot be cast to int.
    """
    # Select features by name, not by position, so the label can never end up
    # among the features if the column order of the dataset changes.
    X = dataset.drop(columns=['state', 'change'])    # Features are all categories of gun laws
    # Labels are if the number of cases reduced; cast directly to int instead
    # of replace(), which relies on deprecated silent dtype downcasting.
    y = dataset['change'].astype(int)
    return X, y

# Build the feature matrix X and the label vector y from the merged dataset
X, y = create_features_labels(data)

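Create a baseline classifier that always predicts the mode (most common value) of the training labels, to test our classifier against. Note that the baseline class is named `DecisionTreeClassifier`, which shadows the scikit-learn class of the same name imported at the top of the notebook.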

In [17]:
# A baseline classifier to predict the mode of the training labels
# NOTE(review): this class name shadows `DecisionTreeClassifier` imported from
# sklearn.tree; once this cell runs, the name refers to this baseline instead.
class DecisionTreeClassifier():
    """Majority-class baseline: always predicts the most frequent training label.

    Attributes:
        mode: the label returned for every prediction. It is 0 until ``fit``
            has seen at least one label.
    """

    # Initialize parameter for the classifier
    def __init__(self):
        self.mode = 0

    # Fit the data by taking training data X and their labels y and
    # storing the learned parameter
    def fit(self, X, y):
        """Learn the most frequent label in ``y``.

        Args:
            X: training features; unused, accepted so the call looks like
                any other classifier's ``fit``.
            y: training labels, either a pandas Series or any iterable of
                hashable values.

        With an empty ``y`` the mode falls back to 0. Ties are resolved in
        favour of the label that appears first in ``y``.
        """
        from collections import Counter  # local import keeps this cell self-contained

        labels = y.tolist() if hasattr(y, 'tolist') else list(y)
        self.mode = Counter(labels).most_common(1)[0][0] if labels else 0

    # Predict the label for each instance X as the learned parameter
    def predict(self, X):
        """Return a list holding the learned mode once per instance of ``X``.

        ``len(X)`` is used on purpose: iterating over a DataFrame yields its
        column names, which would give one prediction per column, not per row.
        """
        return [self.mode] * len(X)

    # Calculate the accuracy of our classifier using
    # the true and predicted labels
    def evaluate_accuracy(self, y, y_predict):
        """Return the fraction of predictions that equal the true labels.

        Args:
            y: true labels, either a pandas Series or any iterable.
            y_predict: predicted labels, in the same order as ``y``.

        Returns:
            A float between 0 and 1.

        Raises:
            ValueError: if there are no predictions, or if ``y`` and
                ``y_predict`` have different lengths.
        """
        true_labels = y.tolist() if hasattr(y, 'tolist') else list(y)
        predicted = list(y_predict)
        total = len(predicted)

        if total == 0:
            raise ValueError('cannot compute accuracy without any predictions')
        if len(true_labels) != total:
            raise ValueError(
                'y and y_predict differ in length: %d vs %d' % (len(true_labels), total)
            )

        accurate_pred = sum(
            1 for truth, guess in zip(true_labels, predicted) if truth == guess
        )
        return accurate_pred / total

In [None]:
# Fit the baseline on the full dataset and score it on that same data
baseline = DecisionTreeClassifier()
baseline.fit(X, y)
labels = baseline.predict(X)
# evaluate_accuracy is a method of the classifier, not a module-level function
print(baseline.evaluate_accuracy(y, labels))