In [1]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from datetime import datetime
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Load Data

In [2]:
def gsev(val):
    """
    Flag values that exceed the seven-day threshold.

    Returns 0 when ``val <= 7`` and 1 otherwise. Any value for which the
    ``<=`` comparison is false (NaN included) therefore maps to 1.
    """
    return 0 if val <= 7 else 1

# Load the service-request export; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../../fservice.csv')

# Parse the '%Y-%m-%d' date strings in a single vectorized call rather than a
# per-row strptime lambda. Missing dates become NaT instead of raising.
df['Just Date'] = pd.to_datetime(df['Just Date'], format='%Y-%m-%d')

# Series.apply takes no axis argument: a second positional value is bound to
# `convert_dtype` (deprecated in pandas 2.1), so only the function is passed.
df['Seven'] = df['ElapsedDays'].apply(gsev)

  interactivity=interactivity, compiler=compiler, result=result)


# Parameters 

In [3]:
# Categorical feature columns (encoded before modelling).
c = [
    'Anonymous',
    'AssignTo',
    'RequestType',
    'RequestSource',
    'CD',
    'Direction',
    'ActionTaken',
    'APC',
    'AddressVerified',
]

# Numeric feature columns (passed through as-is).
d = [
    'Latitude',
    'Longitude',
]

# Feature Cleaning 

In [4]:
# Keep only the modelling columns, then drop every row with a missing value.
dfn = df.filter(items = c + d + ['ElapsedDays', 'Seven']).dropna()

# Separate the explanatory variables into categorical and numeric arrays.
XCAT = dfn.filter(items = c).values
XNUM = dfn.filter(items = d).values

# Response variable: True when ElapsedDays <= 7.
# NOTE: this is the opposite polarity of the 'Seven' column, where gsev
# returns 1 for values above 7.
y = dfn['ElapsedDays'] <= 7

# One-hot encode the categorical columns directly. OneHotEncoder accepts
# string categories, so a per-column LabelEncoder pass beforehand is
# unnecessary; skipping it also leaves the raw category labels available on
# `onehotencoder.categories_`.
onehotencoder = OneHotEncoder()
XCAT = onehotencoder.fit_transform(XCAT).toarray()

# Final design matrix: one-hot columns followed by the numeric columns.
X = np.concatenate((XCAT, XNUM), axis=1)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


# Algorithms and Hyperparameters

In [5]:
##Used Random Forest in Final Model 

# Candidate classifiers. The tree-based models are randomized, so they are
# seeded (matching the random_state=0 used for the data splits) to make the
# reported metrics reproducible across kernel restarts.
gnb = GaussianNB()
dc = tree.DecisionTreeClassifier(criterion = 'entropy', max_depth = 20, random_state = 0)
rf = RandomForestClassifier(n_estimators = 50, max_depth = 20, random_state = 0)
lr = LogisticRegression()

# Validation Set

In [6]:
# Hold out 20% of the rows as the test set, then carve 20% of the remainder
# off as the validation set (both splits use random_state=0).
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size = 0.2, random_state = 0)

# Fit the selected model on the training portion only.
classifier = rf
classifier.fit(X_train, y_train)

# Score the fitted model on the validation rows.
y_vpred = classifier.predict(X_val)
val_accuracy = metrics.accuracy_score(y_val, y_vpred)
val_prf = metrics.precision_recall_fscore_support(y_val, y_vpred, average = 'binary')

# Report the validation metrics.
print("Accuracy:", val_accuracy)
print("Precision, Recall, F1Score:", val_prf)

Accuracy: 0.9385983549336814
Precision, Recall, F1Score: (0.946896616482519, 0.9893259382317161, 0.9676463908853341, None)


# Test Set

In [7]:
# No refit here: the classifier fitted in the validation cell is reused.

# Predict on the held-out test rows.
y_tpred = classifier.predict(X_test)

# Compute and report the test-set metrics.
test_accuracy = metrics.accuracy_score(y_test, y_tpred)
test_prf = metrics.precision_recall_fscore_support(y_test, y_tpred, average = 'binary')

print("Accuracy:", test_accuracy)
print("Precision, Recall, F1Score:", test_prf)

Accuracy: 0.9387186223709323
Precision, Recall, F1Score: (0.9468199376863904, 0.9895874917412928, 0.9677314319565967, None)
