# Logit Classification Model
The objective of this notebook is to build logit models to classify crime incidences into: drug violation or not or medical assistance or not medical assistance.

In [1]:
#this code makes all output to be shown
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
crime = pd.read_csv("C:\\Users\\willi\\Dropbox\\working\\RAW_DATA\\Boston Crime\\crime.csv", encoding ='raw_unicode_escape')
crime.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


#### Map of Police Districts in Boston
One of the variables in the data frame is "DISTRICT" which is shown here:
<img src="files/BPD_locations.jpg">

In [3]:
crime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319073 entries, 0 to 319072
Data columns (total 17 columns):
INCIDENT_NUMBER        319073 non-null object
OFFENSE_CODE           319073 non-null int64
OFFENSE_CODE_GROUP     319073 non-null object
OFFENSE_DESCRIPTION    319073 non-null object
DISTRICT               317308 non-null object
REPORTING_AREA         319073 non-null object
SHOOTING               1019 non-null object
OCCURRED_ON_DATE       319073 non-null object
YEAR                   319073 non-null int64
MONTH                  319073 non-null int64
DAY_OF_WEEK            319073 non-null object
HOUR                   319073 non-null int64
UCR_PART               318983 non-null object
STREET                 308202 non-null object
Lat                    299074 non-null float64
Long                   299074 non-null float64
Location               319073 non-null object
dtypes: float64(2), int64(4), object(11)
memory usage: 41.4+ MB


In [55]:
pd.crosstab(index=crime["YEAR"], columns=crime["DISTRICT"])

DISTRICT,A1,A15,A7,B2,B3,C11,C6,D14,D4,E13,E18,E5
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015,6015,1027,2426,8687,5617,7364,3941,3280,7204,2801,2740,2158
2016,10923,1986,4130,15706,11145,13603,7073,6279,12953,5559,5223,4017
2017,11375,2167,4264,15680,11195,13281,7247,6509,13157,5514,5612,4309
2018,7404,1325,2724,9872,7485,8282,5199,4059,8601,3662,3773,2755


In [3]:
#how many years do we have?
pd.crosstab(index=crime["YEAR"], columns="count")
#2015 till 2018

col_0,count
YEAR,Unnamed: 1_level_1
2015,53388
2016,99114
2017,100886
2018,65685


###### Idea for later
For each district or subarea (windows) we can make a time series counting the occurence of crimes, and model them for example using a Bayesian estimation. For example using discrete time series.

###### Take only 2015 for analysis

In [4]:
crime2015 = crime[crime.YEAR==2015]

In [5]:
size = crime2015.shape[0]
size #size of sample
size/10 #size of test set

53388

5338.8

In [6]:
sequence = np.arange(0,size)
sequence

array([    0,     1,     2, ..., 53385, 53386, 53387])

In [7]:
seq2 = np.ndarray.tolist(sequence)
seq2

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [8]:
type(seq2)

list

In [9]:
type(sequence)

numpy.ndarray

In [10]:
k = size/10
k = int(k)
k

5338

In [12]:
import random 
random.seed(9001)
random_sample = random.sample(seq2, k)
#random_sample = random.sample([1,2,3], 1)
random_sample

[2431,
 19774,
 16585,
 4470,
 36962,
 16442,
 37308,
 39723,
 33622,
 3545,
 46701,
 43282,
 17663,
 40337,
 41410,
 35270,
 41557,
 49710,
 50845,
 23222,
 19087,
 38575,
 33720,
 4865,
 35390,
 29105,
 47746,
 50409,
 14432,
 39912,
 29878,
 48441,
 25691,
 52505,
 14609,
 16209,
 32215,
 8237,
 20029,
 14499,
 10695,
 25694,
 41615,
 24607,
 34530,
 34075,
 18221,
 11316,
 19827,
 32454,
 8157,
 51447,
 25967,
 41631,
 33692,
 174,
 45410,
 14146,
 41414,
 363,
 31809,
 40356,
 47380,
 32809,
 18266,
 2846,
 27947,
 36139,
 22880,
 7490,
 37358,
 46696,
 44918,
 15607,
 23646,
 33854,
 46426,
 46054,
 18856,
 19149,
 24181,
 787,
 47282,
 48065,
 33480,
 18966,
 49755,
 28322,
 3893,
 20200,
 51432,
 27534,
 38885,
 28823,
 7673,
 12560,
 52467,
 50684,
 32482,
 42186,
 7235,
 1780,
 52042,
 34312,
 5166,
 9141,
 43219,
 52873,
 23287,
 21204,
 24796,
 15280,
 34762,
 20092,
 44330,
 48725,
 8216,
 16815,
 6474,
 27025,
 28044,
 33526,
 49983,
 27811,
 12029,
 41342,
 48113,
 43593

In [13]:
len(random_sample)

5338

In [15]:
#create vector saying 1 for test and 0 for training
testvec = np.zeros(size)
testvec[random_sample] = 1

In [16]:
type(testvec)

numpy.ndarray

In [17]:
#transform into boolean values
testvec = testvec>0

In [18]:
pd.crosstab(index=testvec, columns="count")

col_0,count
row_0,Unnamed: 1_level_1
False,48050
True,5338


In [19]:
#make vector for training set
trainvec = ~testvec

In [21]:
pd.crosstab(index=trainvec, columns="count")

col_0,count
row_0,Unnamed: 1_level_1
False,5338
True,48050


In [22]:
#create training and test datasets
crime2015train = crime2015[trainvec]
crime2015test = crime2015[testvec]
crime2015train.shape
crime2015test.shape

(48050, 17)

(5338, 17)

In [23]:
crime2015.head(50)

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
4363,I182066132,2629,Harassment,HARASSMENT,B3,455.0,,2015-07-31 23:27:00,2015,7,Friday,23,Part Two,RADCLIFFE ST,42.300202,-71.078354,"(42.30020194, -71.07835353)"
8793,I182061268,3201,Property Lost,PROPERTY - LOST,,,,2015-06-15 00:00:00,2015,6,Monday,0,Part Three,BERNARD,-1.0,-1.0,"(-1.00000000, -1.00000000)"
14715,I182054888,2647,Other,THREATS TO DO BODILY HARM,B2,326.0,,2015-07-12 15:37:00,2015,7,Sunday,15,Part Two,FAYSTON ST,42.312243,-71.075499,"(42.31224327, -71.07549901)"
16645,I182052842,1102,Fraud,FRAUD - FALSE PRETENSE / SCHEME,D4,619.0,,2015-12-20 14:00:00,2015,12,Sunday,14,Part Two,WESTLAND AVE,42.343917,-71.089675,"(42.34391716, -71.08967541)"
16646,I182052842,619,Larceny,LARCENY ALL OTHERS,D4,619.0,,2015-12-20 14:00:00,2015,12,Sunday,14,Part One,WESTLAND AVE,42.343917,-71.089675,"(42.34391716, -71.08967541)"
24888,I182044114,1107,Fraud,FRAUD - IMPERSONATION,E18,486.0,,2015-07-01 12:00:00,2015,7,Wednesday,12,Part Two,OAKCREST RD,42.264521,-71.104292,"(42.26452149, -71.10429211)"
29257,I182039429,1107,Fraud,FRAUD - IMPERSONATION,C6,226.0,,2015-11-26 08:00:00,2015,11,Thursday,8,Part Two,E FOURTH ST,42.334717,-71.038678,"(42.33471684, -71.03867801)"
35652,I182032622,2647,Other,THREATS TO DO BODILY HARM,D4,129.0,,2015-10-10 13:17:00,2015,10,Saturday,13,Part Two,BERKELEY ST,42.349976,-71.072426,"(42.34997636, -71.07242619)"
36860,I182031354,1102,Fraud,FRAUD - FALSE PRETENSE / SCHEME,D14,791.0,,2015-09-15 11:00:00,2015,9,Tuesday,11,Part Two,ALLSTON ST,42.344388,-71.140586,"(42.34438811, -71.14058580)"
50414,I182016943,3201,Property Lost,PROPERTY - LOST,C11,366.0,,2015-08-20 08:00:00,2015,8,Thursday,8,Part Three,EDWIN ST,42.289433,-71.060522,"(42.28943324, -71.06052190)"


In [24]:
crime2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53388 entries, 4363 to 319072
Data columns (total 17 columns):
INCIDENT_NUMBER        53388 non-null object
OFFENSE_CODE           53388 non-null int64
OFFENSE_CODE_GROUP     53388 non-null object
OFFENSE_DESCRIPTION    53388 non-null object
DISTRICT               53260 non-null object
REPORTING_AREA         53388 non-null object
SHOOTING               185 non-null object
OCCURRED_ON_DATE       53388 non-null object
YEAR                   53388 non-null int64
MONTH                  53388 non-null int64
DAY_OF_WEEK            53388 non-null object
HOUR                   53388 non-null int64
UCR_PART               53387 non-null object
STREET                 52106 non-null object
Lat                    51207 non-null float64
Long                   51207 non-null float64
Location               53388 non-null object
dtypes: float64(2), int64(4), object(11)
memory usage: 7.3+ MB


In [25]:
#the 20250 dont have reporting are, not very useful information
pd.crosstab(index=crime["REPORTING_AREA"], columns="count")

col_0,count
REPORTING_AREA,Unnamed: 1_level_1
,20250
000,13
1,36
10,189
100,352
...,...
961,32
962,59
97,351
98,353


In [26]:
#create hour dummy
hour_dummy = pd.get_dummies(crime2015["HOUR"])
hour_dummy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
4363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8793,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14715,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
16645,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
16646,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [27]:
hour_dummy.columns = ["h0",	"h1",	"h2",	"h3",	"h4",	"h5",	"h6",	"h7",	"h8",	"h9",	"h10",	"h11",	"h12",	"h13",	"h14",	"h15",	"h16",	"h17",	"h18",	"h19",	"h20",	"h21",	"h22",	"h23"] 
hour_dummy.head()

Unnamed: 0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,h14,h15,h16,h17,h18,h19,h20,h21,h22,h23
4363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8793,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14715,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
16645,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
16646,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [28]:
pd.crosstab(index=crime2015["HOUR"], columns="count")

col_0,count
HOUR,Unnamed: 1_level_1
0,2752
1,1643
2,1330
3,803
4,539
5,567
6,802
7,1476
8,2095
9,2527


In [29]:
#create dummies for month
month_dummy = pd.get_dummies(crime2015["MONTH"])
month_dummy.columns = ["m06",	"m07",	"m08",	"m09",	"m10",	"m11",	"m12"] 
month_dummy.head()
#there are only data begining in 06, june

Unnamed: 0,m06,m07,m08,m09,m10,m11,m12
4363,0,1,0,0,0,0,0
8793,1,0,0,0,0,0,0
14715,0,1,0,0,0,0,0
16645,0,0,0,0,0,0,1
16646,0,0,0,0,0,0,1


In [30]:
#create dummies for day of the week
weekdays_dummy = pd.get_dummies(crime2015["DAY_OF_WEEK"])
weekdays_dummy.head(5)

Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
4363,1,0,0,0,0,0,0
8793,0,1,0,0,0,0,0
14715,0,0,0,1,0,0,0
16645,0,0,0,1,0,0,0
16646,0,0,0,1,0,0,0


In [31]:
#create dummies for district dummies
district_dummy = pd.get_dummies(crime2015["DISTRICT"])
district_dummy.head()

Unnamed: 0,A1,A15,A7,B2,B3,C11,C6,D14,D4,E13,E18,E5
4363,0,0,0,0,1,0,0,0,0,0,0,0
8793,0,0,0,0,0,0,0,0,0,0,0,0
14715,0,0,0,1,0,0,0,0,0,0,0,0
16645,0,0,0,0,0,0,0,0,1,0,0,0
16646,0,0,0,0,0,0,0,0,1,0,0,0


In [32]:
#create dummies for crime type
crime_dummy = pd.get_dummies(crime2015["OFFENSE_CODE_GROUP"])
crime_dummy.head()

Unnamed: 0,Aggravated Assault,Aircraft,Arson,Assembly or Gathering Violations,Auto Theft,Auto Theft Recovery,Ballistics,Bomb Hoax,Burglary - No Property Taken,Commercial Burglary,...,Restraining Order Violations,Robbery,Search Warrants,Service,Simple Assault,Towed,Vandalism,Verbal Disputes,Violations,Warrant Arrests
4363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8793,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16645,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16646,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
X1 = pd.concat([district_dummy, month_dummy, weekdays_dummy,hour_dummy], axis=1)
X1['intercept'] = 1.0  # so we don't need to use sm.add_constant every time
X1.shape
X1.head()

(53388, 51)

Unnamed: 0,A1,A15,A7,B2,B3,C11,C6,D14,D4,E13,...,h15,h16,h17,h18,h19,h20,h21,h22,h23,intercept
4363,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1.0
8793,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
14715,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1.0
16645,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1.0
16646,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1.0


In [34]:
X1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53388 entries, 4363 to 319072
Data columns (total 51 columns):
A1           53388 non-null uint8
A15          53388 non-null uint8
A7           53388 non-null uint8
B2           53388 non-null uint8
B3           53388 non-null uint8
C11          53388 non-null uint8
C6           53388 non-null uint8
D14          53388 non-null uint8
D4           53388 non-null uint8
E13          53388 non-null uint8
E18          53388 non-null uint8
E5           53388 non-null uint8
m06          53388 non-null uint8
m07          53388 non-null uint8
m08          53388 non-null uint8
m09          53388 non-null uint8
m10          53388 non-null uint8
m11          53388 non-null uint8
m12          53388 non-null uint8
Friday       53388 non-null uint8
Monday       53388 non-null uint8
Saturday     53388 non-null uint8
Sunday       53388 non-null uint8
Thursday     53388 non-null uint8
Tuesday      53388 non-null uint8
Wednesday    53388 non-null uint8
h0 

In [35]:
X2 = X1.drop(['Sunday','A1','m06','h0'],  axis=1)
X2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53388 entries, 4363 to 319072
Data columns (total 47 columns):
A15          53388 non-null uint8
A7           53388 non-null uint8
B2           53388 non-null uint8
B3           53388 non-null uint8
C11          53388 non-null uint8
C6           53388 non-null uint8
D14          53388 non-null uint8
D4           53388 non-null uint8
E13          53388 non-null uint8
E18          53388 non-null uint8
E5           53388 non-null uint8
m07          53388 non-null uint8
m08          53388 non-null uint8
m09          53388 non-null uint8
m10          53388 non-null uint8
m11          53388 non-null uint8
m12          53388 non-null uint8
Friday       53388 non-null uint8
Monday       53388 non-null uint8
Saturday     53388 non-null uint8
Thursday     53388 non-null uint8
Tuesday      53388 non-null uint8
Wednesday    53388 non-null uint8
h1           53388 non-null uint8
h2           53388 non-null uint8
h3           53388 non-null uint8
h4 

In [36]:
import scipy as sp
import statsmodels.api as sm
import statsmodels

In [37]:
X2train = X2[trainvec]
ytrain = crime_dummy["Drug Violation"][trainvec]

X2test = X2[testvec]
ytest_dv = crime_dummy["Drug Violation"][testvec] #Drug Violation (dv)

In [39]:
model = sm.Logit(ytrain,X2train)
result_dv = model.fit()
result_dv.summary()

  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully.
         Current function value: inf
         Iterations 8




0,1,2,3
Dep. Variable:,Drug Violation,No. Observations:,48050.0
Model:,Logit,Df Residuals:,48003.0
Method:,MLE,Df Model:,46.0
Date:,"Fri, 13 Sep 2019",Pseudo R-squ.:,inf
Time:,10:57:15,Log-Likelihood:,-inf
converged:,True,LL-Null:,0.0
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
A15,0.0123,0.139,0.088,0.930,-0.261,0.285
A7,0.2508,0.093,2.694,0.007,0.068,0.433
B2,-0.3467,0.074,-4.681,0.000,-0.492,-0.202
B3,-0.3294,0.083,-3.958,0.000,-0.492,-0.166
C11,0.0441,0.072,0.614,0.539,-0.097,0.185
C6,0.3063,0.080,3.851,0.000,0.150,0.462
D14,-0.8931,0.123,-7.274,0.000,-1.134,-0.652
D4,-0.4929,0.080,-6.140,0.000,-0.650,-0.336
E13,0.0515,0.093,0.553,0.580,-0.131,0.234


In [40]:
X2train = X2[trainvec]
ytrain = crime_dummy["Medical Assistance"][trainvec]

#X2test = X2[testvec]
ytest_ma = crime_dummy["Medical Assistance"][testvec] #medical assistance (ma)

In [41]:
#model = sm.Logit(ytrain,X2train)
#change argument for optimization
model = sm.Logit(ytrain,X2train)
result_ma = model.fit()
result_ma.summary()

  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully.
         Current function value: inf
         Iterations 7




0,1,2,3
Dep. Variable:,Medical Assistance,No. Observations:,48050.0
Model:,Logit,Df Residuals:,48003.0
Method:,MLE,Df Model:,46.0
Date:,"Fri, 13 Sep 2019",Pseudo R-squ.:,inf
Time:,11:04:31,Log-Likelihood:,-inf
converged:,True,LL-Null:,0.0
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
A15,0.5559,0.149,3.724,0.000,0.263,0.849
A7,0.6991,0.107,6.545,0.000,0.490,0.908
B2,0.3450,0.086,4.012,0.000,0.176,0.514
B3,0.3533,0.094,3.778,0.000,0.170,0.537
C11,0.6027,0.085,7.077,0.000,0.436,0.770
C6,0.5534,0.098,5.665,0.000,0.362,0.745
D14,0.6404,0.100,6.395,0.000,0.444,0.837
D4,0.0976,0.093,1.047,0.295,-0.085,0.280
E13,0.3050,0.114,2.676,0.007,0.082,0.528


###### Idea apply later
We can use count data of each region and make a simple VAR with daily data of types of crime

In [42]:
#define function for prediction
def logPredict(modelParams, X):  
    probabilities = modelParams.predict(X)
    return [1 if x >= 0.05 else 0 for x in probabilities]

In [43]:
predictions_dv = logPredict(result_dv, X2test)
predictions_ma = logPredict(result_ma, X2test)

In [44]:
predictions_dv_array = np.asarray(predictions_dv)
predictions_ma_array = np.asarray(predictions_ma)

In [45]:
pd.crosstab(index=predictions_dv_array, columns="count")
pd.crosstab(index=predictions_ma_array, columns="count")

col_0,count
row_0,Unnamed: 1_level_1
0,2654
1,2684


col_0,count
row_0,Unnamed: 1_level_1
0,1904
1,3434


### Accuracy
It’s the ratio of the correctly labeled subjects to the whole pool of subjects.

Accuracy is the most intuitive one.

Accuracy answers the following question: How many crime incidence did we correctly label out of all the crime incidences?

Accuracy = (TP+TN)/(TP+FP+FN+TN)

numerator: all correctly labeled subject (All trues)

denominator: all subjects

In [46]:
accuracy = np.mean(predictions_dv == ytest_dv)
print ('Drug Violation model accuracy = {0}%'.format(accuracy*100))
accuracy = np.mean(predictions_ma == ytest_ma)
print ('Medical Assistance model accuracy = {0}%'.format(accuracy*100))

Drug Violation model accuracy = 52.49156987635819%
Medical Assistance model accuracy = 38.04795803671787%


### Precision
Precision is the ratio of the correctly +ve labeled by our program to all +ve labeled.

Precision answers the following: How many of those crime incidences we labeled as drug violation are actually drug violations?

Precision = TP/(TP+FP)

numerator: +ve labeled crime violations.

denominator: all +ve labeled by our program (whether they’re drug violantions or not in reality).

In [47]:
#calculate TP
pred_dv = predictions_dv == ytest_dv
true_results_dv = ytest_dv[pred_dv]

pred_ma = predictions_ma == ytest_ma
true_results_ma = ytest_ma[pred_ma]

pd.crosstab(index=true_results_dv, columns="count")
pd.crosstab(index=true_results_ma, columns="count")

col_0,count
Drug Violation,Unnamed: 1_level_1
0,2564
1,238


col_0,count
Medical Assistance,Unnamed: 1_level_1
0,1804
1,227


In [48]:
TP_dv = np.sum(true_results_dv==1)
TN_dv = np.sum(true_results_dv==0)

TP_ma = np.sum(true_results_ma==1)
TN_ma = np.sum(true_results_ma==0)

In [49]:
#calculate false results
false_results_dv = ytest_dv[~pred_dv]
false_results_ma = ytest_ma[~pred_ma]
pd.crosstab(index=false_results_dv, columns="count")
pd.crosstab(index=false_results_ma, columns="count")

col_0,count
Drug Violation,Unnamed: 1_level_1
0,2446
1,90


col_0,count
Medical Assistance,Unnamed: 1_level_1
0,3207
1,100


In [50]:
FP_dv = np.sum(false_results_dv==0)
FN_dv = np.sum(false_results_dv==1)

FP_ma = np.sum(false_results_ma==0)
FN_ma = np.sum(false_results_ma==1)

In [51]:
precision = TP_dv/(TP_dv+FP_dv)
print ('Drug Violation model precision = {0}%'.format(precision*100))
precision = TP_ma/(TP_ma+FP_ma)
print ('Medical Assistance model precision = {0}%'.format(precision*100))

Drug Violation model precision = 8.86736214605067%
Medical Assistance model precision = 6.6103669190448455%


### Recall (aka Sensitivity)

Recall is the ratio of the correctly +ve labeled by our model to all crime incidences that are drug violations in reality.

Recall answers the following question: Of all the incidences that are drug violations, how many of those we correctly predicted?

Recall = TP/(TP+FN)

numerator: +ve labeled drug violations incidences.

denominator: all incidences that are drug violations (whether detected by our model or not)

In [52]:
recall = TP_dv/(TP_dv+FN_dv)
print ('Drug Violation model recall = {0}%'.format(recall*100))
recall = TP_ma/(TP_ma+FN_ma)
print ('Medical Assistance model recall = {0}%'.format(recall*100))

Drug Violation model recall = 72.5609756097561%
Medical Assistance model recall = 69.41896024464832%


### F1-score (aka F-Score / F-Measure)

F1 Score considers both precision and recall.

It is the harmonic mean(average) of the precision and recall.

F1 Score is best if there is some sort of balance between precision (P) & recall (R) in the system. Oppositely F1 Score isn’t so high if one measure is improved at the 
expense of the other.

For example, if P is 1 & R is 0, F1 score is 0.

F1 Score = 2*(Recall * Precision) / (Recall + Precision)

In [53]:
precision = TP_dv/(TP_dv+FP_dv)
recall = TP_dv/(TP_dv+FN_dv)
F1_score = 2*(recall * precision) / (recall + precision)
print ('Drug Violation model F1_score = {0}%'.format(F1_score*100))
precision = TP_ma/(TP_ma+FP_ma)
recall = TP_ma/(TP_ma+FN_ma)
F1_score = 2*(recall * precision) / (recall + precision)
print ('Medical Assistance model F1_score = {0}%'.format(F1_score*100))

Drug Violation model F1_score = 15.803452855245684%
Medical Assistance model F1_score = 12.071257644243554%


### Specificity

Specificity is the correctly -ve labeled by the model to all that are not drug violations in reality.

Specifity answers the following question: Of all the incidences that are non drug violations, how many of those did we correctly predicted?

Specificity = TN/(TN+FP)

numerator: -ve labeled non drug violations incidences.

denominator: all incidences that are non drug violation in reality (whether +ve or -ve labeled)

In [54]:
specificity = TN_dv/(TN_dv+FP_dv)
print ('Drug Violation model specificity = {0}%'.format(specificity*100))
specificity = TN_ma/(TN_ma+FP_ma)
print ('Medical Assistance model specificity = {0}%'.format(specificity*100))

Drug Violation model specificity = 51.17764471057884%
Medical Assistance model specificity = 36.0007982438635%
