In [1]:
import pandas as pd 

### **Reading the Dataset**

stops: The number of police stops between January 1998 and March 1999<br>
past.arrests: The number of arrests that took place in 1997<br>
precinct: Index for the precinct (1 - 75)<br>
eth: Indicator for ethinicity, black (1), hispanic (2), white (3)<br>
crime: Indicator for type of crime, violent (1), weapons (2), property (3), drug (4). 

In [16]:
url = "http://www.stat.columbia.edu/~gelman/arm/examples/police/frisk_with_noise.dat"
df = pd.read_csv(url, skiprows=6, delimiter=" ")
df.head()

Unnamed: 0,stops,pop,past.arrests,precinct,eth,crime
0,75,1720,191,1,1,1
1,36,1720,57,1,1,2
2,74,1720,599,1,1,3
3,17,1720,133,1,1,4
4,37,1368,62,1,2,1


### **Data Pre-processing**

#### Removing Multi-Colinearlity 
- We do this by one-hot encoding. We have to remember that the type of data is a nominal data where there is not order assigned to the attributes 

In [18]:
#introducing black column
df['black'] = df['eth']
df['black'].replace({1:1, 2:0, 3:0}, inplace=True)

#introducing hispanic column
df['hispanic'] = df['eth']
df['hispanic'].replace({1:0, 2:1, 3:0}, inplace=True)

#introducing white column
df['white'] = df['eth']
df['white'].replace({1:0, 2:0, 3:1}, inplace=True)

#introducing violent column
df['violent'] = df['crime']
df['violent'].replace({1:1, 2:0, 3:0, 4:0}, inplace=True)

#introducing weapons column
df['weapons'] = df['crime']
df['weapons'].replace({1:0, 2:1, 3:0, 4:0}, inplace=True)

#introducing property column
df['property'] = df['crime']
df['property'].replace({1:0, 2:0, 3:1, 4:0}, inplace=True)

#introducing drug column
df['drug'] = df['crime']
df['drug'].replace({1:0, 2:0, 3:0, 4:1}, inplace=True)

In [19]:
df.head()

Unnamed: 0,stops,pop,past.arrests,precinct,eth,crime,black,hispanic,white,violent,weapons,property,drug
0,75,1720,191,1,1,1,1,0,0,1,0,0,0
1,36,1720,57,1,1,2,1,0,0,0,1,0,0
2,74,1720,599,1,1,3,1,0,0,0,0,1,0
3,17,1720,133,1,1,4,1,0,0,0,0,0,1
4,37,1368,62,1,2,1,0,1,0,1,0,0,0


### **Train and Test**

In [17]:
from sklearn.linear_model import PoissonRegressor 
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df[['black', 'hispanic', 'white', 'violent', 'weapons', 'property', 'drug']], 
                 df['stops'], test_size=0.2)

In [21]:
poisson = PoissonRegressor()

In [22]:
poisson.fit(X_train, y_train)

In [23]:
poisson.coef_

array([ 0.61041227,  0.18449164, -0.79490399,  0.05624995,  0.68921313,
       -0.11403067, -0.6314325 ])

In [24]:
poisson.intercept_

4.71991548812671

Average Observed Count = exp(intercept) = exp(4.7199) = 112.15<br><br>

Blacks have 1.84 times more stops than non-Blacks<br>
Hispanics have 1.20 times more stops than non-Hispanics<br>
Whites have 0.45 times less stops than non-Whites<br>