In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Location of the New York "Offenses Known to Law Enforcement by City, 2013" CSV on GitHub.
data_path = "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv"

In [60]:
# Load the CSV at `data_path` into a DataFrame with pandas' default parsing options;
# no rows are skipped at this stage.
raw_data = pd.read_csv(filepath_or_buffer=data_path)

In [64]:
## Clean up the data - the first three rows hold bad data and the row after them
## holds the real column headers. The cells below look columns up by those header
## names, so this step has to run on a fresh kernel.
## The guard keeps the cell safe to re-run: once the real headers (which include
## 'City') are installed, the slicing is skipped instead of dropping more rows.
if 'City' not in raw_data.columns:
    ## Remove the first three rows
    raw_data = raw_data.iloc[3:]
    ## Make the first remaining row the column headers
    raw_data.columns = raw_data.iloc[0]
    ## Remove that header row from the data
    raw_data = raw_data.iloc[1:]

In [67]:
# Remove the bad column.
# `axis` is passed by keyword: the positional form `drop(label, 1)` is deprecated
# in pandas 1.3 and rejected by pandas 2.0, while `axis=1` works on every version.
raw_data = raw_data.drop('Rape\n(revised\ndefinition)1', axis=1)

In [24]:
# List the current column labels; several header names from the source table
# contain embedded newlines, which is why they are renamed further down.
raw_data.columns

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson3'],
      dtype='object', name=3)

In [65]:
# Map the multi-line header labels onto short identifiers that are easier to code with.
column_renames = {
    'Violent\ncrime': 'ViolentCrime',
    'Murder and\nnonnegligent\nmanslaughter': 'Murder',
    'Rape\n(legacy\ndefinition)2': 'Rape',
    'Aggravated\nassault': 'AggravatedAssault',
    'Property\ncrime': 'PropertyCrime',
    'Larceny-\ntheft': 'LarcenyTheft',
    'Motor\nvehicle\ntheft': 'MotorTheft',
    'Arson3': 'Arson',
}
raw_data = raw_data.rename(columns=column_renames)

In [74]:
# Check the column dtypes before cleaning; the comma-stripping cell below relies on
# the `.str` accessor, so the columns are expected to still be string (object) typed here.
raw_data.dtypes

3
City                 object
Population           object
ViolentCrime         object
Murder               object
Rape                 object
Robbery              object
AggravatedAssault    object
PropertyCrime        object
Burglary             object
LarcenyTheft         object
MotorTheft           object
Arson                object
dtype: object

In [80]:
# Clean data to remove commas (thousands separators) from every numeric-looking column.
# Columns are processed in the same order as they appear in the table.
comma_columns = [
    'Population',
    'ViolentCrime',
    'Murder',
    'Rape',
    'Robbery',
    'AggravatedAssault',
    'PropertyCrime',
    'Burglary',
    'LarcenyTheft',
    'MotorTheft',
    'Arson',
]
for column_name in comma_columns:
    raw_data[column_name] = raw_data[column_name].str.replace(',', '')

In [120]:
# Clean data to convert each comma-free string column to float, one column at a time.
float_columns = (
    'Population',
    'ViolentCrime',
    'Murder',
    'Rape',
    'Robbery',
    'AggravatedAssault',
    'PropertyCrime',
    'Burglary',
    'LarcenyTheft',
    'MotorTheft',
    'Arson',
)
for float_column in float_columns:
    raw_data[float_column] = raw_data[float_column].astype(float)

In [92]:
# Preview the first five rows of the cleaned table.
raw_data.head()

3,City,Population,ViolentCrime,Murder,Rape,Robbery,AggravatedAssault,PropertyCrime,Burglary,LarcenyTheft,MotorTheft,Arson
4,Adams Village,1861.0,0.0,0.0,0.0,0.0,0.0,12.0,2.0,10.0,0.0,0.0
5,Addison Town and Village,2577.0,3.0,0.0,0.0,0.0,3.0,24.0,3.0,20.0,1.0,0.0
6,Akron Village,2846.0,3.0,0.0,0.0,0.0,3.0,16.0,1.0,15.0,0.0,0.0
7,Albany,97956.0,791.0,8.0,30.0,227.0,526.0,4090.0,705.0,3243.0,142.0,
8,Albion Village,6388.0,23.0,0.0,3.0,4.0,16.0,223.0,53.0,165.0,5.0,


In [121]:
# Keep only the columns used for modelling; `filter` selects the listed column labels.
model_columns = ['City', 'Population', 'Murder', 'Robbery']
model_data = raw_data.filter(items=model_columns, axis=1)

In [122]:
# Re-check the dtypes after the float conversion cell; every column except City
# should now report float64.
raw_data.dtypes

3
City                  object
Population           float64
ViolentCrime         float64
Murder               float64
Rape                 float64
Robbery              float64
AggravatedAssault    float64
PropertyCrime        float64
Burglary             float64
LarcenyTheft         float64
MotorTheft           float64
Arson                float64
dtype: object

In [129]:
# Check the dtypes of the modelling subset.
# NOTE(review): Population_2 is only added by the feature-creation cell at the end of
# this notebook, so on a top-to-bottom run it will not be listed here yet.
model_data.dtypes

3
City             object
Population      float64
Murder          float64
Robbery         float64
Population_2    float64
dtype: object

In [131]:
# Preview the modelling subset.
# NOTE(review): the Population_2 column and the capped Murder/Robbery values come from
# the feature-creation cell that follows; run that cell first to see them here.
model_data.head()

3,City,Population,Murder,Robbery,Population_2
4,Adams Village,1861.0,0.0,0.0,3463321.0
5,Addison Town and Village,2577.0,0.0,0.0,6640929.0
6,Akron Village,2846.0,0.0,0.0,8099716.0
7,Albany,97956.0,1.0,1.0,9595378000.0
8,Albion Village,6388.0,0.0,1.0,40806540.0


In [132]:
# Create feature variables - Population^2, and Murder/Robbery capped at 1.
model_data['Population_2'] = model_data['Population']**2
# Cap the counts at 1 so that, for whole-number counts, the column becomes a
# 0/1 "any occurred" flag. `Series.clip_upper` was deprecated in pandas 0.24 and
# removed in 1.0; `clip(upper=1)` is the equivalent call on old and new versions.
model_data['Murder'] = model_data['Murder'].clip(upper=1)
model_data['Robbery'] = model_data['Robbery'].clip(upper=1)