Devika:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
%matplotlib inline

In [None]:
# Read the raw train and test CSVs from the working directory into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('hackathon_train.csv', 'hackathon_test.csv')
)

In [None]:
# Report the (rows, columns) shape of each split.
for caption, frame in (
    ('Shape of the train data:', train),
    ('Shape of the test data:', test),
):
    print(caption, frame.shape)

Shape of the train data: (658536, 9)
Shape of the test data: (219513, 7)


Extracting features from Dates

In [None]:
# Convert the 'Dates' column of the already-loaded frame to datetime so the
# .dt accessor can be used in the cells below. Converting in place avoids
# reading the whole training CSV from disk a second time just to change the
# dtype of one column.
train['Dates'] = pd.to_datetime(train['Dates'])

In [None]:
# Calendar year of each incident, taken from the parsed timestamp.
train['Year'] = train.Dates.dt.year

In [None]:
# Month number of each incident, taken from the parsed timestamp.
train['Month'] = train.Dates.dt.month

In [None]:
# Hour of day of each incident, taken from the parsed timestamp.
train['Hour'] = train.Dates.dt.hour

In [None]:
# Preview the first five rows, including the derived Year/Month/Hour columns.
train.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Year,Month,Hour
0,2011-12-04 18:15:00,SUSPICIOUS OCC,SUSPICIOUS OCCURRENCE,Sunday,PARK,NONE,100 Block of BEULAH ST,-122.452331,37.767356,2011,12,18
1,2009-01-11 19:57:00,WARRANTS,ENROUTE TO OUTSIDE JURISDICTION,Sunday,MISSION,"ARREST, BOOKED",18TH ST / CAPP ST,-122.418272,37.761903,2009,1,19
2,2007-01-25 18:15:00,NON-CRIMINAL,"AIDED CASE, MENTAL DISTURBED",Thursday,CENTRAL,NONE,1200 Block of STOCKTON ST,-122.408521,37.797492,2007,1,18
3,2012-01-10 08:55:00,ROBBERY,"ROBBERY, BODILY FORCE",Tuesday,NORTHERN,NONE,HAYES ST / FRANKLIN ST,-122.421333,37.77709,2012,1,8
4,2014-05-27 12:25:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Tuesday,TENDERLOIN,NONE,JONES ST / TURK ST,-122.412414,37.783004,2014,5,12


Most common crimes

In [None]:
# Distinct crime categories, in order of first appearance.
train.Category.unique()

array(['SUSPICIOUS OCC', 'WARRANTS', 'NON-CRIMINAL', 'ROBBERY',
       'VANDALISM', 'ASSAULT', 'DRUG/NARCOTIC', 'OTHER OFFENSES',
       'SECONDARY CODES', 'VEHICLE THEFT', 'LARCENY/THEFT', 'BURGLARY',
       'DRUNKENNESS', 'MISSING PERSON', 'RECOVERED VEHICLE', 'LOITERING',
       'PROSTITUTION', 'WEAPON LAWS', 'BAD CHECKS', 'FRAUD',
       'LIQUOR LAWS', 'DRIVING UNDER THE INFLUENCE', 'STOLEN PROPERTY',
       'TRESPASS', 'FORGERY/COUNTERFEITING', 'SEX OFFENSES FORCIBLE',
       'EMBEZZLEMENT', 'DISORDERLY CONDUCT', 'RUNAWAY', 'BRIBERY',
       'ARSON', 'KIDNAPPING', 'EXTORTION', 'FAMILY OFFENSES', 'GAMBLING',
       'SUICIDE', 'SEX OFFENSES NON FORCIBLE', 'PORNOGRAPHY/OBSCENE MAT',
       'TREA'], dtype=object)

In [None]:
# Number of distinct crime categories.
len(train.Category.unique())

39

In [None]:
# Ten most frequent crime categories, most common first.
common_crimes = (
    train['Category']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)
common_crimes.columns = ['Crime', 'Count']

# One bar per category; bar height is the number of recorded incidents.
crime_bars = go.Bar(
    x=common_crimes['Crime'],
    y=common_crimes['Count'],
    opacity=0.8,
)
data = [crime_bars]

py.iplot(data)

Crime Distribution across years

In [None]:
# Number of incidents recorded in each year, largest count first.
train.Year.value_counts()

2013    56767
2014    56213
2003    55531
2004    54988
2012    53730
2005    53189
2008    52561
2006    52439
2009    51604
2007    50927
2010    49970
2011    49843
2015    20774
Name: Year, dtype: int64

In [None]:
# Incidents per year, ordered chronologically for the line chart.
# The counts are sorted by their index (the year) *before* reset_index() so the
# code does not depend on the auto-generated column name: pandas < 2.0 names it
# 'index', while pandas >= 2.0 names it after the source column ('Year'), which
# makes sort_values(by='index') raise a KeyError.
value_counts = train['Year'].value_counts()
year_counts = value_counts.sort_index().reset_index()
year_counts.columns = ['Year','Count']

# Single line trace: x is the year, y is the number of incidents in that year.
points = go.Scatter(
    x = year_counts.Year,
    y = year_counts.Count
)

data = [points]

py.iplot(data)

Crime Distribution based on months

In [None]:
# Incident totals per calendar month, busiest month first.
month = (
    train['Month']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
)
month.columns = ['Month', 'Count']

# One bar per month; bar height is the number of recorded incidents.
month_bars = go.Bar(
    x=month['Month'],
    y=month['Count'],
    opacity=0.8,
)
data = [month_bars]

py.iplot(data)

Criminal activity by month for each year (2003–2014)

In [None]:
# One line per year showing the number of incidents in each month.
# range() excludes its end value, so years 2003 through 2014 are plotted and
# 2015 is left out.
data = []
for i in range(2003, 2015):
    year = train[train['Year'] == i]
    # Sort by the index (the month number) before reset_index() so the code does
    # not rely on the auto-generated column being called 'index'; pandas >= 2.0
    # names it 'Month' instead, which makes sort_values(by='index') fail.
    year_count = year['Month'].value_counts().sort_index().reset_index()
    year_count.columns = ['Month','Count']
    points = go.Scatter(
        x = year_count.Month,
        y = year_count.Count,
        name = str(i)  # legend label; passed as a string rather than an int
    )
    data.append(points)

py.iplot(data)

Crime distribution across districts over the years

In [None]:
# One line per police district showing the number of incidents in each year.
data = []
# Districts in descending order of total incidents (value_counts order).
val = train['PdDistrict'].value_counts().reset_index()
val.columns = ['District','Count']
x = val.District

for i in x:
    district = train[train['PdDistrict']==i]
    # Sort by the index (the year) before reset_index() so the code does not
    # rely on the auto-generated column being called 'index'; pandas >= 2.0
    # names it 'Year' instead, which makes sort_values(by='index') fail.
    year_count = district['Year'].value_counts().sort_index().reset_index()
    year_count.columns = ['Year','Count']
    points = go.Scatter(
        x = year_count.Year,
        y = year_count.Count,
        name = i
    )
    data.append(points)

py.iplot(data)

Districts with most crime

In [None]:
# Number of incidents recorded in each police district, largest count first.
train.PdDistrict.value_counts()

SOUTHERN      118141
MISSION        89792
NORTHERN       78701
BAYVIEW        67063
CENTRAL        64206
TENDERLOIN     61365
INGLESIDE      59174
TARAVAL        49204
PARK           36859
RICHMOND       34031
Name: PdDistrict, dtype: int64

In [None]:
# Districts ranked by number of incidents; keep at most the ten busiest.
desc_value_counts = train.PdDistrict.value_counts().sort_values(ascending=False)
common_districts = desc_value_counts.reset_index().head(10)
common_districts.columns = ['District', 'Count']

# Horizontal bars: district on the y-axis, incident count on the x-axis.
district_bars = go.Bar(
    x=common_districts['Count'],
    y=common_districts['District'],
    orientation='h',
    opacity=0.8,
)
data = [district_bars]

py.iplot(data)