In [1]:
import pandas as pd
import numpy as np 
import string

In [2]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [3]:
# Apply Matplotlib's 'ggplot' style sheet to the figures created after this call.
plt.style.use('ggplot')

In [4]:
# Load the training data; the first line of the CSV supplies the column names
# (read_csv's default when no explicit names are given).
df = pd.read_csv('train.csv')

In [5]:
# Discard the description text and the two coordinate columns.
columns_to_discard = ['Descript', 'X', 'Y']
df = df.drop(columns=columns_to_discard)

In [6]:
# Split every "Dates" string on whitespace into its two parts.
# expand=True builds the two-column frame directly (no round trip through a
# Python list of lists) and keeps rows with a missing value as NaN instead of
# failing in the DataFrame constructor.
tmp = df['Dates'].str.split(expand=True)
tmp.columns = ['date', 'time']

In [7]:
# Hour = the text before the first ':' of the time part, kept as a string.
# The vectorised .str accessor replaces a row-by-row apply(axis=1), which
# calls a Python lambda once per record.
df['Hour'] = tmp['time'].str.split(':').str[0]

In [8]:
# Remove the raw timestamp column from the working frame.
df = df.drop(columns=['Dates'])

In [9]:
# Categorical version of Category.
# Item assignment is required to add a column: attribute-style assignment
# (df.Cat = ...) only sets a Python attribute on the object and pandas emits a
# UserWarning about it. The column is still readable as df.Cat afterwards.
df['Cat'] = df['Category'].astype('category')

In [10]:
# Map each category label to its position in the categories index.
# enumerate() numbers the categories themselves, rather than building a range
# sized by the number of rows and relying on zip() to truncate it.
cat_dict = {label: code for code, label in enumerate(df.Cat.cat.categories)}

In [11]:
# Categorical version of PdDistrict. Item assignment creates a real column;
# attribute-style assignment (df.dist = ...) would only set a Python attribute
# and triggers a pandas UserWarning.
df['dist'] = df['PdDistrict'].astype('category')
# Map each district label to its position in the categories index; enumerate()
# numbers the categories directly instead of zipping against a row-sized range.
dist_dict = {label: code for code, label in enumerate(df['dist'].cat.categories)}

In [12]:
# Cross-tabulate Category (rows) against PdDistrict (columns); each cell holds
# the count of non-null DayOfWeek values for that pair.
cat_dist = df.pivot_table(
    values="DayOfWeek",
    index="Category",
    columns="PdDistrict",
    aggfunc="count",
)

In [13]:
# Stacked horizontal bars: one bar per category, one coloured segment per district.
cplot = cat_dist.plot(
    kind='barh',
    stacked=True,
    figsize=(30, 15),
    grid=False,
    colormap=cm.rainbow,
)
# savefig() has no `ext` keyword (current Matplotlib rejects unknown keywords),
# so the output format is selected through the file extension instead.
# Saving through the Axes' own figure avoids depending on pyplot's
# "current figure" state.
cplot.get_figure().savefig("stackedBar3.png", transparent=False)

In [14]:
#Import and preparation of data

import zipfile
import matplotlib.pyplot as plt
# The alias must not be `df`: that name holds the crime DataFrame built in the
# cells above, and aliasing the module to it would silently replace the frame.
import datetime as dt
import time

In [15]:
# Read both CSVs straight out of their zip archives, parsing 'Dates' as datetimes.
# The context managers close each archive and each member stream as soon as
# the data has been loaded, instead of leaving the file handles open.
with zipfile.ZipFile('train.csv.zip') as z_train, zipfile.ZipFile('test.csv.zip') as z_test:
    with z_train.open('train.csv') as train_csv:
        train = pd.read_csv(train_csv, parse_dates=['Dates'])
    with z_test.open('test.csv') as test_csv:
        test = pd.read_csv(test_csv, parse_dates=['Dates'])

In [16]:
# Add the day of the year as a "MM-DD" string, e.g. "02-22".
# The vectorised .dt accessor formats the whole column at once instead of
# calling a Python lambda per row, and yields NaN for missing timestamps.
train['DayOfYear'] = train['Dates'].dt.strftime("%m-%d")
test['DayOfYear'] = test['Dates'].dt.strftime("%m-%d")

In [17]:
# Records per MM-DD day in each data set, counted on the non-null X values.
train_days = train.groupby('DayOfYear')['X'].count().to_frame('TrainCount')
test_days = test.groupby('DayOfYear')['X'].count().to_frame('TestCount')

In [18]:
# Align the two per-day counts on their shared DayOfYear index (days present in
# both sets only) and add their sum.
days = train_days.join(test_days, how='inner')
days = days.assign(TotalCount=days["TrainCount"] + days["TestCount"])

In [19]:
days.plot(figsize=(15,10)) 
plt.title("The two peaks per month pattern is entirely explained by splitting the data into train/test sets")
plt.ylabel('Number of crimes')
plt.xlabel('Day of year')
plt.grid(True)

In [20]:
plt.savefig('Distribution_of_Crimes_by_DayofYear.png')

In [21]:
# EDA (exploratory data analysis)

In [22]:
# Open the training archive read-only and print the names of its members.
z = zipfile.ZipFile('train.csv.zip', mode='r')
archive_members = z.namelist()
print(archive_members)

['train.csv']


In [23]:
# Reload the training set from the open archive `z`, parsing 'Dates' as datetimes.
# The with-block closes the member stream once the CSV has been read instead of
# leaving the handle open.
with z.open('train.csv') as train_csv:
    train = pd.read_csv(train_csv, parse_dates=['Dates'])

In [24]:
# Derive calendar features from the parsed timestamps with the vectorised .dt
# accessor instead of calling a Python lambda once per row.
train['Year'] = train['Dates'].dt.year
# ISO week number, matching Timestamp.week. isocalendar() returns a nullable
# UInt32 column; cast to a plain integer dtype for grouping and plotting
# (assumes 'Dates' has no missing values — TODO confirm).
train['Week'] = train['Dates'].dt.isocalendar().week.astype('int64')
train['Hour'] = train['Dates'].dt.hour


In [25]:
# Plain-text preview of the first five rows of the training frame.
print(train.head(n=5))

                Dates        Category                      Descript  \
0 2015-05-13 23:53:00        WARRANTS                WARRANT ARREST   
1 2015-05-13 23:53:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
2 2015-05-13 23:33:00  OTHER OFFENSES      TRAFFIC VIOLATION ARREST   
3 2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   
4 2015-05-13 23:30:00   LARCENY/THEFT  GRAND THEFT FROM LOCKED AUTO   

   DayOfWeek PdDistrict      Resolution                    Address  \
0  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
1  Wednesday   NORTHERN  ARREST, BOOKED         OAK ST / LAGUNA ST   
2  Wednesday   NORTHERN  ARREST, BOOKED  VANNESS AV / GREENWICH ST   
3  Wednesday   NORTHERN            NONE   1500 Block of LOMBARD ST   
4  Wednesday       PARK            NONE  100 Block of BRODERICK ST   

            X          Y  Year  Week  Hour  
0 -122.425892  37.774599  2015    20    23  
1 -122.425892  37.774599  2015    20    23  
2 -122.424363  37

In [26]:
# Bar chart of the number of records per police district, most frequent first
# (value_counts sorts in descending order), written to a PNG.
district_counts = train['PdDistrict'].value_counts()
district_counts.plot(kind='bar', figsize=(8, 10))
plt.savefig('district_counts.png')

In [27]:
# Constant marker column: counting it per group gives the number of records.
train['event'] = 1
weekly_events = train[['Week', 'Year', 'event']].groupby(['Year', 'Week']).count().reset_index()
# One row per week number, one column per year. Weeks with no records for a
# given year take the previous week's value; .ffill() is the supported spelling
# of the deprecated fillna(method='ffill').
weekly_events_years = weekly_events.pivot(index='Week', columns='Year', values='event').ffill()
ax = weekly_events_years.interpolate().plot(title='number of cases every 2 weeks', figsize=(10,6))
plt.savefig('events_every_two_weeks.png')

In [28]:
# Number of records per hour of the day.
hourly_events = train[['Hour','event']].groupby(['Hour']).count().reset_index()
# After reset_index() 'Hour' is an ordinary column, so it must be named as the
# x axis explicitly; otherwise it is drawn as a second bar series next to the counts.
hourly_events.plot(kind='bar', x='Hour', y='event', figsize=(6, 6))
plt.savefig('hourly_events.png')

In [29]:
# Number of records per (district, hour) pair.
hourly_district_events = train[['PdDistrict','Hour','event']].groupby(['PdDistrict','Hour']).count().reset_index()
# One row per hour, one column per district. Hours with no records for a
# district take the previous hour's value; .ffill() is the supported spelling
# of the deprecated fillna(method='ffill').
hourly_district_events_pivot = hourly_district_events.pivot(index='Hour', columns='PdDistrict', values='event').ffill()
hourly_district_events_pivot.interpolate().plot(title='number of cases hourly by district', figsize=(10,6))
plt.savefig('hourly_events_by_district.png')