In [153]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

#### Everything from adam_simple_prediction 

In [210]:
crime_chicago = pd.read_pickle('../crime_chicago_with_timestamp.pkl')

In [231]:
crime_chicago.index = pd.DatetimeIndex(crime_chicago.datetime)

In [232]:
min_date = crime_chicago.index.min()
max_date = crime_chicago.index.max()
nobs = len(crime_chicago.index.unique())

In [233]:
min_date

Timestamp('2001-01-01 00:00:00')

In [234]:
weather_chicago = pd.read_csv('../WeatherChicago20012016.csv')

def weather_date_to_datetime(date):
    return pd.datetime(int(date[0:4]), int(date[4:6]), int(date[6:]))

weather_chicago['DATE'] = weather_chicago['DATE'].map(lambda x: weather_date_to_datetime(str(x)))
weather_chicago.index = pd.DatetimeIndex(weather_chicago['DATE'])
tmin_mean = weather_chicago['TMIN'].mean()
tmax_mean = weather_chicago['TMAX'].mean()

weather_chicago.loc[weather_chicago['TMIN'] == -9999, ['TMIN']] = tmin_mean
weather_chicago.loc[weather_chicago['TMAX'] == -9999, ['TMAX']] = tmax_mean
weather_chicago.loc[weather_chicago['PRCP'] == -9999, ['PRCP']] = 0
weather_chicago.loc[weather_chicago['AWND'] == -9999, ['AWND']] = 0

In [235]:
number_of_bins = 2
no = number_of_bins

lat_min = crime_chicago['Latitude'].min()
lon_min = crime_chicago['Longitude'].min()

lat_span = crime_chicago['Latitude'].max() - crime_chicago['Latitude'].min()
lon_span = crime_chicago['Longitude'].max() - crime_chicago['Longitude'].min()

lat_step = lat_span / no
lon_step = lon_span / no

crime_chicago['bin'] = \
    no*np.floor((crime_chicago['Longitude']-lon_min)/lon_step) + \
       np.floor((crime_chicago['Latitude'] -lat_min)/lat_step)

In [236]:
def getPolygonForBin(bin_no):
    lat, lon = getCornerLatLonForBin(bin_no)
    return Polygon([
            (lat,            lon),
            (lat,            lon + lon_step),
            (lat + lat_step, lon + lon_step),
            (lat + lat_step, lon),
        ])
def getPolygonForBinReverse(bin_no):
    lat, lon = getCornerLatLonForBin(bin_no)
    return Polygon([
            (lon,            lat),
            (lon + lon_step, lat),
            (lon + lon_step, lat + lat_step),
            (lon,            lat + lat_step),
        ])

# THEFTS ONLY

In [237]:
# crime_chicago_copy = crime_chicago.copy()

In [238]:
# crime_chicago = crime_chicago_copy.copy()

In [239]:
# crime_chicago['Primary Type'].unique()

In [240]:
# crime_chicago = crime_chicago[crime_chicago['Primary Type'] == 'THEFT']

#### END OF CHOOSING CRIME TYPE

In [241]:
crime_chicago_count_ref = crime_chicago[['datetime','bin']].groupby(['datetime','bin']).size().reset_index().rename(columns={0:'count'})

In [242]:
def get_counts_by_date_for_bin(bin_no):
    crime_chicago_count_ref_temp = crime_chicago_count_ref.copy()
    crime_chicago_count_ref_temp.index = pd.DatetimeIndex(crime_chicago_count_ref['datetime'])
    crime_chicago_count_ref_temp = crime_chicago_count_ref_temp[crime_chicago_count_ref_temp['bin'] == bin_no]
    crime_chicago_count_ref_temp = crime_chicago_count_ref_temp.drop(['bin', 'datetime'], axis=1)
    crime_chicago_count_ref_temp = crime_chicago_count_ref_temp.reindex(pd.date_range(min_date.strftime('%Y-%m-%d'), periods=nobs, freq='D'))
    crime_chicago_count_ref_temp = crime_chicago_count_ref_temp.fillna(0)
    return crime_chicago_count_ref_temp

## Prediction

In [243]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.stats.stats import pearsonr

In [244]:
def add_dummies(temp):
    temp['weekday'] = temp.index.weekday
    temp['yearday'] = temp.index.dayofyear

    weekday_dummies = pd.get_dummies(temp['weekday'], prefix='weekday')
    weekday_dummies.index = temp.index

    yearday_dummies = pd.get_dummies(temp['yearday'], prefix='yearday')
    yearday_dummies.index = temp.index

    temp = temp.join(weekday_dummies).join(yearday_dummies)
    
    return temp

In [245]:
def append_weather(bin_crime_data):
    bin_crime_data_weather = bin_crime_data.join(weather_chicago[['PRCP', 'TMAX','AWND']])
    indeces = pd.isnull(bin_crime_data_weather).any(1)
    bin_crime_data_weather.drop(bin_crime_data_weather.index[indeces], inplace=True)
    return bin_crime_data_weather

In [246]:
def fit_sm_predict_row(endog, exog, predict_row):
    res_temp = sm.GLM(endog, exog, family=sm.families.Poisson(link=sm.families.links.log)).fit()

    #crime_chicago_count_weather_weekdays.iloc[1,:]
    prediction = res_temp.predict(predict_row)
    
    return prediction

In [258]:
def predict_for_bin(bin_crime_data, next_day_row):
    bin_crime_data = append_weather(bin_crime_data)
    
    bin_crime_data = add_dummies(bin_crime_data)
    
    #TODO change
    endog = bin_crime_data.loc[:, ['count']]
    exog = bin_crime_data.loc[:, 'PRCP':]
    exog = exog.drop(['AWND','weekday','yearday'], axis=1)
    
    prediction = fit_sm_predict_row(endog, exog, next_day_row)
    
    return prediction[0]

### Putting it all together

In [252]:
def generate_nextday(temp, prcp):
    columns = ['TMAX','PRCP']
    for i in range(7):
        columns.append('weekday_' + str(i))
    for i in range(1, 367):
        columns.append('yearday_' + str(i))
    next_day_row = pd.DataFrame(columns = columns)
    next_day_row.loc[0,:] = np.zeros(len(next_day_row.columns))
    next_day_row.loc[0,'TMAX'] = 1
    next_day_row.loc[0,'PRCP'] = 2

    tomorrow = (datetime.date.today() + datetime.timedelta(days=1))
    next_day_row.loc[0,'weekday_' + str(tomorrow.weekday())] = 1
    day_of_the_year = tomorrow.timetuple().tm_yday
    next_day_row.loc[0,'yearday_' + str(day_of_the_year)] = 1
    
    return next_day_row

In [256]:
def generate_predictions_for_all_bins(temp, prcp):
    df = pd.DataFrame()
    df['bin'] = np.arange(no*no)
    
    df['count'] = df['bin'].map(lambda x: 
        predict_for_bin(
            get_counts_by_date_for_bin(x),
            generate_nextday(temp, prcp)
        )
    )
    
    return df

In [259]:
dfr = generate_predictions_for_all_bins(0,0)
dfr

Unnamed: 0,bin,count
0,0,93.419674
1,1,340.25044
2,2,329.245907
3,3,179.51238


- different crime types
- bins
- timescales
- only shopping centre we're interested in (by hour?)

## 1
- it's provided df with counted crimes
- appends weather
- finds empty records and drops them
- predicts and plots

## 2
- bins crimes within given two corners

## 3
- chooses a crime type or provides all