In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import requests
from io import StringIO
import matplotlib.pyplot as plt 

In [None]:
#Loading dataset into Jupyter environment - a security warning will appear. You can ignore it.
# NOTE(review): verify=False turns off TLS certificate verification, which is what
# triggers the warning. It is kept because the original does it on purpose; confirm
# that the host still needs it.
url="https://gitlab.gitlab.svc.cent-su.org/ccaicedo/652public/-/raw/master/crimes_2018.csv"
response = requests.get(url, verify=False, timeout=60)  # timeout so a dead host cannot hang the notebook
# Stop here on an HTTP error instead of handing an error page to pd.read_csv later.
response.raise_for_status()
csvdata = response.text

In [None]:
# Build the crimes DataFrame from the downloaded CSV text. The first column is
# parsed as dates and becomes the index.
crimes = pd.read_csv(
    StringIO(csvdata),
    index_col=[0],
    parse_dates=[0],
)

In [None]:
# Preview the first rows of the crimes DataFrame.
crimes.head()

In [None]:
# Summarize the index, column dtypes and non-null counts of the crimes DataFrame.
crimes.info()

In [None]:
# Count the crimes recorded on each calendar day, in chronological order.
daily_crimes = (
    crimes['Primary Type']
    .resample('D')   # one bin per day
    .count()         # non-null entries in each bin
    .sort_index()
)

In [None]:
#Dataset contains the crime data from January 2018 to February 2021
# Show the first daily counts; call daily_crimes.tail() instead to inspect the end of the range.
daily_crimes.head()

In [None]:
# Keep only the 2018 portion of the daily counts and plot it.
daily_crimes2018 = daily_crimes.loc['2018']
daily_crimes2018.plot()

Let's bring in the 2018 daily weather (Temperature) data

In [None]:
# Read the 2018 weather CSV. The column at position 5 becomes the index and
# pandas is asked to parse it as dates.
weather_file_path = 'chicago_2018_weather.csv'
weather2018 = pd.read_csv(
    weather_file_path,
    parse_dates=True,
    index_col=5,
)
weather2018.head()

In [None]:
#Exploring the weather data.... what's wrong ??
# info() lists the index range, the dtype of every column and its non-null count.
weather2018.info()

In [None]:
# The data has no "Average" temperature column, so use the midpoint between the
# daily maximum and minimum as the representative ("median") temperature.
medianTemp = weather2018['TMAX'].add(weather2018['TMIN']).div(2)
medianTemp.head()

In [None]:
# Store the computed series on the frame as the 'medianT' column
# (item assignment is one of several ways to add a column).
weather2018['medianT'] = medianTemp
weather2018.head()

In [None]:
# Number of missing values in each column.
weather2018.isnull().sum()

We need to handle missing values AND missing items in the time series (361 of 365 days reported)

In [None]:
#Resample data so ALL days of the year are listed. Use ffill to fill the information for missing days.
# Days absent from the index receive a copy of the most recent earlier row. NaN values
# inside rows that already existed are not filled by this step and are handled separately below.
weather2018=weather2018.resample('D').ffill()

In [None]:
# Re-inspect the frame: the entry count shows how many days the index now holds.
weather2018.info()

In [None]:
# Show the rows whose TMIN value is missing.
weather2018.loc[weather2018['TMIN'].isna()]

In [None]:
# Forward-fill the gaps in TMIN with the previous day's value.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the attribute-accessed column: that chained in-place
# form may act on a temporary object and leave the DataFrame unchanged, and the
# fillna(method=...) argument is deprecated in favour of ffill().
weather2018['TMIN'] = weather2018['TMIN'].ffill()

In [None]:
# Check the non-null counts again after filling TMIN.
weather2018.info()

In [None]:
# Select the May 2018 rows by passing a date string to plain [] indexing on the DataFrame.
# NOTE(review): this row-selection form is deprecated in pandas 1.x (it emits a
# FutureWarning pointing to .loc) and is reported to raise KeyError on pandas >= 2.0.
# Confirm the pandas version in use.
weather2018['2018-05']

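Let's use `.loc` to get the data record we want, as suggested by the warning message from the previous cell (selecting DataFrame rows with `df['date-string']` is deprecated; row selection by date should go through `.loc`).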

In [None]:
# Look up the record for 2018-05-30 by label through .loc.
weather2018.loc['2018-05-30']

In [None]:
#Filling the missing values
# Forward-fill medianT and assign the result back to the column. Calling
# fillna(..., inplace=True) on the attribute-accessed column is chained in-place
# mutation, which may leave the DataFrame unchanged, and fillna(method=...) is
# deprecated in favour of ffill().
weather2018['medianT'] = weather2018['medianT'].ffill()

In [None]:
# Display the 2018-05-30 record again to see its values after the fill.
weather2018.loc['2018-05-30']

## Plots

In [None]:
# Line plot of the daily medianT values over the year.
weather2018['medianT'].plot()

Let's plot the crime and weather together

In [None]:
# Daily temperature series used for the comparisons with the crime counts.
daily_temp2018 = weather2018['medianT']

In [None]:
# Draw both series on the same axes so they can be compared directly.
shared_ax = daily_temp2018.plot()
daily_crimes2018.plot(ax=shared_ax)
shared_ax.legend()

Better make separate plots (use subplots)

In [None]:
# One figure per series so each one gets its own y-axis scale.
figsize = (8,3)
panels = (
    (daily_temp2018, 'Temperature', 'Median daily temperature (F)'),
    (daily_crimes2018, 'Crimes', 'Crimes count per day'),
)
for series, title, ylabel in panels:
    plt.figure()                               # fresh figure for this series
    series.plot(title=title, figsize=figsize)
    plt.ylabel(ylabel)
plt.show()

Let's smooth the plots by using a "rolling" mean.\
rolling(X).mean() = compute the mean over the last X samples

In [None]:
# Smooth the daily crime counts with a 10-sample rolling mean and plot the result.
daily_crimes2018.rolling(window=10).mean().plot()

Let's build a slider widget to better control the smoothing

In [None]:
from ipywidgets import interact, widgets

@interact(periods=widgets.IntSlider(value=50, min=10, max=100, step=10))
def plot_moving_average(periods=50):
    """Plot the rolling means of the daily temperature and daily crime series.

    periods: window length (number of samples) handed to ``rolling``. The
    slider offers 10 to 100 in steps of 10 and starts at 50.
    """
    size = (8,3)
    smoothed_temp = daily_temp2018.rolling(periods).mean()
    smoothed_crimes = daily_crimes2018.rolling(periods).mean()

    # Temperature in its own figure.
    plt.figure()
    smoothed_temp.plot(title='Temperature', figsize=size)
    plt.ylabel('Average daily temperature')

    # Crime counts in a second figure.
    plt.figure()
    smoothed_crimes.plot(title='Crimes', figsize=size)
    plt.ylabel('Crimes count per day')

    plt.show()

## Computing correlations

In [None]:
# Put both daily series side by side in one frame, aligned on their dates.
series_by_name = {'crimes': daily_crimes2018, 'temperature': daily_temp2018}
combined2018 = pd.DataFrame(series_by_name)
combined2018.head()

Scatter plot

In [None]:
# Scatter of daily crime count against daily temperature.
combined2018.plot.scatter(x='temperature', y='crimes')

There is some correlation. Let's compute it

In [None]:
# Pearson correlation (the pandas default, spelled out here) between the two columns.
combined2018.corr(method='pearson')

Let's look at the outliers

In [None]:
# Days with more than 950 recorded crimes.
combined2018.loc[combined2018['crimes'] > 950]

## Correlation matrix

In [None]:
# Recall the columns available in the full crimes DataFrame before breaking it down by type.
crimes.info()

In [None]:
#Just process the 2018 data
# Partial-string label lookup: .loc['2018'] keeps the rows whose index date falls in 2018.
crimes2018=crimes.loc['2018']

Group the crimes by type and count the occurrences per day, so that later we can reorganize the data into columns (i.e., split the counts into one column per crime type).

In [None]:
# Add the crime type as a second index level next to the date.
crimesB = crimes2018.set_index('Primary Type', append=True)
# Give every record a weight of 1 so that summing a group yields its record count.
crimesB['ocurrences'] = np.ones(len(crimesB), dtype=int)
# Sum the weights for each (date, crime type) pair.
crimes_grouped = crimesB['ocurrences'].groupby(level=[0, 1]).sum()
crimes_grouped

Splitting the data into a column based display (unstacking)

In [None]:
# Move the crime types from the index into columns; missing combinations count as 0.
type_columns = crimes_grouped.unstack(level=1).fillna(0)
# Total each crime type per calendar day.
daily_totals = type_columns.resample('D').sum()
crimes_by_type = daily_totals.fillna(0).astype(int)
crimes_by_type.head()

In [None]:
# Work on a copy. Plain assignment would only alias crimes_by_type, so adding the
# 'temperature' column would also add it to crimes_by_type and distort any
# per-crime-type totals computed from that frame later on.
crimes_detail2018 = crimes_by_type.copy()
# Aligned on the date index: each day gets that day's temperature.
crimes_detail2018['temperature'] = daily_temp2018
crimes_detail2018.tail()

In [None]:
# Pairwise Pearson correlations between all columns (crime types and temperature).
corr = crimes_detail2018.corr(method='pearson')
# Shade the table: the higher the correlation value, the darker the color.
corr.style.background_gradient()

Let's focus on the correlation with temperature

In [None]:
# Keep only the 'temperature' column of the correlation matrix, strongest positive first.
temperature_corr = crimes_detail2018.corr()[['temperature']]
corr = temperature_corr.sort_values(by='temperature', ascending=False)
corr.style.background_gradient()

Let's check on the amount of data for each crime type

In [None]:
# Total occurrences per crime type, most frequent first.
# A 'temperature' column is dropped if this frame carries one, because it is not a
# crime count and would show up as a bogus row in the totals. errors='ignore' makes
# the drop a no-op when the column is absent.
crimes_by_type.drop(columns='temperature', errors='ignore').sum().sort_values(ascending=False)