In [162]:
# Dependencies
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd
import numpy as np
import plotly.express as px
import json
import datetime
import os
import random as ran

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
cwd = os.getcwd()
# load the weather data from a json
with open(cwd + "/mike_analytics/LA_weather_data.json", 'r') as myfile:
    data=myfile.read()

# parse file
data_json = json.loads(data)
print(data_json[60]['dt_iso'])

2012-10-04 01:00:00 +0000 UTC


In [118]:
# create a data frame
clouds = []
temp = []
dt_iso = []

# loop through hours of weather data and get interesting 
for hour in data_json:
    clouds.append(hour['clouds']['all'])
    temp.append(hour['main']['temp']-273.14)
    
    # convert date and time stamp to pandas datetime
    dt_iso.append(pd.to_datetime(hour['dt_iso'][0:19], infer_datetime_format=True, utc=True))
    
# create dictionary of data
weather_dict = { 
    "Time [UTC]": dt_iso,
    "Cloud Cover [%]": clouds,
    "Temperature [°C]": temp
}

In [122]:
# create dataframe
weather_df = pd.DataFrame(weather_dict)
weather_df.head()

Unnamed: 0,Time [UTC],Cloud Cover [%],Temperature [°C]
0,2012-10-01 13:00:00+00:00,8,18.73
1,2012-10-01 14:00:00+00:00,8,18.724813
2,2012-10-01 15:00:00+00:00,8,18.719594
3,2012-10-01 16:00:00+00:00,8,18.714406
4,2012-10-01 17:00:00+00:00,8,18.709219


In [121]:
# plot temperature
fig = px.line(weather_df, x="Time [UTC]", y="Temperature [°C]")
fig.show()

In [127]:
weather_df = weather_df.set_index("Time [UTC]")
weather_df.head()

Unnamed: 0_level_0,Cloud Cover [%],Temperature [°C]
Time [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1
2012-10-01 13:00:00+00:00,8,18.73
2012-10-01 14:00:00+00:00,8,18.724813
2012-10-01 15:00:00+00:00,8,18.719594
2012-10-01 16:00:00+00:00,8,18.714406
2012-10-01 17:00:00+00:00,8,18.709219


In [128]:
# resample as daily average temperature
daily_avg_df = weather_df.resample('d').mean()
daily_avg_df.head()

Unnamed: 0_level_0,Cloud Cover [%],Temperature [°C]
Time [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1
2012-10-01 00:00:00+00:00,8.0,18.704006
2012-10-02 00:00:00+00:00,6.083333,22.749839
2012-10-03 00:00:00+00:00,1.708333,25.868542
2012-10-04 00:00:00+00:00,55.666667,22.857917
2012-10-05 00:00:00+00:00,38.875,19.808333


In [129]:
# plot temperature
fig = px.line(daily_avg_df, y="Temperature [°C]")
fig.show()

# Generate Random Crime Data

In [148]:
# initialize variables
domestic_v = []
robbery = []
arrests = []

# create arrays that are the same length as weather df
for temp in daily_avg_df["Temperature [°C]"]:
    robbery.append(ran.randint(0,100))
    domestic_v.append(ran.randint(0,100))
    arrests.append(ran.randint(0,100))

# create dictionary of data
alldata_dict = { 
    "Time [UTC]": daily_avg_df.index,
    "Cloud Cover [%]": daily_avg_df["Cloud Cover [%]"],
    "Temperature [°C]": daily_avg_df["Temperature [°C]"],
    "Domestic Violence": domestic_v,
    "Robbery": robbery,
    "Arrests": arrests
}

alldata_df = pd.DataFrame(alldata_dict)
alldata_df = alldata_df.set_index("Time [UTC]")
alldata_df.head()

Unnamed: 0_level_0,Cloud Cover [%],Temperature [°C],Domestic Violence,Robbery,Arrests
Time [UTC],Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-10-01 00:00:00+00:00,8.0,18.704006,69,5,9
2012-10-02 00:00:00+00:00,6.083333,22.749839,41,9,29
2012-10-03 00:00:00+00:00,1.708333,25.868542,34,12,100
2012-10-04 00:00:00+00:00,55.666667,22.857917,9,96,86
2012-10-05 00:00:00+00:00,38.875,19.808333,14,17,91


In [165]:
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces
fig.add_trace(
    go.Scatter(x=alldata_df.index, y=alldata_df['Temperature [°C]'], name="Temperature [°C]"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=alldata_df.index, y=alldata_df['Domestic Violence'], name="Domestic Violence"),
    secondary_y=True,
)

# Add figure title
fig.update_layout(
    title_text="TBD"
)

# Set x-axis title
fig.update_xaxes(title_text="Time [UTC]")

# Set y-axes titles
fig.update_yaxes(title_text="<b>primary</b> Temperature [°C]", secondary_y=False)
fig.update_yaxes(title_text="<b>secondary</b> Domestic Violence", secondary_y=True)

fig.show()

# Statistical Analysis

In [194]:
# sample some data
samples = 250

# generate the range of possible rows
row_opts = list(np.arange(0,len(alldata_df),1)) 
row_sample = random.sample(row_opts, samples)

# sample from the dataset
sample1 = alldata_df.iloc[row_sample,:]

# generate the range of possible rows
row_opts = list(np.arange(0,len(alldata_df),1)) 
row_sample = random.sample(row_opts, samples)

# sample from the dataset
sample2 = alldata_df.iloc[row_sample,:]

# plot temperature vs crime
fig = px.scatter(sample1, x="Temperature [°C]", y="Arrests")
fig.show()

In [197]:
stats.ttest_ind(sample1["Temperature [°C]"], sample1["Arrests"], equal_var=False)

Ttest_indResult(statistic=-16.19786768454455, pvalue=2.39007450684557e-41)

Unnamed: 0,Cloud Cover [%],Temperature [°C]
2012-12-20 04:00:00+00:00,1,10.42
2012-12-17 08:00:00+00:00,90,12.22
2012-11-11 11:00:00+00:00,1,8.31
2012-11-18 18:00:00+00:00,90,16.95
2012-11-24 05:00:00+00:00,1,14.27
...,...,...
2013-01-07 21:00:00+00:00,75,13.38
2012-10-16 02:00:00+00:00,1,24.33
2012-12-11 23:00:00+00:00,1,20.62
2012-12-12 01:00:00+00:00,1,17.77
