In [110]:
#imports
import requests
import pandas as pd
import datetime as dt
from google.cloud import bigquery
from google.cloud import storage
from shapely.geometry  import shape, Point
from geopy.geocoders import Nominatim
import numpy as np
import meteostat as ms
import os
import io
import json

In [111]:
# Location of the service-account key file. Set the GCP_SERVICE_ACCOUNT_JSON
# environment variable to run this notebook on another machine; when it is
# unset or empty, the original local path is used so existing setups keep working.
CREDS = os.environ.get("GCP_SERVICE_ACCOUNT_JSON") or "C:/Users/tkkim/gcp_keys/capstone-team51-366963bafc54.json"
# Single definition of the project id shared by both clients.
GCP_PROJECT = 'capstone-team51'
storage_client = storage.Client.from_service_account_json(json_credentials_path=CREDS, project=GCP_PROJECT)
bq_client = bigquery.Client.from_service_account_json(json_credentials_path=CREDS, project=GCP_PROJECT)


In [112]:
# Shared Nominatim geocoder instance; getpoint() calls geolocator.geocode() on it.
geolocator = Nominatim(user_agent="test_tk")

In [113]:
# Handle to the project's GCS bucket; used further down to fetch the community-area GeoJSON blob.
bucket = storage_client.get_bucket('capstone-team51-data')

In [114]:
def get_commarea_env(df_, commarea):
    """Return a copy of ``df_`` with a ``community_area`` column added.

    Each row's (``longitude``, ``latitude``) pair is tested against the
    geometries in ``commarea['features']``; the ``area_numbe`` property of the
    first feature that contains the point is stored. Rows that fall inside no
    feature get the empty string ``''``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must have ``longitude`` and ``latitude`` columns. Not modified.
    commarea : dict
        GeoJSON-like mapping with a ``features`` list whose items carry
        ``geometry`` and ``properties['area_numbe']``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` with the extra ``community_area`` column.
    """
    # Build every polygon once up front instead of once per (row, feature) pair.
    area_polygons = [
        (shape(feature['geometry']), feature['properties']['area_numbe'])
        for feature in commarea['features']
    ]

    def _area_for(longitude, latitude):
        # Stop at the first containing polygon (same rule as findarea); this is
        # equivalent to scanning all of them when the polygons do not overlap.
        point = Point(longitude, latitude)
        for polygon, area_number in area_polygons:
            if polygon.contains(point):
                return area_number
        return ''

    dataframe = df_.copy()
    # Walk the rows positionally: label-based .loc lookups return a Series (and
    # write to several rows at once) when the index has duplicate labels, which
    # happens for frames concatenated from several chunks.
    dataframe['community_area'] = np.array(
        [_area_for(lon, lat) for lon, lat in zip(dataframe['longitude'], dataframe['latitude'])],
        dtype=object,
    )
    return dataframe

In [115]:
def getpoint(address):
    """Geocode ``address`` and return it as a shapely ``Point(longitude, latitude)``.

    Raises
    ------
    ValueError
        If the geocoder finds no match. ``geolocator.geocode`` returns ``None``
        in that case, which previously surfaced as an opaque AttributeError.
    """
    loc = geolocator.geocode(address)
    if loc is None:
        raise ValueError(f"Could not geocode address: {address!r}")
    # shapely points are (x, y), i.e. (longitude, latitude).
    return Point(loc.longitude, loc.latitude)

In [116]:
def findarea(point, commarea):
    """Return ``area_numbe`` of the first feature whose geometry contains ``point``.

    Features are tested in the order they appear in ``commarea['features']``.
    Returns ``None`` when no feature contains the point.
    """
    containing_areas = (
        feature['properties']['area_numbe']
        for feature in commarea['features']
        if shape(feature['geometry']).contains(point)
    )
    # The generator is lazy, so evaluation stops at the first hit.
    return next(containing_areas, None)
        

In [117]:
def load_df_gcs(prefix, filename, bucket_name='capstone-team51-data'):
    """Read every matching CSV blob under ``prefix`` into one DataFrame.

    Parameters
    ----------
    prefix : str
        Blob-name prefix passed to ``list_blobs`` (with ``delimiter='/'``).
    filename : str
        Substring a blob's name must contain to be loaded.
    bucket_name : str, optional
        Bucket to list; defaults to the project's data bucket.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching CSVs stacked in listing order with a fresh
        0..n-1 index, or an empty DataFrame when nothing matches.
    """
    frames = [
        pd.read_csv(io.BytesIO(blob.download_as_bytes()))
        for blob in storage_client.list_blobs(bucket_name, prefix=prefix, delimiter='/')
        if filename in blob.name
    ]
    if not frames:
        # pd.concat raises on an empty list; keep the "no match -> empty frame" contract.
        return pd.DataFrame()
    # Concatenate once (growing a frame inside the loop is quadratic) and renumber
    # the rows so chunked inputs do not end up with repeated index labels.
    return pd.concat(frames, ignore_index=True)

In [118]:
# Load each raw input from GCS. Every (prefix, filename fragment) pair below
# lines up, in order, with the name it is unpacked into.
base_df, three11_df, env_df, weather_df, offenders_df = (
    load_df_gcs(gcs_prefix, name_fragment)
    for gcs_prefix, name_fragment in (
        ('raw_crimes/', 'chunk'),
        ('raw_311/', 'chunk'),
        ('raw_environmental/', 'chunk'),
        ('raw_weather/', 'data'),
        ('supporting/off_commarea/', 'csv'),
    )
)

In [119]:
# Police station reference table, read through the same loader as the other inputs.
stations_df = load_df_gcs(
    prefix='supporting/',
    filename='Police_Stations_20240120.csv',
)

In [120]:
# Community-area boundaries parsed into a GeoJSON dict. download_as_bytes
# replaces the deprecated download_as_string alias (json.loads accepts bytes)
# and matches the call already used in load_df_gcs.
commarea = json.loads(
    bucket.blob('supporting/geojsons/Boundaries - Community Areas (current).geojson').download_as_bytes()
)

In [121]:
# Tag every environmental complaint with the community area containing its coordinates.
env_df_ca = get_commarea_env(df_=env_df, commarea=commarea)

In [122]:
# Rows that matched no polygon still carry the '' placeholder; replace it with the 9999 sentinel.
no_area_found = env_df_ca['community_area'] == ''
env_df_ca['community_area'] = np.where(
    no_area_found,
    9999,
    env_df_ca['community_area'],
)

In [123]:
# Inspect the column set after the community_area lookup has been added.
env_df_ca.columns

Index(['complaint_id', 'complaint_type', 'address', 'street_number',
       'direction', 'street_name', 'street_type', 'inspector',
       'complaint_date', 'inspection_log', 'data_source', 'modified_date',
       'latitude', 'longitude', 'location.type', 'location.coordinates',
       'complaint_detail', 'community_area'],
      dtype='object')

In [187]:
# dph env complaints

# Truncate complaint timestamps to calendar dates *before* aggregating so that
# each (date, community_area) pair produces exactly one row. Grouping on the
# raw value and truncating afterwards can leave duplicate keys, which would
# multiply rows when this table is left-merged onto big_df.
env_complaints_dated = env_df_ca.assign(
    date=pd.to_datetime(env_df_ca['complaint_date']).dt.date
)
grouped_complaints_by_commarea = (
    env_complaints_dated
    .groupby(['date', 'community_area'])
    .agg(total_complaints_in_ca=('complaint_id', 'count'))
    .reset_index()
)
# Cast to int so the merge key type lines up with big_df's integer community_area.
grouped_complaints_by_commarea['community_area'] = grouped_complaints_by_commarea['community_area'].astype(int)

In [188]:
# Check the column dtypes of the aggregated complaints table before merging.
grouped_complaints_by_commarea.dtypes

In [189]:
# 311 complaints

# Truncate created_date to a calendar date before counting so every
# (date, community_area) pair ends up as a single pivoted row. Truncating
# after the pivot can leave several rows per key and duplicate rows in the
# later left merge onto big_df.
three11_dated = three11_df.assign(
    created_date=pd.to_datetime(three11_df['created_date']).dt.date
)
grouped_date = (
    three11_dated
    .groupby(['created_date', 'community_area', 'sr_type'])
    .agg(total_count=('sr_number', 'count'))
    .reset_index()
)
# One column per service-request type; combinations with no requests become 0.
pivoted = (
    grouped_date
    .pivot_table(index=['created_date', 'community_area'], columns='sr_type', values='total_count', aggfunc='sum')
    .fillna(0)
    .reset_index()
    .rename(columns={'created_date': 'date'})
)
pivoted['community_area'] = pivoted['community_area'].astype(int)

In [216]:
# merge offenders in
# DataFrame.merge returns a new frame, so base_df itself is left untouched.
big_df = base_df.merge(offenders_df, how='left', on='community_area')
big_df = big_df.assign(
    date=pd.to_datetime(big_df['date']).dt.date,
    # Missing community areas get the 9999 sentinel so the column can be int64.
    community_area=big_df['community_area'].fillna(9999).astype('int64'),
)

In [217]:
# merge weather in
# Align the weather table's key with big_df: rename time -> date and reduce
# both sides to plain calendar dates before joining.
weather_df = weather_df.rename(columns={'time': 'date'})
weather_df = weather_df.assign(date=pd.to_datetime(weather_df['date']).dt.date)
big_df = big_df.assign(date=pd.to_datetime(big_df['date']).dt.date)
big_df = pd.merge(big_df, weather_df, how='left', on='date')

In [199]:
# merge dph env complaints
# Left join keeps every big_df row; dates/areas with no complaints get NaN counts.
big_df = pd.merge(
    big_df,
    grouped_complaints_by_commarea,
    how='left',
    on=['date', 'community_area'],
)

In [201]:
# merge 311 request counts (one column per sr_type) on the same date / community_area key
big_df = pd.merge(
    big_df,
    pivoted,
    how='left',
    on=['date', 'community_area'],
)