## Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
from google.cloud import bigquery
from google.oauth2 import service_account
from tqdm.notebook import tqdm_notebook
import time

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

------------------------------

## NYC Open Data Variables

In [2]:
# setup the host name for the API endpoint (the https:// part will be added automatically)
# only need to change this if you are not using NYC Open Data
data_url = 'data.cityofnewyork.us'

In [3]:
# setup the data set at the API endpoint (NYC Ferry Ridership data in this case)
# For example: https://data.cityofnewyork.us/resource/t5n6-gx8c.json
# would give us 'erm2-nwe9'
data_set1 = 'h9gi-nx95'
data_set2 = 'f55k-p6yu'

In [4]:
# Setup your App Token, which you created in Week 6
# You can find your app token by logging into: 
# https://data.cityofnewyork.us/profile/edit/developer_settings
#app_token = 'your app token here'
app_token = ''

In [5]:
# run this cell to setup your Socrata client that connects python to NYC Open Data

# create the client that points to the API endpoint
nyc_open_data_client = Socrata(data_url, app_token, timeout = 200)
print(f"nyc open data client name is: {nyc_open_data_client}")
print(f"nyc open data client data type is: {type(nyc_open_data_client)}")

nyc open data client name is: <sodapy.socrata.Socrata object at 0x0000028A6D928730>
nyc open data client data type is: <class 'sodapy.socrata.Socrata'>


------------------------------

## Data Extraction

In [6]:
# Get the total number of records in our the entire data set

#data_set1
total_record_count = nyc_open_data_client.get(data_set1, select = "COUNT(*)")
print(f"total records in data_set1, {data_set1}: {total_record_count[0]['COUNT']}")

#data_set2
total_record_count2 = nyc_open_data_client.get(data_set2, select = "COUNT(*)")
print(f"total records in data_set2, {data_set2}: {total_record_count2[0]['COUNT']}")

total records in data_set1, h9gi-nx95: 1941015
total records in data_set2, f55k-p6yu: 4848694


In [7]:
# Now, loop through target data set to pull all rows in chunks (we cannot pull all rows at once)
# AGAIN, UPDATE WHERE FILTER INSIDE BELOW FUNCTION

def extract_socrata_data(data_set,
                         # max. limit for version 2.0 API endpoints is 50000
                         chunk_size = 50000, 
                         where = None):
    
    # measure time this function takes
    import time
    start_time1 = time.time()
    
    # get total number or records
    if where == None:
        total_records = int(nyc_open_data_client.get(data_set,
                                                     select= "COUNT(*)")[0]["COUNT"])
    else:
        total_records = int(nyc_open_data_client.get(data_set,
                                                     where = where,
                                                     select= "COUNT(*)")[0]["COUNT"])
    
    # start at 0, empty list for results
    start = 0                   
    results = []   
 
    # for progress bar
    pbar = tqdm_notebook(desc = 'while loop', total = total_records)

    while True:

        if where == None:
            # fetch the set of records starting at 'start'
            results.extend(nyc_open_data_client.get(data_set,
                                                    offset = start,
                                                    limit = chunk_size))
            
        elif where != None:
            results.extend(nyc_open_data_client.get(data_set,
                                                    where = where,
                                                    offset = start,
                                                    limit = chunk_size))
        # update the starting record number
        start = start + chunk_size
        
        # Update progress bar
        print(start, end ='\r')
        pbar.update(chunk_size)

        # if we have fetched all of the records (we have reached total_records), exit loop
        if (start > total_records):
        #if (start > 50000):
            print("Loop completed")
            #close progress bar
            pbar.close()
            break

    # convert the list into a pandas data frame
    end_time1 = time.time()
    print(f"Loop took {round(end_time1 - start_time1, 1)} seconds")
    
    start_time2 = time.time()
    
    data = pd.DataFrame.from_records(results)

    end_time2 = time.time()
    print(f"Transforming to pandas.DataFrame took {round(end_time2 - start_time2, 1)} seconds")

    print(f"The shape of your dataframe is: {data.shape}")
    return data

In [8]:
data1 = extract_socrata_data(chunk_size = 50000,
                             data_set = data_set1)

while loop:   0%|          | 0/1941015 [00:00<?, ?it/s]

Loop completed
Loop took 140.1 seconds
Transforming to pandas.DataFrame took 4.7 seconds
The shape of your dataframe is: (1941015, 29)


In [9]:
data2 = extract_socrata_data(data_set = data_set2,
                             chunk_size = 50000)

while loop:   0%|          | 0/4848694 [00:00<?, ?it/s]

Loop completed
Loop took 303.8 seconds
Transforming to pandas.DataFrame took 9.0 seconds
The shape of your dataframe is: (4848694, 21)


------------------------------

## Data Profiling

In [10]:
data1.info(verbose=True)
print("\n")
data2.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1941015 entries, 0 to 1941014
Data columns (total 29 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date                     object
 1   crash_time                     object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code              

In [11]:
# create and run a function to ceate data profiling dataframe

def create_data_profiling_df(data):
    
    # create an empty dataframe to gather information about each column
    data_profiling_df = pd.DataFrame(columns = ["column_name",
                                                "column_type",
                                                "unique_values",
                                                "duplicate_values",
                                                "null_values",
                                                "non_null_values"])

    # loop through each column to add rows to the data_profiling_df dataframe
    for column in data.columns:

        info_dict = {}

        try:
            info_dict["column_name"] = column
            info_dict["column_type"] = data[column].dtypes
            info_dict["unique_values"] = len(data[column].unique())
            info_dict["duplicate_values"] = data[column].count() - len(data[column].dropna().unique())
            info_dict["null_values"] = data[column].isna().sum()
            info_dict["non_null_values"] = data[column].count()
            info_dict["null_%"] = round(data[column].isna().sum() / len(data), 3)*100

        except:
            print(f"unable to read column: {column}, you may want to drop this column")

        data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)

    data_profiling_df=data_profiling_df[["column_name", "column_type", "null_values", "null_%", 
                                         "non_null_values", "unique_values", "duplicate_values"]]
                                         
    data_profiling_df.sort_values(by = ["null_values"],
                                  ascending = [False],
                                  inplace=True)
    
    return data_profiling_df

In [12]:
start_time = time.time()
profiling_data1 = create_data_profiling_df(data = data1)
profiling_data2 = create_data_profiling_df(data = data2)
end_time = time.time()

display(profiling_data1)
display(profiling_data2)
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

unable to read column: location, you may want to drop this column


Unnamed: 0,column_name,column_type,null_values,null_%,non_null_values,unique_values,duplicate_values
28,vehicle_type_code_5,object,1933165.0,99.6,7850.0,64.0,7787.0
27,contributing_factor_vehicle_5,object,1932935.0,99.6,8080.0,30.0,8051.0
26,vehicle_type_code_4,object,1911814.0,98.5,29201.0,91.0,29111.0
25,contributing_factor_vehicle_4,object,1910846.0,98.4,30169.0,40.0,30130.0
24,vehicle_type_code_3,object,1809619.0,93.2,131396.0,230.0,131167.0
23,contributing_factor_vehicle_3,object,1805140.0,93.0,135875.0,52.0,135824.0
22,cross_street_name,object,1629723.0,84.0,311292.0,200429.0,110864.0
3,off_street_name,object,715266.0,36.9,1225749.0,19692.0,1206058.0
18,zip_code,object,602681.0,31.0,1338334.0,234.0,1338101.0
17,borough,object,602447.0,31.0,1338568.0,6.0,1338563.0


Unnamed: 0,column_name,column_type,null_values,null_%,non_null_values,unique_values,duplicate_values
20,contributing_factor_2,object,4779014,98.6,69680,51,69630
19,contributing_factor_1,object,4778919,98.6,69775,54,69722
18,ped_action,object,4777742,98.5,70952,17,70936
17,ped_location,object,4777641,98.5,71053,5,71049
11,ejection,object,2366135,48.8,2482559,7,2482553
15,safety_equipment,object,2365860,48.8,2482834,19,2482816
14,position_in_vehicle,object,2365788,48.8,2482906,12,2482895
12,emotional_status,object,2299333,47.4,2549361,9,2549353
13,bodily_injury,object,2299290,47.4,2549404,15,2549390
16,complaint,object,2299283,47.4,2549411,22,2549390




function took 104.1 seconds


------------------------------

## Data Merging

In [13]:
start_time = time.time()
overall_df = pd.merge(data1, data2, on="collision_id")
end_time = time.time()

display(overall_df)

print("\n")
overall_df.info()
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

Unnamed: 0,crash_date_x,crash_time_x,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,ejection,emotional_status,bodily_injury,position_in_vehicle,safety_equipment,complaint,ped_location,ped_action,contributing_factor_1,contributing_factor_2
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,Not Ejected,Conscious,Back,Driver,Lap Belt,Complaint of Pain or Nausea,,,,
1,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
2,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
3,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,Not Ejected,Conscious,Back,"Front passenger, if two or more persons, inclu...",Lap Belt,Complaint of Pain or Nausea,,,,
4,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,Not Ejected,Conscious,Shoulder - Upper Arm,Driver,Lap Belt & Harness,Complaint of Pain or Nausea,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4848656,2022-10-29T00:00:00.000,5:00,VANWYCK EXPRESSWAY,89 AVENUE,0,0,0,0,0,0,...,,,,,,,,,,
4848657,2022-10-29T00:00:00.000,5:00,VANWYCK EXPRESSWAY,89 AVENUE,0,0,0,0,0,0,...,Not Ejected,Does Not Apply,Does Not Apply,Driver,Lap Belt,Does Not Apply,,,,
4848658,2022-10-29T00:00:00.000,5:00,VANWYCK EXPRESSWAY,89 AVENUE,0,0,0,0,0,0,...,,,,,,,,,,
4848659,2022-10-29T00:00:00.000,5:00,VANWYCK EXPRESSWAY,89 AVENUE,0,0,0,0,0,0,...,Not Ejected,Does Not Apply,Does Not Apply,Driver,Lap Belt,Does Not Apply,,,,




<class 'pandas.core.frame.DataFrame'>
Int64Index: 4848661 entries, 0 to 4848660
Data columns (total 49 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date_x                   object
 1   crash_time_x                   object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code            

------------------------------

## Data Cleaning

In [14]:
start_time = time.time()

overall_df.drop(["crash_date_y", "crash_time_y", "location"], axis = 1, inplace=True)
overall_df.rename(columns = {'crash_date_x':'crash_date',
                             'crash_time_x':'crash_time'}, inplace = True)

overall_df["crash_date"] = pd.to_datetime(data1["crash_date"]).dt.date
overall_df["crash_time"] = pd.to_datetime(data1["crash_time"], format="%H:%M").dt.time

end_time = time.time()

overall_df.info()
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4848661 entries, 0 to 4848660
Data columns (total 46 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date                     object
 1   crash_time                     object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code              

In [15]:
# dropping null values
overall_df = overall_df.dropna(subset=['borough', 'zip_code', 'longitude', 'latitude',
                                       'person_sex', 'person_age', 'crash_date', 
                                       'crash_time', 'number_of_persons_injured', 
                                       'number_of_persons_killed',
                                       'person_type']).reset_index(drop=True)
overall_df.shape

(965420, 46)

In [16]:
start_time = time.time()
profiling_data = create_data_profiling_df(data = overall_df)
end_time = time.time()

display(profiling_data)
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

Unnamed: 0,column_name,column_type,null_values,null_%,non_null_values,unique_values,duplicate_values
27,vehicle_type_code_5,object,954475,98.9,10945,33,10913
26,contributing_factor_vehicle_5,object,954242,98.8,11178,7,11172
45,contributing_factor_2,object,940616,97.4,24804,49,24756
44,contributing_factor_1,object,940588,97.4,24832,45,24788
43,ped_action,object,940576,97.4,24844,17,24828
42,ped_location,object,940553,97.4,24867,5,24863
25,vehicle_type_code_4,object,933636,96.7,31784,47,31738
24,contributing_factor_vehicle_4,object,932759,96.6,32661,16,32646
23,vehicle_type_code_3,object,861173,89.2,104247,117,104131
22,contributing_factor_vehicle_3,object,857552,88.8,107868,34,107835




function took 30.5 seconds


------------------------------

## Creating Dimensions

### location_dim

In [17]:
location_dim = overall_df[["borough", "zip_code", "longitude", "latitude"]].reset_index(drop=True)
location_dim.insert(0, 'location_id', range(1, 1 + len(location_dim)))

display(location_dim)
print("\n")
location_dim.info()

Unnamed: 0,location_id,borough,zip_code,longitude,latitude
0,1,BROOKLYN,11208,-73.8665,40.667202
1,2,BROOKLYN,11208,-73.8665,40.667202
2,3,BROOKLYN,11233,-73.917274,40.683304
3,4,BRONX,10475,-73.83148,40.86816
4,5,BRONX,10475,-73.83148,40.86816
...,...,...,...,...,...
965415,965416,BRONX,10469,-73.8382340,40.8688130
965416,965417,MANHATTAN,10018,-73.9914800,40.7549500
965417,965418,MANHATTAN,10018,-73.9914800,40.7549500
965418,965419,MANHATTAN,10018,-73.9914800,40.7549500




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965420 entries, 0 to 965419
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   location_id  965420 non-null  int64 
 1   borough      965420 non-null  object
 2   zip_code     965420 non-null  object
 3   longitude    965420 non-null  object
 4   latitude     965420 non-null  object
dtypes: int64(1), object(4)
memory usage: 36.8+ MB


### demographics_dim

In [18]:
demographics_dim = overall_df[["person_sex", "person_age", 
                               "person_type"]].reset_index(drop=True)
demographics_dim.insert(0, 'demographics_id', range(1, 1 + len(demographics_dim)))

display(demographics_dim)
print("\n")
demographics_dim.info()

Unnamed: 0,demographics_id,person_sex,person_age,person_type
0,1,F,28,Occupant
1,2,F,28,Occupant
2,3,M,46,Pedestrian
3,4,F,41,Occupant
4,5,F,41,Occupant
...,...,...,...,...
965415,965416,F,26,Occupant
965416,965417,M,55,Occupant
965417,965418,M,55,Occupant
965418,965419,M,28,Occupant




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965420 entries, 0 to 965419
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   demographics_id  965420 non-null  int64 
 1   person_sex       965420 non-null  object
 2   person_age       965420 non-null  object
 3   person_type      965420 non-null  object
dtypes: int64(1), object(3)
memory usage: 29.5+ MB


### date_dim

In [19]:
start_time = time.time()
date_dim = overall_df[["crash_date"]]
# creating date_id from crash_date
date_dim['date_id'] = overall_df['crash_date'].apply(lambda x: pd.to_datetime(x).strftime("%Y%m%d"))
end_time = time.time()

print(f"function took {round(end_time - start_time, 1)} seconds")

function took 69.4 seconds


In [20]:
date_dim['crash_date']= pd.to_datetime(date_dim['crash_date'])
date_dim['day_of_week'] = date_dim['crash_date'].dt.day_name()
date_dim['month'] = date_dim['crash_date'].dt.month
date_dim['year'] = date_dim['crash_date'].dt.year
date_dim['quarter'] = date_dim['crash_date'].dt.to_period('Q')

# rearranging columns
date_dim = date_dim[["date_id", "crash_date", "day_of_week", "month",
                    "year", "quarter"]]

display(date_dim)
print("\n")
date_dim.info()

Unnamed: 0,date_id,crash_date,day_of_week,month,year,quarter
0,20211213,2021-12-13,Monday,12,2021,2021Q4
1,20211214,2021-12-14,Tuesday,12,2021,2021Q4
2,20211214,2021-12-14,Tuesday,12,2021,2021Q4
3,20211214,2021-12-14,Tuesday,12,2021,2021Q4
4,20211211,2021-12-11,Saturday,12,2021,2021Q4
...,...,...,...,...,...,...
965415,20221029,2022-10-29,Saturday,10,2022,2022Q4
965416,20221029,2022-10-29,Saturday,10,2022,2022Q4
965417,20221029,2022-10-29,Saturday,10,2022,2022Q4
965418,20221023,2022-10-23,Sunday,10,2022,2022Q4




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965420 entries, 0 to 965419
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   date_id      965420 non-null  object        
 1   crash_date   965420 non-null  datetime64[ns]
 2   day_of_week  965420 non-null  object        
 3   month        965420 non-null  int64         
 4   year         965420 non-null  int64         
 5   quarter      965420 non-null  period[Q-DEC] 
dtypes: datetime64[ns](1), int64(2), object(2), period[Q-DEC](1)
memory usage: 44.2+ MB


### time_dim

In [21]:
# needed initial columns
time_dim = overall_df[["crash_date", "crash_time"]]

# making "day" column
time_dim['crash_date']= pd.to_datetime(time_dim['crash_date'])
time_dim['day'] = time_dim['crash_date'].dt.day_name()

# making "hour" and "am_pm_flag" columns 
## first convert to type string
time_dim['crash_time'] = time_dim['crash_time'].astype(str)

time_dim['hour'] = pd.to_datetime(time_dim['crash_time']).dt.hour
time_dim['am_pm_flag'] = np.where(time_dim['hour']<=12, 'AM', 'PM')

# creating id column
time_dim['temp1'] = time_dim['crash_date'].apply(lambda x: x.strftime("%Y%m%d"))

## first convert to type string and use it to help create time_id column
time_dim['crash_date'] = time_dim['crash_date'].astype(str)
time_dim['time_id'] = time_dim[['temp1', 'crash_time']].agg('-'.join, axis=1)

## revert back to datetime.time and dropping unecessary columns
time_dim["crash_time"] = pd.to_datetime(time_dim["crash_time"], format="%H:%M:%S").dt.time
time_dim.drop(["crash_date", "temp1"], axis = 1, inplace=True)

# rearranging columns
time_dim = time_dim[["time_id", "crash_time", "day", "hour", "am_pm_flag"]]

display(time_dim)
print("\n")
time_dim.info()

Unnamed: 0,time_id,crash_time,day,hour,am_pm_flag
0,20211213-00:34:00,00:34:00,Monday,0,AM
1,20211214-16:50:00,16:50:00,Tuesday,16,PM
2,20211214-08:30:00,08:30:00,Tuesday,8,AM
3,20211214-14:30:00,14:30:00,Tuesday,14,PM
4,20211211-04:45:00,04:45:00,Saturday,4,AM
...,...,...,...,...,...
965415,20221029-13:45:00,13:45:00,Saturday,13,PM
965416,20221029-08:00:00,08:00:00,Saturday,8,AM
965417,20221029-14:10:00,14:10:00,Saturday,14,PM
965418,20221023-17:00:00,17:00:00,Sunday,17,PM




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965420 entries, 0 to 965419
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_id     965420 non-null  object
 1   crash_time  965420 non-null  object
 2   day         965420 non-null  object
 3   hour        965420 non-null  int64 
 4   am_pm_flag  965420 non-null  object
dtypes: int64(1), object(4)
memory usage: 36.8+ MB


------------------------------

## Creating Fact Tables

### collision_occurrence

In [22]:
collision_occurrence = overall_df[["crash_date", "crash_time", "number_of_persons_injured",
                                       "number_of_persons_killed"]]

collision_occurrence['date_id'] = collision_occurrence['crash_date'].apply(lambda x: pd.to_datetime(x).strftime("%Y%m%d"))

# creating time_id column
collision_occurrence['crash_time'] = collision_occurrence['crash_time'].astype(str)
collision_occurrence['temp1'] = collision_occurrence['crash_date'].apply(lambda x: pd.to_datetime(x).strftime("%Y%m%d"))

## first convert to type string and use it to help create time_id column
collision_occurrence['crash_date'] = collision_occurrence['crash_date'].astype(str)
collision_occurrence['time_id'] = collision_occurrence[['temp1',
                                                        'crash_time']].agg('-'.join, axis=1)

In [23]:
collision_occurrence.insert(0, 'location_id', range(1, 1 + len(collision_occurrence)))
collision_occurrence.drop("crash_date", axis = 1, inplace = True)

# rearranging columns
collision_occurrence = collision_occurrence[["date_id", "time_id", "location_id",
                                             "number_of_persons_injured",
                                             "number_of_persons_killed"]]

display(collision_occurrence)
print("\n")
collision_occurrence.info()

Unnamed: 0,date_id,time_id,location_id,number_of_persons_injured,number_of_persons_killed
0,20211213,20211213-00:34:00,1,0,0
1,20211214,20211214-16:50:00,2,0,0
2,20211214,20211214-08:30:00,3,0,0
3,20211214,20211214-14:30:00,4,2,0
4,20211211,20211211-04:45:00,5,2,0
...,...,...,...,...,...
965415,20221029,20221029-13:45:00,965416,1,0
965416,20221029,20221029-08:00:00,965417,0,0
965417,20221029,20221029-14:10:00,965418,0,0
965418,20221023,20221023-17:00:00,965419,0,0




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965420 entries, 0 to 965419
Data columns (total 5 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   date_id                    965420 non-null  object
 1   time_id                    965420 non-null  object
 2   location_id                965420 non-null  int64 
 3   number_of_persons_injured  965420 non-null  object
 4   number_of_persons_killed   965420 non-null  object
dtypes: int64(1), object(4)
memory usage: 36.8+ MB


### person_collision

In [24]:
person_collision = overall_df[["crash_date", "person_sex", "person_age",
                               "person_type", "crash_time"]]

person_collision['date_id'] = person_collision['crash_date'].apply(lambda x: x.strftime("%Y%m%d"))

# creating time_id column
person_collision['crash_time'] = person_collision['crash_time'].astype(str)
person_collision['temp1'] = person_collision['crash_date'].apply(lambda x: pd.to_datetime(x).strftime("%Y%m%d"))

## first convert to type string and use it to help create time_id column
person_collision['crash_date'] = person_collision['crash_date'].astype(str)
person_collision['time_id'] = person_collision[['temp1', 
                                                'crash_time']].agg('-'.join, axis=1)

# dropping unecessary columns and rearranging columns
person_collision.drop(["crash_date", "temp1", "crash_time"], axis = 1, inplace=True)
person_collision = person_collision[["date_id", "time_id", "person_age", "person_sex",
                                     "person_type"]]

display(person_collision)
print("\n")
person_collision.info()

Unnamed: 0,date_id,time_id,person_age,person_sex,person_type
0,20211213,20211213-00:34:00,28,F,Occupant
1,20211214,20211214-16:50:00,28,F,Occupant
2,20211214,20211214-08:30:00,46,M,Pedestrian
3,20211214,20211214-14:30:00,41,F,Occupant
4,20211211,20211211-04:45:00,41,F,Occupant
...,...,...,...,...,...
965415,20221029,20221029-13:45:00,26,F,Occupant
965416,20221029,20221029-08:00:00,55,M,Occupant
965417,20221029,20221029-14:10:00,55,M,Occupant
965418,20221023,20221023-17:00:00,28,M,Occupant




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965420 entries, 0 to 965419
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   date_id      965420 non-null  object
 1   time_id      965420 non-null  object
 2   person_age   965420 non-null  object
 3   person_sex   965420 non-null  object
 4   person_type  965420 non-null  object
dtypes: object(5)
memory usage: 36.8+ MB


------------------------------

## Uploading to Google BigQuery