## Libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
from google.cloud import bigquery
from google.oauth2 import service_account
from tqdm.notebook import tqdm_notebook
import time

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

------------------------------

## NYC Open Data Variables

In [2]:
# setup the host name for the API endpoint (the https:// part will be added automatically)
# only need to change this if you are not using NYC Open Data
data_url = 'data.cityofnewyork.us'

In [3]:
# setup the data set at the API endpoint (NYC Ferry Ridership data in this case)
# For example: https://data.cityofnewyork.us/resource/t5n6-gx8c.json
# would give us 'erm2-nwe9'
data_set1 = 'h9gi-nx95'
data_set2 = 'f55k-p6yu'

In [4]:
# Setup your App Token, which you created in Week 6
# You can find your app token by logging into: 
# https://data.cityofnewyork.us/profile/edit/developer_settings
#app_token = 'your app token here'
app_token = ''

In [5]:
# run this cell to setup your Socrata client that connects python to NYC Open Data

# create the client that points to the API endpoint
nyc_open_data_client = Socrata(data_url, app_token, timeout = 200)
print(f"nyc open data client name is: {nyc_open_data_client}")
print(f"nyc open data client data type is: {type(nyc_open_data_client)}")

nyc open data client name is: <sodapy.socrata.Socrata object at 0x0000025D0D8036D0>
nyc open data client data type is: <class 'sodapy.socrata.Socrata'>


------------------------------

## Data Extraction

In [6]:
# Get the total number of records in our the entire data set

#data_set1
total_record_count = nyc_open_data_client.get(data_set1, select = "COUNT(*)")
print(f"total records in data_set1, {data_set1}: {total_record_count[0]['COUNT']}")

#data_set2
total_record_count2 = nyc_open_data_client.get(data_set2, select = "COUNT(*)")
print(f"total records in data_set2, {data_set2}: {total_record_count2[0]['COUNT']}")

total records in data_set1, h9gi-nx95: 1940736
total records in data_set2, f55k-p6yu: 4847694


In [7]:
# Now, loop through target data set to pull all rows in chunks (we cannot pull all rows at once)
# AGAIN, UPDATE WHERE FILTER INSIDE BELOW FUNCTION

def extract_socrata_data(data_set,
                         chunk_size = 50000,
                         where = None):
    
    # measure time this function takes
    import time
    start_time1 = time.time()
    
    # get total number or records
    if where == None:
        total_records = int(nyc_open_data_client.get(data_set,
                                                     select= "COUNT(*)")[0]["COUNT"])
    else:
        total_records = int(nyc_open_data_client.get(data_set,
                                                     where = where,
                                                     select= "COUNT(*)")[0]["COUNT"])
    
    # start at 0, empty list for results
    start = 0                   
    results = []   
 
    # for progress bar
    pbar = tqdm_notebook(desc = 'while loop', total = total_records)

    while True:

        if where == None:
            # fetch the set of records starting at 'start'
            results.extend(nyc_open_data_client.get(data_set,
                                                    offset = start,
                                                    limit = chunk_size))
            
        elif where != None:
            results.extend(nyc_open_data_client.get(data_set,
                                                    where = where,
                                                    offset = start,
                                                    limit = chunk_size))
        # update the starting record number
        start = start + chunk_size
        
        # Update progress bar
        print(start, end ='\r')
        pbar.update(chunk_size)

        # if we have fetched all of the records (we have reached total_records), exit loop
        if (start > total_records):
        #if (start > 50000):
            print("Loop completed")
            #close progress bar
            pbar.close()
            break

    # convert the list into a pandas data frame
    end_time1 = time.time()
    print(f"Loop took {round(end_time1 - start_time1, 1)} seconds")
    
    start_time2 = time.time()
    
    data = pd.DataFrame.from_records(results)

    end_time2 = time.time()
    print(f"Transforming to pandas.DataFrame took {round(end_time2 - start_time2, 1)} seconds")

    print(f"The shape of your dataframe is: {data.shape}")
    return data

In [8]:
data1 = extract_socrata_data(chunk_size = 50000,
                             data_set = data_set1)

while loop:   0%|          | 0/1940736 [00:00<?, ?it/s]

Loop completed
Loop took 128.7 seconds
Transforming to pandas.DataFrame took 4.6 seconds
The shape of your dataframe is: (1940736, 29)


In [9]:
data2 = extract_socrata_data(data_set = data_set2,
                             chunk_size = 50000)

while loop:   0%|          | 0/4847694 [00:00<?, ?it/s]

Loop completed
Loop took 341.1 seconds
Transforming to pandas.DataFrame took 8.5 seconds
The shape of your dataframe is: (4847694, 21)


------------------------------

## Data Profiling

In [10]:
data1.info(verbose=True)
print("\n")
data2.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1940736 entries, 0 to 1940735
Data columns (total 29 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date                     object
 1   crash_time                     object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code              

In [11]:
# create and run a function to ceate data profiling dataframe

def create_data_profiling_df(data):
    
    # create an empty dataframe to gather information about each column
    data_profiling_df = pd.DataFrame(columns = ["column_name",
                                                "column_type",
                                                "unique_values",
                                                "duplicate_values",
                                                "null_values",
                                                "non_null_values"])

    # loop through each column to add rows to the data_profiling_df dataframe
    for column in data.columns:

        info_dict = {}

        try:
            info_dict["column_name"] = column
            info_dict["column_type"] = data[column].dtypes
            info_dict["unique_values"] = len(data[column].unique())
            info_dict["duplicate_values"] = data[column].count() - len(data[column].dropna().unique())
            info_dict["null_values"] = data[column].isna().sum()
            info_dict["non_null_values"] = data[column].count()
            info_dict["null_%"] = round(data[column].isna().sum() / len(data), 3)*100

        except:
            print(f"unable to read column: {column}, you may want to drop this column")

        data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)

    data_profiling_df=data_profiling_df[["column_name", "column_type", "null_values", "null_%", 
                                         "non_null_values", "unique_values", "duplicate_values"]]
                                         
    data_profiling_df.sort_values(by = ["null_values"],
                                  ascending = [False],
                                  inplace=True)
    
    return data_profiling_df

In [12]:
start_time = time.time()
profiling_data1 = create_data_profiling_df(data = data1)
profiling_data2 = create_data_profiling_df(data = data2)
end_time = time.time()

display(profiling_data1)
display(profiling_data2)
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

unable to read column: location, you may want to drop this column


Unnamed: 0,column_name,column_type,null_values,null_%,non_null_values,unique_values,duplicate_values
28,vehicle_type_code_5,object,1932888.0,99.6,7848.0,64.0,7785.0
27,contributing_factor_vehicle_5,object,1932658.0,99.6,8078.0,30.0,8049.0
26,vehicle_type_code_4,object,1911544.0,98.5,29192.0,91.0,29102.0
25,contributing_factor_vehicle_4,object,1910575.0,98.4,30161.0,40.0,30122.0
24,vehicle_type_code_3,object,1809377.0,93.2,131359.0,230.0,131130.0
23,contributing_factor_vehicle_3,object,1804903.0,93.0,135833.0,52.0,135782.0
22,cross_street_name,object,1629510.0,84.0,311226.0,200375.0,110852.0
3,off_street_name,object,715105.0,36.8,1225631.0,19692.0,1205940.0
18,zip_code,object,602561.0,31.0,1338175.0,234.0,1337942.0
17,borough,object,602327.0,31.0,1338409.0,6.0,1338404.0


Unnamed: 0,column_name,column_type,null_values,null_%,non_null_values,unique_values,duplicate_values
20,contributing_factor_2,object,4778044,98.6,69650,51,69600
19,contributing_factor_1,object,4777949,98.6,69745,54,69692
18,ped_action,object,4776772,98.5,70922,17,70906
17,ped_location,object,4776671,98.5,71023,5,71019
11,ejection,object,2365743,48.8,2481951,7,2481945
15,safety_equipment,object,2365467,48.8,2482227,19,2482209
14,position_in_vehicle,object,2365395,48.8,2482299,12,2482288
12,emotional_status,object,2298971,47.4,2548723,9,2548715
13,bodily_injury,object,2298928,47.4,2548766,15,2548752
16,complaint,object,2298921,47.4,2548773,22,2548752




function took 93.6 seconds


------------------------------

## Data Merging

In [13]:
start_time = time.time()
overall_df = pd.merge(data1, data2, on="collision_id")
end_time = time.time()

display(overall_df)

print("\n")
overall_df.info()
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

Unnamed: 0,crash_date_x,crash_time_x,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,ejection,emotional_status,bodily_injury,position_in_vehicle,safety_equipment,complaint,ped_location,ped_action,contributing_factor_1,contributing_factor_2
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,Not Ejected,Conscious,Back,Driver,Lap Belt,Complaint of Pain or Nausea,,,,
1,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
2,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
3,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,Not Ejected,Conscious,Back,"Front passenger, if two or more persons, inclu...",Lap Belt,Complaint of Pain or Nausea,,,,
4,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,Not Ejected,Conscious,Shoulder - Upper Arm,Driver,Lap Belt & Harness,Complaint of Pain or Nausea,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4847747,2022-10-26T00:00:00.000,15:00,EASTERN PARKWAY,BEDFORD AVENUE,0,0,0,0,0,0,...,,,,,,,,,,
4847748,2022-10-26T00:00:00.000,15:00,EASTERN PARKWAY,BEDFORD AVENUE,0,0,0,0,0,0,...,Not Ejected,Does Not Apply,Does Not Apply,Driver,Lap Belt & Harness,Does Not Apply,,,,
4847749,2022-10-26T00:00:00.000,15:00,EASTERN PARKWAY,BEDFORD AVENUE,0,0,0,0,0,0,...,,,,,,,,,,
4847750,2022-10-26T00:00:00.000,15:00,EASTERN PARKWAY,BEDFORD AVENUE,0,0,0,0,0,0,...,,,,,,,,,,




<class 'pandas.core.frame.DataFrame'>
Int64Index: 4847752 entries, 0 to 4847751
Data columns (total 49 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date_x                   object
 1   crash_time_x                   object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code            

------------------------------

## Data Cleaning

In [14]:
start_time = time.time()

overall_df.drop(["crash_date_y", "crash_time_y", "location"], axis = 1, inplace=True)
overall_df.rename(columns = {'crash_date_x':'crash_date',
                             'crash_time_x':'crash_time'}, inplace = True)

overall_df["crash_date"] = pd.to_datetime(data1["crash_date"]).dt.date
overall_df["crash_time"] = pd.to_datetime(data1["crash_time"], format="%H:%M").dt.time

end_time = time.time()

overall_df.info()
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4847752 entries, 0 to 4847751
Data columns (total 46 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date                     object
 1   crash_time                     object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code              

In [15]:
# dropping null values
overall_df = overall_df.dropna(subset=['borough', 'zip_code', 'longitude', 'latitude',
                                       'person_sex', 'person_age', 'crash_date', 
                                       'crash_time', 'number_of_persons_injured', 
                                       'number_of_persons_killed',
                                       'person_type']).reset_index(drop=True)
overall_df.shape

(965405, 46)

In [16]:
start_time = time.time()
profiling_data = create_data_profiling_df(data = overall_df)
end_time = time.time()

display(profiling_data)
print("\n")
print(f"function took {round(end_time - start_time, 1)} seconds")

Unnamed: 0,column_name,column_type,null_values,null_%,non_null_values,unique_values,duplicate_values
27,vehicle_type_code_5,object,954454,98.9,10951,33,10919
26,contributing_factor_vehicle_5,object,954223,98.8,11182,7,11176
45,contributing_factor_2,object,940598,97.4,24807,49,24759
44,contributing_factor_1,object,940570,97.4,24835,45,24791
43,ped_action,object,940558,97.4,24847,17,24831
42,ped_location,object,940535,97.4,24870,5,24866
25,vehicle_type_code_4,object,933623,96.7,31782,47,31736
24,contributing_factor_vehicle_4,object,932745,96.6,32660,16,32645
23,vehicle_type_code_3,object,861187,89.2,104218,117,104102
22,contributing_factor_vehicle_3,object,857564,88.8,107841,34,107808




function took 26.6 seconds


------------------------------

## Creating Dimensions

### location_dim

In [17]:
location_dim = overall_df[["borough", "zip_code", "longitude", "latitude"]].reset_index(drop=True)
location_dim.insert(0, 'location_id', range(1, 1 + len(location_dim)))

display(location_dim)
print("\n")
location_dim.info()

Unnamed: 0,location_id,borough,zip_code,longitude,latitude
0,1,BROOKLYN,11208,-73.8665,40.667202
1,2,BROOKLYN,11208,-73.8665,40.667202
2,3,BROOKLYN,11233,-73.917274,40.683304
3,4,BRONX,10475,-73.83148,40.86816
4,5,BRONX,10475,-73.83148,40.86816
...,...,...,...,...,...
965400,965401,BRONX,10460,-73.8781100,40.8388700
965401,965402,BRONX,10460,-73.8781100,40.8388700
965402,965403,BRONX,10460,-73.8781100,40.8388700
965403,965404,BRONX,10460,-73.8781100,40.8388700




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965405 entries, 0 to 965404
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   location_id  965405 non-null  int64 
 1   borough      965405 non-null  object
 2   zip_code     965405 non-null  object
 3   longitude    965405 non-null  object
 4   latitude     965405 non-null  object
dtypes: int64(1), object(4)
memory usage: 36.8+ MB


### demographics_dim

In [18]:
demographics_dim = overall_df[["person_sex", "person_age", 
                               "person_type"]].reset_index(drop=True)
demographics_dim.insert(0, 'demographics_id', range(1, 1 + len(demographics_dim)))

display(demographics_dim)
print("\n")
demographics_dim.info()

Unnamed: 0,demographics_id,person_sex,person_age,person_type
0,1,F,28,Occupant
1,2,F,28,Occupant
2,3,M,46,Pedestrian
3,4,F,41,Occupant
4,5,F,41,Occupant
...,...,...,...,...
965400,965401,M,46,Occupant
965401,965402,M,52,Occupant
965402,965403,M,52,Occupant
965403,965404,M,42,Occupant




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965405 entries, 0 to 965404
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   demographics_id  965405 non-null  int64 
 1   person_sex       965405 non-null  object
 2   person_age       965405 non-null  object
 3   person_type      965405 non-null  object
dtypes: int64(1), object(3)
memory usage: 29.5+ MB


### date_dim

In [19]:
start_time = time.time()
date_dim = overall_df[["crash_date"]]
# creating date_id from crash_date
date_dim['date_id'] = overall_df['crash_date'].apply(lambda x: pd.to_datetime(x).strftime("%Y%m%d"))
end_time = time.time()

print(f"function took {round(end_time - start_time, 1)} seconds")

function took 66.7 seconds


In [20]:
date_dim['crash_date']= pd.to_datetime(date_dim['crash_date'])
date_dim['day_of_week'] = date_dim['crash_date'].dt.day_name()
date_dim['month'] = date_dim['crash_date'].dt.month
date_dim['year'] = date_dim['crash_date'].dt.year
date_dim['quarter'] = date_dim['crash_date'].dt.to_period('Q')

# rearranging columns
date_dim = date_dim[["date_id", "crash_date", "day_of_week", "month",
                    "year", "quarter"]]

display(date_dim)
print("\n")
date_dim.info()

Unnamed: 0,date_id,crash_date,day_of_week,month,year,quarter
0,20211213,2021-12-13,Monday,12,2021,2021Q4
1,20211214,2021-12-14,Tuesday,12,2021,2021Q4
2,20211214,2021-12-14,Tuesday,12,2021,2021Q4
3,20211214,2021-12-14,Tuesday,12,2021,2021Q4
4,20211211,2021-12-11,Saturday,12,2021,2021Q4
...,...,...,...,...,...,...
965400,20221025,2022-10-25,Tuesday,10,2022,2022Q4
965401,20221028,2022-10-28,Friday,10,2022,2022Q4
965402,20221028,2022-10-28,Friday,10,2022,2022Q4
965403,20221026,2022-10-26,Wednesday,10,2022,2022Q4




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965405 entries, 0 to 965404
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   date_id      965405 non-null  object        
 1   crash_date   965405 non-null  datetime64[ns]
 2   day_of_week  965405 non-null  object        
 3   month        965405 non-null  int64         
 4   year         965405 non-null  int64         
 5   quarter      965405 non-null  period[Q-DEC] 
dtypes: datetime64[ns](1), int64(2), object(2), period[Q-DEC](1)
memory usage: 44.2+ MB


### time_dim

In [21]:
# needed initial columns
time_dim = overall_df[["crash_date", "crash_time"]]

# making "day" column
time_dim['crash_date']= pd.to_datetime(time_dim['crash_date'])
time_dim['day'] = time_dim['crash_date'].dt.day_name()

# making "hour" and "am_pm_flag" columns 
## first convert to type string
time_dim['crash_time'] = time_dim['crash_time'].astype(str)

time_dim['hour'] = pd.to_datetime(time_dim['crash_time']).dt.hour
time_dim['am_pm_flag'] = np.where(time_dim['hour']<=12, 'AM', 'PM')

# creating id column
time_dim['temp1'] = time_dim['crash_date'].apply(lambda x: x.strftime("%Y%m%d-"))

## first convert to type string and use it to help create time_id column
time_dim['crash_date'] = time_dim['crash_date'].astype(str)
time_dim['time_id'] = time_dim[['temp1', 'crash_time']].agg('-'.join, axis=1)

# revert back to datetime.time and dropping unecessary columns
time_dim["crash_time"] = pd.to_datetime(time_dim["crash_time"], format="%H:%M:%S").dt.time
time_dim.drop(["crash_date", "temp1"], axis = 1, inplace=True)

# rearranging columns
time_dim = time_dim[["time_id", "crash_time", "day", "hour", "am_pm_flag"]]

display(time_dim)
print("\n")
time_dim.info()

Unnamed: 0,time_id,crash_time,day,hour,am_pm_flag
0,20211213--00:34:00,00:34:00,Monday,0,AM
1,20211214--16:50:00,16:50:00,Tuesday,16,PM
2,20211214--08:30:00,08:30:00,Tuesday,8,AM
3,20211214--14:30:00,14:30:00,Tuesday,14,PM
4,20211211--04:45:00,04:45:00,Saturday,4,AM
...,...,...,...,...,...
965400,20221025--08:30:00,08:30:00,Tuesday,8,AM
965401,20221028--11:45:00,11:45:00,Friday,11,AM
965402,20221028--22:17:00,22:17:00,Friday,22,PM
965403,20221026--10:40:00,10:40:00,Wednesday,10,AM




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965405 entries, 0 to 965404
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_id     965405 non-null  object
 1   crash_time  965405 non-null  object
 2   day         965405 non-null  object
 3   hour        965405 non-null  int64 
 4   am_pm_flag  965405 non-null  object
dtypes: int64(1), object(4)
memory usage: 36.8+ MB


------------------------------

## Creating Fact Tables

### collision_occurence

In [22]:
collision_occurence_temp = overall_df[["crash_date", "number_of_persons_injured",
                                       "number_of_persons_killed"]]

collision_occurence_temp['date_id'] = collision_occurence_temp['crash_date'].apply(lambda x: pd.to_datetime(x).strftime("%Y%m%d"))

In [23]:
collision_occurence = date_dim[["date_id"]]
collision_occurence = pd.merge(collision_occurence, collision_occurence_temp, 
                               on="date_id")

In [24]:
collision_occurence["time_id"] = time_dim[["time_id"]]
collision_occurence["location_id"] = location_dim[["location_id"]]
collision_occurence = collision_occurence.dropna(subset=['time_id', 
                                                         'location_id']).reset_index(drop=True)
collision_occurence.drop("crash_date", axis = 1, inplace = True)

# rearranging columns
collision_occurence = collision_occurence[["date_id", "time_id", "location_id",
                                           "number_of_persons_injured",
                                           "number_of_persons_killed"]]

display(collision_occurence)
print("\n")
collision_occurence.info()

Unnamed: 0,date_id,time_id,location_id,number_of_persons_injured,number_of_persons_killed
0,20211213,20211213--00:34:00,1.0,0,0
1,20211213,20211214--16:50:00,2.0,2,0
2,20211213,20211214--08:30:00,3.0,0,0
3,20211213,20211214--14:30:00,4.0,0,0
4,20211213,20211211--04:45:00,5.0,0,0
...,...,...,...,...,...
965400,20190521,20221025--08:30:00,965401.0,1,0
965401,20190521,20221028--11:45:00,965402.0,1,0
965402,20190521,20221028--22:17:00,965403.0,0,0
965403,20190521,20221026--10:40:00,965404.0,1,0




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965405 entries, 0 to 965404
Data columns (total 5 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   date_id                    965405 non-null  object 
 1   time_id                    965405 non-null  object 
 2   location_id                965405 non-null  float64
 3   number_of_persons_injured  965405 non-null  object 
 4   number_of_persons_killed   965405 non-null  object 
dtypes: float64(1), object(4)
memory usage: 36.8+ MB


### person_collision

In [25]:
person_collision = overall_df[["crash_date", "person_sex", "person_age",
                               "person_type", "crash_time"]]

person_collision['date_id'] = person_collision['crash_date'].apply(lambda x: x.strftime("%Y%m%d-"))

# creating time_id column
person_collision['crash_time'] = person_collision['crash_time'].astype(str)
person_collision['temp1'] = person_collision['crash_date'].apply(lambda x: pd.to_datetime(x).strftime("%Y%m%d-"))

## first convert to type string and use it to help create time_id column
person_collision['crash_date'] = person_collision['crash_date'].astype(str)
person_collision['time_id'] = person_collision[['temp1', 
                                                'crash_time']].agg('-'.join, axis=1)

# dropping unecessary columns and rearranging columns
person_collision.drop(["crash_date", "temp1", "crash_time"], axis = 1, inplace=True)
person_collision = person_collision[["date_id", "time_id", "person_age", "person_sex",
                                     "person_type"]]

display(person_collision)
print("\n")
person_collision.info()

Unnamed: 0,date_id,time_id,person_age,person_sex,person_type
0,20211213-,20211213--00:34:00,28,F,Occupant
1,20211214-,20211214--16:50:00,28,F,Occupant
2,20211214-,20211214--08:30:00,46,M,Pedestrian
3,20211214-,20211214--14:30:00,41,F,Occupant
4,20211211-,20211211--04:45:00,41,F,Occupant
...,...,...,...,...,...
965400,20221025-,20221025--08:30:00,46,M,Occupant
965401,20221028-,20221028--11:45:00,52,M,Occupant
965402,20221028-,20221028--22:17:00,52,M,Occupant
965403,20221026-,20221026--10:40:00,42,M,Occupant




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965405 entries, 0 to 965404
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   date_id      965405 non-null  object
 1   time_id      965405 non-null  object
 2   person_age   965405 non-null  object
 3   person_sex   965405 non-null  object
 4   person_type  965405 non-null  object
dtypes: object(5)
memory usage: 36.8+ MB


------------------------------

## Uploading to Google BigQuery