### Star Schema

In [1]:
from PIL import Image
import requests
import matplotlib.pyplot as plt

# Corrigir import do star schema
url = 'https://i.imgur.com/1prxQ4H.png'
response = requests.get(url, stream=True)
img = Image.open(response.raw)

plt.imshow(img)
plt.show()

<Figure size 640x480 with 1 Axes>

### Importação dos datasets relevantes ao problema

Datasets a importar:
* US Accidents
* TMC's e eventos associados
* Calendário com fases da lua (\*)
* Limite máximo de velocidade por estado
* Dados de consumo de álcool por estado
* Quantidade de automóveis registados por estado

In [2]:
import numpy as np
import pandas as pd
import re
import datetime as dt

##### Importação do dataset US Accidents

In [3]:
def generate_sample(filename, s):
    n = np.sum([1 for _ in open(filename)]) - 1 # number of records in file (excludes header)
    indices = np.arange(1, n + 1)
    skip = np.sort(np.random.choice(indices, size=n - s, replace=False)) # random indices to skip

    return pd.read_csv(filename, skiprows=skip)

In [4]:
# import US accidents dataset
#us_acidents = pd.read_csv('US_Accidents_Dec19.csv')
us_acidents = generate_sample('US_Accidents_Dec19.csv', 50000)

us_acidents.head(5)

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-205,MapQuest,406.0,3,2016-02-16 16:47:00,2016-02-16 21:00:00,40.016014,-82.903442,,,...,False,False,False,False,False,False,Day,Day,Day,Day
1,A-213,MapQuest,201.0,3,2016-02-17 07:22:56,2016-02-17 07:52:56,41.063995,-81.572945,,,...,False,False,False,False,False,False,Day,Day,Day,Day
2,A-266,MapQuest,201.0,2,2016-02-18 15:46:33,2016-02-18 16:31:33,39.759544,-84.191811,,,...,False,False,False,False,False,False,Day,Day,Day,Day
3,A-308,MapQuest-Bing,201.0,3,2016-02-22 07:24:22,2016-02-22 07:54:22,39.933804,-82.789444,,,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-311,MapQuest,201.0,2,2016-02-22 07:51:22,2016-02-22 08:21:22,39.772438,-84.142181,,,...,False,False,False,False,False,False,Day,Day,Day,Day


##### Importação do dataset relativo a TMCs / Criação da Dimensão TMC

In [5]:
# import event codes
event_code = pd.read_csv('event_code_tmc.csv', delimiter=';')
event_code = event_code.rename(columns={"Code": "TMC_Key", "Description":"Event"})

# add out-of-range key
event_code = event_code.append({'TMC_Key': -1, 'Event': 'unidentified'}, ignore_index=True)
event_code.head(5)

Unnamed: 0,TMC_Key,Event
0,1,traffic problem
1,2,queuing traffic (with average speeds Q). Dange...
2,11,overheight warning system triggered
3,12,"(Q) accident(s), traffic being directed around..."
4,16,"closed, rescue and recovery work in progress"


##### Importação do dataset sobre consumo de álcool

In [6]:
def discretize_rate(x):
    if x <= 2.0:
        return 'Low'
    elif 2.0 < x <= 2.5:
        return 'Medium'
    elif 2.5 < x <= 3.5:
        return 'High'
    else:
        return 'Very High'

# import alcohool consumption rates by state
alcohol_consumption = pd.read_csv('alcohol_consumption_state.csv', delimiter=';')

alcohol_consumption.at[20, "State"] = "Massachusetts"

# sort and reset index
alcohol_consumption = alcohol_consumption.sort_values(by=['rate'], ascending=False).reset_index(drop=True)

alcohol_consumption['DiscreteRate'] = alcohol_consumption['rate'].map(discretize_rate)

alcohol_dict = pd.Series(alcohol_consumption['DiscreteRate'].values, index=alcohol_consumption['State']).to_dict()
alcohol_dict

{'New Hampshire': 'Very High',
 'Delaware': 'Very High',
 'Nevada': 'High',
 'North Dakota': 'High',
 'Montana': 'High',
 'Vermont': 'High',
 'Wisconsin': 'High',
 'Idaho': 'High',
 'South Dakota': 'High',
 'Colorado': 'High',
 'Maine': 'High',
 'Alaska': 'High',
 'Minnesota': 'High',
 'Oregon': 'High',
 'Wyoming': 'High',
 'Hawaii': 'High',
 'Florida': 'High',
 'Rhode Island': 'High',
 'Massachusetts': 'High',
 'Louisiana': 'High',
 'Missouri': 'High',
 'Connecticut': 'Medium',
 'Iowa': 'Medium',
 'Illinois': 'Medium',
 'Texas': 'Medium',
 'Pennsylvania': 'Medium',
 'Michigan': 'Medium',
 'New Jersey': 'Medium',
 'California': 'Medium',
 'Arizona': 'Medium',
 'Washington': 'Medium',
 'Nebraska': 'Medium',
 'Mississippi': 'Medium',
 'New York': 'Medium',
 'South Carolina': 'Medium',
 'New Mexico': 'Medium',
 'North Carolina': 'Medium',
 'Maryland': 'Medium',
 'Virginia': 'Medium',
 'Indiana': 'Medium',
 'Tennessee': 'Medium',
 'Ohio': 'Medium',
 'Alabama': 'Low',
 'Kentucky': 'Low',
 '

##### Importação do dataset sobre registos automóveis

In [7]:
def discretize_total(x):
    if x <= 5000000:
        return 'Low'
    elif 5000000 < x <= 10000000:
        return 'Medium'
    elif 10000000 < x <= 20000000:
        return 'High'
    else:
        return 'Very High'

# import vehicle registrations by state
vehicle_registration = pd.read_csv('vehicle_registrations_usa.csv', delimiter=';')
# filter out non States
vehicle_registration = vehicle_registration[vehicle_registration['State'] != 'Dist. of Col.']

vehicle_registration['Total'] = vehicle_registration['Total'].map(lambda x: int(re.sub(' ', '', x)))

# sort and reset index
vehicle_registration = vehicle_registration.sort_values(by=['Total'], ascending=False).reset_index(drop=True)

vehicle_registration['DiscreteTotal'] = vehicle_registration['Total'].map(discretize_total)

vehicle_dict = pd.Series(vehicle_registration['DiscreteTotal'].values, index=vehicle_registration['State']).to_dict()
vehicle_dict

{'California ': 'Very High',
 'Texas': 'Very High',
 'Florida': 'High',
 'New York': 'High',
 'Ohio': 'High',
 'Pennsylvania': 'High',
 'Illinois': 'High',
 'Georgia': 'Medium',
 'Michigan': 'Medium',
 'North Carolina': 'Medium',
 'Virginia': 'Medium',
 'Washington': 'Medium',
 'Indiana': 'Medium',
 'New Jersey': 'Medium',
 'Arizona': 'Medium',
 'Tennessee': 'Medium',
 'Wisconsin': 'Medium',
 'Missouri': 'Medium',
 'Minnesota': 'Medium',
 'Colorado': 'Medium',
 'Alabama': 'Medium',
 'Massachusetts': 'Medium',
 'South Carolina': 'Low',
 'Kentucky': 'Low',
 'Maryland': 'Low',
 'Oregon': 'Low',
 'Louisiana': 'Low',
 'Oklahoma': 'Low',
 'Iowa': 'Low',
 'Connecticut ': 'Low',
 'Arkansas': 'Low',
 'Kansas': 'Low',
 'Nevada': 'Low',
 'Utah': 'Low',
 'Mississippi': 'Low',
 'Nebraska': 'Low',
 'Idaho': 'Low',
 'Montana': 'Low',
 'New Mexico': 'Low',
 'West Virginia': 'Low',
 'New Hampshire': 'Low',
 'South Dakota': 'Low',
 'Hawaii': 'Low',
 'Maine': 'Low',
 'Delaware': 'Low',
 'North Dakota': '

##### Importação do dataset sobre nível de urbanização

In [8]:
# import rural/urban information about counties
urban = pd.read_excel('NCHSURCodes2013.xlsx')

# drop unwanted information
urban = urban.drop(['State Abr.','CBSA title', 'CBSA 2012 pop', 'County 2012 pop'], axis=1)

# remove County from name
urban['County name'] = urban['County name'].map(lambda x: re.sub(' County', '', x))

urban_term = {1:'Large Central Metro',
              2:'Large Fringe Metro',
              3:'Medium Metro',
              4:'Small Metro',
              5:'Micropolitan',
              6:'Non-Core'}

urban["2013 code"] = urban["2013 code"].map(urban_term)

urban_dict = pd.Series(urban['2013 code'].values, index=urban['County name']).to_dict()
urban_dict

{'Autauga': 'Medium Metro',
 'Baldwin': 'Micropolitan',
 'Barbour': 'Non-Core',
 'Bibb': 'Small Metro',
 'Blount': 'Medium Metro',
 'Bullock': 'Non-Core',
 'Butler': 'Large Fringe Metro',
 'Calhoun': 'Non-Core',
 'Chambers': 'Large Fringe Metro',
 'Cherokee': 'Micropolitan',
 'Chilton': 'Large Fringe Metro',
 'Choctaw': 'Non-Core',
 'Clarke': 'Large Fringe Metro',
 'Clay': 'Small Metro',
 'Cleburne': 'Non-Core',
 'Coffee': 'Micropolitan',
 'Colbert': 'Small Metro',
 'Conecuh': 'Non-Core',
 'Coosa': 'Micropolitan',
 'Covington': 'Non-Core',
 'Crenshaw': 'Non-Core',
 'Cullman': 'Micropolitan',
 'Dale': 'Micropolitan',
 'Dallas': 'Large Central Metro',
 'DeKalb': 'Non-Core',
 'Elmore': 'Micropolitan',
 'Escambia': 'Medium Metro',
 'Etowah': 'Small Metro',
 'Fayette': 'Small Metro',
 'Franklin': 'Medium Metro',
 'Geneva': 'Small Metro',
 'Greene': 'Small Metro',
 'Hale': 'Micropolitan',
 'Henry': 'Micropolitan',
 'Houston': 'Non-Core',
 'Jackson': 'Non-Core',
 'Jefferson': 'Micropolitan',


##### Importação do dataset sobre limites de velocidade

In [9]:
def discretize_speed(x):
    if x < 70:
        return 'Low'
    elif 70 <= x < 80:
        return 'Medium'
    else:
        return 'High'

# import maximum speed limits by state
speed_limits = pd.read_excel('speed_limit_state.xlsx')

# drop unwanted information
speed_limits = speed_limits.drop(['Freeway (trucks)','Freeway (urban)','Divided (rural)','Undivided (rural)','Residential'], axis=1)

speed_limits = speed_limits.rename(columns={'State or territory': 'State', 'Freeway (rural)': 'Max Speed Limit (mph)'})

# List of all the US States
us_states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
             'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois',
             'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
             'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
             'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
             'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
             'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas','Utah',
             'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']

# cleaning, avoid applying the function to missing values
speed_limits['State'] = speed_limits['State'].map(lambda x: re.sub('[^a-z A-Z]+', '', x), na_action='ignore')
speed_limits['Max Speed Limit (mph)'] = speed_limits['Max Speed Limit (mph)'].map(lambda x: re.sub('\(.*', '', x), na_action='ignore')

# filter out non US States
speed_limits = speed_limits[speed_limits['State'].isin(us_states)]

# more cleaning
speed_limits['Max Speed Limit (mph)'] = speed_limits['Max Speed Limit (mph)'].map(lambda x: int(x[:-4][-3:]))
speed_limits.at[41, 'Max Speed Limit (mph)'] = 70

# sort and reset index
speed_limits = speed_limits.sort_values(by=['Max Speed Limit (mph)'], ascending=False).reset_index(drop=True)

speed_limits['Discrete Max Speed Limit'] = speed_limits['Max Speed Limit (mph)'].map(discretize_speed)

speed_dict = pd.Series(speed_limits['Discrete Max Speed Limit'].values, index=speed_limits['State']).to_dict()
speed_dict

{'Texas': 'High',
 'Montana': 'High',
 'Idaho': 'High',
 'Utah': 'High',
 'South Dakota': 'High',
 'Nevada': 'High',
 'Wyoming': 'High',
 'Maine': 'Medium',
 'Michigan': 'Medium',
 'North Dakota': 'Medium',
 'Colorado': 'Medium',
 'Nebraska': 'Medium',
 'Arizona': 'Medium',
 'Arkansas': 'Medium',
 'Kansas': 'Medium',
 'Oklahoma': 'Medium',
 'Louisiana': 'Medium',
 'New Mexico': 'Medium',
 'Ohio': 'Medium',
 'Oregon': 'Medium',
 'Pennsylvania': 'Medium',
 'New Hampshire': 'Medium',
 'South Carolina': 'Medium',
 'California': 'Medium',
 'Tennessee': 'Medium',
 'Virginia': 'Medium',
 'Washington': 'Medium',
 'West Virginia': 'Medium',
 'Wisconsin': 'Medium',
 'North Carolina': 'Medium',
 'Alabama': 'Medium',
 'Maryland': 'Medium',
 'Indiana': 'Medium',
 'Florida': 'Medium',
 'Georgia': 'Medium',
 'Missouri': 'Medium',
 'Mississippi': 'Medium',
 'Minnesota': 'Medium',
 'Illinois': 'Medium',
 'Kentucky': 'Medium',
 'Iowa': 'Medium',
 'Connecticut': 'Low',
 'Vermont': 'Low',
 'New York': 'Lo

### Tratamento do dataset original

#### Tratamento de dados nulos/incompletos

* As linhas que têm valores relativos ao twilight e período do dia vazios serão removidas; 
* As linhas que não têm informação relativa à cidade na qual o acidente ocorre também serão removidas;
* Os códigos TMC (Traffic Message Channel) que sejam nulos serão identificados com um número fora do domínio dos códigos existentes e não vão ser relevantes para possíveis interrogações;
* Definição de valores para dados meteorológicos nulos será feita com base na média de valores em cada atributo, ao nível da cidade, 
na respetiva semana em que ocorreu o acidente (tal será feito na criação da *dimension* Weather);
* O preenchimento de dados quem não contêm fuso horário será feito através da correspondência entre o estado onde o acidente ocorreu e o fuso horário no qual se encontra.

#### Remoção de colunas
* 'Source',
* 'Start_Lat',
* 'Country',
* 'Start_Lng',
* 'End_Lat',
* 'End_Lng',
* 'Description',
* 'Number',
* 'Side',
* 'Zipcode',
* 'Airport_Code',
* 'Weather_Timestamp',
* 'Pressure(in)', 
* 'Wind_Chill(F)'
 

In [10]:
us_acidents.columns

Index(['ID', 'Source', 'TMC', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)',
       'Description', 'Number', 'Street', 'Side', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp',
       'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

In [11]:
# drop unuseful information/columns
us_acidents = us_acidents.drop(['Source', 'Start_Lat', 'Country', 'Start_Lng', 'End_Lat', 'End_Lng', 'Description', 'Number', 'Side', 'Zipcode', 'Airport_Code', 'Weather_Timestamp', 'Pressure(in)', 'Wind_Chill(F)'], axis=1)

In [12]:
# delete rows with non avaliable values
us_acidents = us_acidents.dropna(subset=['City', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']).reset_index(drop=True)

In [13]:
# replace TMC values with out-of-range values, not used in OLAP.
us_acidents['TMC'].fillna(-1, inplace=True)

In [14]:
# replace empty timezone values with the correspondent ones from each state
timezone_df = us_acidents[['State', 'Timezone']].drop_duplicates(subset="State")
timezone_dict = pd.Series(timezone_df['Timezone'].values, index=timezone_df['State']).to_dict()

us_acidents['Timezone'].fillna(us_acidents['State'].map(timezone_dict), inplace=True)

In [15]:
us_acidents.isna().sum()

ID                           0
TMC                          0
Severity                     0
Start_Time                   0
End_Time                     0
Distance(mi)                 0
Street                       0
City                         0
County                       0
State                        0
Timezone                     0
Temperature(F)             879
Humidity(%)                931
Visibility(mi)            1053
Wind_Direction             693
Wind_Speed(mph)           7419
Precipitation(in)        33820
Weather_Condition         1038
Amenity                      0
Bump                         0
Crossing                     0
Give_Way                     0
Junction                     0
No_Exit                      0
Railway                      0
Roundabout                   0
Station                      0
Stop                         0
Traffic_Calming              0
Traffic_Signal               0
Turning_Loop                 0
Sunrise_Sunset               0
Civil_Tw

#### POI Dimension

In [16]:
# add POI_Key
us_acidents.insert(us_acidents.columns.get_loc('Amenity'), 'POI_Key', range(1, len(us_acidents) + 1))

In [17]:
# create POI dataframe
poi_dimension = us_acidents.loc[:,'POI_Key':'Turning_Loop']

poi_dimension.columns

Index(['POI_Key', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction',
       'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
       'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop'],
      dtype='object')

In [18]:
poi_dimension.sample(5)

Unnamed: 0,POI_Key,Amenity,Bump,Crossing,Give_Way,Junction,No_Exit,Railway,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop
7308,7309,False,False,False,False,False,False,False,False,False,False,False,False,False
46896,46897,False,False,False,False,False,False,False,False,False,False,False,True,False
39712,39713,False,False,False,False,False,False,False,False,False,False,False,False,False
36480,36481,False,False,True,False,False,False,False,False,False,False,False,True,False
15622,15623,False,False,False,False,False,False,False,False,False,False,False,False,False


In [19]:
# drop unwanted data from the accidents dataframe
#us_acidents = us_acidents.drop(poi_dimension.columns[1:], axis=1)

# accidents dataframe with POI_Key
us_acidents.columns

Index(['ID', 'TMC', 'Severity', 'Start_Time', 'End_Time', 'Distance(mi)',
       'Street', 'City', 'County', 'State', 'Timezone', 'Temperature(F)',
       'Humidity(%)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
       'Precipitation(in)', 'Weather_Condition', 'POI_Key', 'Amenity', 'Bump',
       'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout',
       'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight'],
      dtype='object')

#### Location Dimension

##### Inserção de Chave

In [20]:
# add location key
us_acidents.insert(us_acidents.columns.get_loc('Street'), 'LocationKey', range(1, len(us_acidents) + 1))

##### Definição dos atributos de região

In [21]:
us_acidents['State'].unique()

array(['OH', 'CA', 'FL', 'GA', 'SC', 'NE', 'IA', 'IL', 'MO', 'WI', 'MI',
       'MA', 'CT', 'NY', 'NJ', 'RI', 'NH', 'VA', 'PA', 'MD', 'DE', 'DC',
       'TX', 'WA', 'OR', 'IN', 'TN', 'AZ', 'OK', 'NC', 'AL', 'UT', 'MN',
       'KY', 'LA', 'CO', 'KS', 'NV', 'MS', 'NM', 'AR', 'VT', 'WY', 'WV',
       'ID', 'ME', 'MT', 'SD', 'ND'], dtype=object)

In [22]:
# Eliminar??
# Muitos dados que temos não incluem este estado - Não é um estado
us_acidents[us_acidents["State"] != 'DC'].reset_index(drop=True, inplace=True)

In [23]:
replace_values = {'OH':'Ohio', 'WV':'West Virginia', 'CA': 'California', 'FL': 'Florida', 'GA': 'Georgia', 'SC':'South Carolina', 'NE': 'Nebraska', 'IA': 'Iowa', 'IL': 'Illinois', 'MO': 'Missouri', 'WI': 'Wisconsin',
       'IN': 'Indiana', 'MI': 'Michigan', 'NJ': 'New Jersey', 'NY': 'New York', 'CT': 'Connecticut', 'MA': 'Massachusetts', 'RI': 'Rhode Island', 'NH': 'New Hampshire', 'PA': 'Pennsylvania', 'KY': 'Kentucky', 'MD': 'Maryland',
       'VA': 'Virginia', 'DE': 'Delaware', 'TX':'Texas', 'WA': 'Washington', 'OR': 'Oregon', 'AL': 'Alabama', 'NC': 'North Carolina', 'MN': 'Minnesota', 'OK': 'Oklahoma', 'LA': 'Louisiana',
   'TN': 'Tennessee', 'UT': 'Utah', 'CO': 'Colorado', 'AZ': 'Arizona', 'NV': 'Nevada', 'KS': 'Kansas', 'MS': 'Mississippi', 'NM': 'New Mexico', 'ME': 'Maine', 'AR': 'Arkansas', 'WY': 'Wyoming','VT': 'Vermont', 'ID': 'Idaho', 'ND': 'North Dakota', 'MT': 'Montana', 'SD': 'South Dakota'}

In [24]:
us_acidents = us_acidents.replace({"State": replace_values})

In [25]:
# https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#Census_Bureau-designated_regions_and_divisions
sub_regions = {
    'New England': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont'],
    'Mid Atlantic': ['New Jersey', 'New York','Pennsylvania'],
    'East North Central': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin'],
    'West North Central': ['Iowa','Kansas','Minnesota','Missouri','Nebraska','North Dakota','South Dakota'],
    'South Atlantic': ['Delaware','Florida','Georgia','Maryland','North Carolina','South Carolina','Virginia','West Virginia'],
    'East South Central': ['Alabama','Kentucky','Mississippi','Tennessee'],
    'West South Central': ['Arkansas','Louisiana','Oklahoma','Texas'],
    'Mountain': ['Arizona','Colorado','Idaho','Montana','Nevada','New Mexico','Utah','Wyoming'],
    'Pacific': ['Alaska','California','Hawaii','Oregon','Washington']
}

regions = {
    'Northeast': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont', 'New Jersey', 'New York','Pennsylvania'],
    'Midwest': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa','Kansas','Minnesota','Missouri','Nebraska','North Dakota','South Dakota'],
    'South': ['Delaware','Florida','Georgia','Maryland','North Carolina','South Carolina','Virginia','District of Columbia','West Virginia', 'Alabama','Kentucky','Mississippi','Tennessee', 'Arkansas','Louisiana','Oklahoma','Texas'],
    'West': ['Arizona','Colorado','Idaho','Montana','Nevada','New Mexico','Utah','Wyoming', 'Alaska','California','Hawaii','Oregon','Washington']
}

# create an inverted dictionary map
regions_dict = dict([(v, k) for k, l in regions.items() for v in l])
sub_regions_dict = dict([(v, k) for k, l in sub_regions.items() for v in l])

##### Inserção dos atributos Região e Sub-Região

In [26]:
# add region
us_acidents.insert(us_acidents.columns.get_loc("Timezone"), 'Region', us_acidents['State'].map(regions_dict))
us_acidents['Region'].value_counts()

South        20893
West         16090
Midwest       6996
Northeast     5960
Name: Region, dtype: int64

In [27]:
# add subregion
us_acidents.insert(us_acidents.columns.get_loc('Timezone'), 'Subregion', us_acidents['State'].map(sub_regions_dict))
us_acidents['Subregion'].value_counts()

Pacific               13473
South Atlantic        12250
West South Central     6654
East North Central     4752
Mid Atlantic           4679
Mountain               2617
West North Central     2244
East South Central     1989
New England            1281
Name: Subregion, dtype: int64

##### Inserção de atributo AlcoholConsumptionRate

In [28]:
# add alcohol consumption
us_acidents.insert(us_acidents.columns.get_loc('Timezone'), 'AlcoholConsumptionRate', us_acidents['State'].map(alcohol_dict))
us_acidents['AlcoholConsumptionRate'].value_counts()

Medium       36521
High          9222
Low           4011
Very High      185
Name: AlcoholConsumptionRate, dtype: int64

#### Inserção do atributo NumberVehicleRegistrations

In [29]:
# add number of vehicles
us_acidents.insert(us_acidents.columns.get_loc('Timezone'), 'VehicleRegistrations', us_acidents['State'].map(vehicle_dict))
us_acidents['VehicleRegistrations'].value_counts()

Medium       14678
High         10064
Low           8637
Very High     4942
Name: VehicleRegistrations, dtype: int64

#### Inserção do atributo UrbanRuralClassification

In [30]:
# add rate
us_acidents.insert(us_acidents.columns.get_loc('Timezone'), 'UrbanRuralClassification', us_acidents['County'].map(urban_dict))

# ~81000 rows dropped - Errors in County, only with manual correction allowed
us_acidents = us_acidents[us_acidents['UrbanRuralClassification'].notna()]

us_acidents['UrbanRuralClassification'].value_counts()

Large Central Metro    18928
Non-Core                9372
Medium Metro            7323
Large Fringe Metro      7048
Micropolitan            3084
Small Metro             2534
Name: UrbanRuralClassification, dtype: int64

#### Inserção do atributo MaximumSpeedLimit

In [31]:
# add max speed limit
us_acidents.insert(us_acidents.columns.get_loc('Timezone'), 'MaximumSpeedLimit', us_acidents['State'].map(speed_dict))

us_acidents['MaximumSpeedLimit'].value_counts()

Medium    38036
High       5841
Low        4356
Name: MaximumSpeedLimit, dtype: int64

TBD:
* MoonCalendar

In [32]:
# create location dataframe
location_dimension = us_acidents.loc[:,'LocationKey':'MaximumSpeedLimit']

location_dimension.columns

Index(['LocationKey', 'Street', 'City', 'County', 'State', 'Region',
       'Subregion', 'AlcoholConsumptionRate', 'VehicleRegistrations',
       'UrbanRuralClassification', 'MaximumSpeedLimit'],
      dtype='object')

In [33]:
# drop unwanted data from the accidents dataframe
#us_acidents = us_acidents.drop(location_dimension.columns[1:], axis=1)

us_acidents.columns

Index(['ID', 'TMC', 'Severity', 'Start_Time', 'End_Time', 'Distance(mi)',
       'LocationKey', 'Street', 'City', 'County', 'State', 'Region',
       'Subregion', 'AlcoholConsumptionRate', 'VehicleRegistrations',
       'UrbanRuralClassification', 'MaximumSpeedLimit', 'Timezone',
       'Temperature(F)', 'Humidity(%)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'POI_Key',
       'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
       'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
       'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight'],
      dtype='object')

#### Date Dimension

##### Conversão dos registos Start_Time e End_Time para Datetime

In [34]:
# convert to timestamp datatype 
us_acidents['Start_Time'] = pd.to_datetime(us_acidents['Start_Time'])
us_acidents['End_Time'] = pd.to_datetime(us_acidents['End_Time'])

# rename columns
us_acidents = us_acidents.rename(columns={'Start_Time': 'Start_Datetime', 'End_Time': 'End_Datetime'})

##### Inserção de Chave

In [35]:
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'DateKey', range(1, len(us_acidents) + 1))

##### Inserção dos atributos StartDay, StartMonth e StartYear

In [36]:
# add starting date
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'StartDay', us_acidents['Start_Datetime'].map(lambda x: x.day))
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'StartMonth', us_acidents['Start_Datetime'].map(lambda x: x.month_name()))
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'StartYear', us_acidents['Start_Datetime'].map(lambda x: x.year))

##### Inserção dos atributos EndDay, EndMonth e EndYear

In [37]:
# add ending date
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'EndDay', us_acidents['End_Datetime'].map(lambda x: x.day)) 
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'EndMonth', us_acidents['End_Datetime'].map(lambda x: x.month_name())) 
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'EndYear', us_acidents['End_Datetime'].map(lambda x: x.year))

##### Inserção do atributo DayOfWeek

In [38]:
# add day of the week
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'DayOfWeek', us_acidents['Start_Datetime'].map(lambda x: x.day_name()))
us_acidents['DayOfWeek'].value_counts()

Tuesday      8781
Friday       8766
Wednesday    8766
Thursday     8467
Monday       8127
Saturday     2875
Sunday       2507
Name: DayOfWeek, dtype: int64

##### Inserção do atributo WeekdayWeekend

In [39]:
week_dict = {
    'Monday': 'Week Day',
    'Tuesday': 'Week Day',
    'Wednesday': 'Week Day',
    'Thursday': 'Week Day',
    'Friday': 'Week Day',
    'Saturday': 'Weekend',
    'Sunday': 'Weekend'
}

In [40]:
# add week day/weekend
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'WeekdayWeekend', us_acidents['DayOfWeek'].map(week_dict))
us_acidents[['Start_Datetime', 'DayOfWeek', 'WeekdayWeekend']].sample(5)

Unnamed: 0,Start_Datetime,DayOfWeek,WeekdayWeekend
14189,2019-08-05 20:43:55,Monday,Week Day
33460,2017-12-23 13:05:52,Saturday,Weekend
32822,2017-12-06 08:23:45,Wednesday,Week Day
36042,2017-09-14 17:19:34,Thursday,Week Day
43175,2019-11-01 21:29:00,Friday,Week Day


##### Inserção dos atributos Quarter (trimester) e WeekNumber

In [41]:
# add quarter and week number
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'Quarter', us_acidents['Start_Datetime'].map(lambda x: x.quarter))
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'WeekNumber', us_acidents['Start_Datetime'].map(lambda x: x.weekofyear))
us_acidents[['Start_Datetime','Quarter','WeekNumber']].sample(5)

Unnamed: 0,Start_Datetime,Quarter,WeekNumber
207,2017-01-13 18:19:44,1,2
36091,2017-09-15 12:51:11,3,37
17473,2019-03-06 08:48:24,1,10
41192,2017-05-17 17:24:32,2,20
17681,2019-03-13 17:22:20,1,11


##### Inserção do atributo Holiday

In [42]:
federal_holidays = pd.read_csv('us_federal_hol.csv', delimiter=';')
federal_holidays['date'] = pd.to_datetime(federal_holidays['date']).dt.date
federal_holidays.sample(5)

Unnamed: 0,date,holiday,exceptions
30,2018-11-11,Veterans Day,
12,2017-02-01,New Year's Day observed,
5,2016-05-09,Labor Day,
26,2018-05-28,Memorial Day,
23,2018-01-01,New Year's Day,


In [43]:
holidays_dict = pd.Series(federal_holidays['holiday'].values, index=federal_holidays['date']).to_dict()
holidays_dict

{datetime.date(2016, 1, 1): "New Year's Day",
 datetime.date(2016, 1, 18): '3tin Luther King Jr. Day',
 datetime.date(2016, 2, 15): "Presidents' Day",
 datetime.date(2016, 5, 30): 'Memorial Day',
 datetime.date(2016, 4, 7): 'Independence Day',
 datetime.date(2016, 5, 9): 'Labor Day',
 datetime.date(2016, 10, 10): 'Columbus Day',
 datetime.date(2016, 11, 11): 'Veterans Day',
 datetime.date(2016, 11, 24): 'Thanksgiving Day',
 datetime.date(2016, 12, 25): 'Christmas Day',
 datetime.date(2016, 12, 26): 'Christmas Day observed',
 datetime.date(2017, 1, 1): "New Year's Day",
 datetime.date(2017, 2, 1): "New Year's Day observed",
 datetime.date(2017, 1, 16): '3tin Luther King Jr. Day',
 datetime.date(2017, 2, 20): "Presidents' Day",
 datetime.date(2017, 5, 29): 'Memorial Day',
 datetime.date(2017, 4, 7): 'Independence Day',
 datetime.date(2017, 4, 9): 'Labor Day',
 datetime.date(2017, 9, 10): 'Columbus Day',
 datetime.date(2017, 10, 11): 'Veterans Day observed',
 datetime.date(2017, 11, 11): 

In [44]:
# add holiday
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'Holiday', us_acidents['Start_Datetime'].dt.date.map(holidays_dict))
us_acidents['Holiday'] = us_acidents['Holiday'].fillna('Not Holiday')
us_acidents['Holiday'].value_counts()

Not Holiday                 47100
Columbus Day                  176
Presidents' Day               129
Veterans Day                  126
3tin Luther King Jr. Day      115
Veterans Day observed         104
Labor Day                      95
Independence Day               90
Thanksgiving Day               82
Christmas Eve                  79
Memorial Day                   75
Christmas Day                  53
New Year's Day observed        36
New Year's Day                 26
Christmas Day observed          3
Name: Holiday, dtype: int64

##### Inserção do atributo WorkDay

In [45]:
# create a boolean mask by doing element-wise logical OR
work_days_mask = (us_acidents['Holiday'] != 'Not Holiday') | (us_acidents['WeekdayWeekend'] == 'Weekend')

# add work days
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'WorkDay', work_days_mask.map({False: 'Work Day', True: 'Not Work Day'}))
us_acidents['WorkDay'].value_counts()

Work Day        41843
Not Work Day     6446
Name: WorkDay, dtype: int64

In [46]:
us_acidents[us_acidents['WorkDay'] == 'Not Work Day'][['Start_Datetime', 'DayOfWeek', 'Holiday', 'WorkDay']].sample(5)

Unnamed: 0,Start_Datetime,DayOfWeek,Holiday,WorkDay
27078,2018-06-17 10:16:34,Sunday,Not Holiday,Not Work Day
13352,2019-07-06 14:28:09,Saturday,Not Holiday,Not Work Day
44177,2019-03-10 16:45:05,Sunday,Not Holiday,Not Work Day
28475,2018-05-27 14:35:00,Sunday,Not Holiday,Not Work Day
12993,2019-08-25 14:44:46,Sunday,Not Holiday,Not Work Day


##### Inserção do atributo SchoolBreak

In [47]:
school_holidays = pd.read_csv('school_holidays.csv', delimiter=';')
school_holidays['Start']= pd.to_datetime(school_holidays['Start'])
school_holidays['End']= pd.to_datetime(school_holidays['End'])

In [48]:
# initialize mask
classes_mask = pd.Series([False for _ in range(len(us_acidents))])

for i in range(len(school_holidays)):
    classes_mask |= (us_acidents['Start_Datetime'] >= school_holidays.loc[i, "Start"]) & (us_acidents['Start_Datetime'] <= school_holidays.loc[i, "End"])

# add school breaks
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'SchoolBreak', classes_mask.map(lambda x: 'No Classes' if x else 'Classes'))
us_acidents['SchoolBreak'].value_counts()

Classes       36532
No Classes    10090
Name: SchoolBreak, dtype: int64

##### Inserção do atributo MoonCalendar

In [49]:
# add moon calendar --->> Não concluído
us_acidents.insert(us_acidents.columns.get_loc('Start_Datetime'), 'MoonCalendar', range(1, len(us_acidents) + 1))

>> Falta moon calendar

In [50]:
# create date dataframe
date_dimension = us_acidents.loc[:, 'DateKey':'MoonCalendar']

date_dimension.columns

Index(['DateKey', 'StartDay', 'StartMonth', 'StartYear', 'EndDay', 'EndMonth',
       'EndYear', 'DayOfWeek', 'WeekdayWeekend', 'Quarter', 'WeekNumber',
       'Holiday', 'WorkDay', 'SchoolBreak', 'MoonCalendar'],
      dtype='object')

In [51]:
# drop unwanted data from the accidents dataframe
#us_acidents = us_acidents.drop(date_dimension.columns[1:], axis=1)

#us_acidents.columns

### Weather Dimension
* Definição de valores para dados meteorológicos nulos será feita com base na média de valores em cada atributo, ao nível da cidade, 
na respetiva semana em que ocorreu o acidente (tal será feito na criação da *dimension* Weather);

In [52]:
us_acidents[us_acidents["Region"].isna()]

Unnamed: 0,ID,TMC,Severity,DateKey,StartDay,StartMonth,StartYear,EndDay,EndMonth,EndYear,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
4003,A-233367,201.0,2,3940,29,December,2016,29,December,2016,...,False,False,False,False,False,False,Day,Day,Day,Day
4052,A-235363,201.0,3,3984,11,January,2017,11,January,2017,...,False,False,False,False,False,False,Day,Day,Day,Day
4126,A-239332,201.0,2,4040,1,November,2016,1,November,2016,...,False,False,False,False,True,False,Day,Day,Day,Day
4156,A-240858,201.0,3,4063,9,November,2016,9,November,2016,...,False,False,False,False,False,False,Day,Day,Day,Day
4495,A-259189,241.0,2,4365,15,July,2016,15,July,2016,...,False,False,False,False,False,False,Day,Day,Day,Day
6577,A-380661,201.0,3,6407,6,March,2017,6,March,2017,...,False,False,False,False,False,False,Day,Day,Day,Day
8377,A-482621,201.0,2,8168,26,June,2017,26,June,2017,...,False,False,False,False,False,False,Day,Day,Day,Day
8392,A-483466,201.0,2,8179,3,July,2017,3,July,2017,...,False,False,False,False,True,False,Day,Day,Day,Day
8774,A-506941,241.0,3,8555,21,July,2017,21,July,2017,...,False,False,False,False,False,False,Night,Night,Day,Day
9025,A-524285,201.0,2,8802,3,December,2019,3,December,2019,...,False,False,False,False,True,False,Night,Night,Night,Night


In [54]:
us_acidents[["Temperature(F)", "State", "WeekNumber", "StartYear"]].sort_values(by=['State', 'WeekNumber', 'StartYear'], ascending=True)

Unnamed: 0,Temperature(F),State,WeekNumber,StartYear
21240,73.4,Alabama,1,2018
21241,69.8,Alabama,1,2018
31628,24.6,Alabama,1,2018
19324,55.0,Alabama,1,2019
19325,52.0,Alabama,1,2019
...,...,...,...,...
49241,34.0,Wyoming,18,2019
38189,77.7,Wyoming,39,2016
45767,33.1,Wyoming,43,2018
34293,44.6,Wyoming,47,2017


#### Discretização dos valores contínuos referentes a meteorologia

In [55]:
def discretize_humidity(x):
    if x <= 25.0:
        return '(Low) rate <= 25.0'
    elif 25.0 < x <= 50.0:
        return '(Medium) 25.0 < rate <= 50.0'
    elif 50.0 < x <= 75.0:
        return '(High) 50.0 < rate <= 75.0'
    else:
        return '(Very High) rate > 75.0'

def discretize_temp(x):
    if x <= 32.0:
        return '(Cold) temp <= 32.0'
    elif 32.0 < x <= 50.0:
        return '(Low) 32.0 < temp <= 50.0'
    elif 50.0 < x <= 80.0:
        return '(Moderate) 50.0 < temp <= 80.0'
    elif 80.0 < x <= 100.0:
        return '(Warm) 80.0 < temp <= 100.0'
    else:
        return "(Hot) temp > 100.0"

def discretize_visibility(x):
    if x <= 1.0:
        return '(Low) visibility <= 1.0'
    elif 1.0 < x <= 2.0:
        return '(Reduced) 1.0 < visibility <= 2.0'
    elif 2.0 < x <= 5.0:
        return '(Moderate) 2.0 < visibility <= 5.0'
    elif 5.0 < x <= 20.0:
        return '(Good) 5.0 < visibility <= 20.0'
    elif 20.0 < x <= 50.0:
        return '(Very Good) 20.0 < visibility <= 50.0'
    else:
        return '(Excellent) visibility > 50.0'

#https://en.wikipedia.org/wiki/Beaufort_scale - converting m/s to mph
def discretize_windspeed(x):
    if x <= 1.12:
        return '(Calm) speed <= 1.0'
    elif 1.12 < x <= 3.36:
        return '(Light Air) 1.12 < speed <= 3.36'
    elif 3.36 < x <= 7.38:
        return '(Light Breeze) 3.36 < speed <= 7.38'
    elif 7.38 < x <= 12.30:
        return '(Gentle Breeze) 7.38 < speed <= 12.30'
    elif 12.30 < x <= 17.67:
        return '(Moderate Breeze) 12.30 < speed <= 17.67'
    elif 17.67 < x <= 23.93:
        return '(Fresh Breeze) 17.67 < speed <= 23.93'
    elif 23.93 < x <= 30.87:
        return '(Strong Breeze) 23.93 < speed <= 30.87'
    elif 30.87 < x <= 38.25:
        return '(High Wind)30.87 < speed <= 38.25'
    elif 38.25 < x <= 46.30:
        return '(Gale) 38.25 < speed <= 46.30'
    elif 46.30 < x <= 54.58:
        return '(Strong Gale) 46.30 < speed <= 54.58'
    elif 54.58 < x <= 63.53:
        return '(Storm) 54.58 < speed <= 63.53'
    elif 63.53 < x <= 72.92:
        return '(Violent Storm) 63.53 < speed <= 72.92' 
    else:
        return '(Hurricane) speed > 72.92'

# https://en.wikipedia.org/wiki/Rain#Intensity
def discretize_rainfall(x):
    if x <= 0.1:
        return '(Light) rate <= 0.1'
    elif 0.1 < x <= 0.39:
        return '(Moderate) 0.1 < rate <= 0.3'
    elif 0.39 < x <= 2.0:
        return '(Heavy) 0.39 < rate <= 2.0'
    else:
        return '(Violent) rate > 2.0'

us_acidents['Temperature(F)'] = us_acidents["Temperature(F)"].map(discretize_temp)
us_acidents['Humidity(%)'] = us_acidents['Humidity(%)'].map(discretize_humidity)
us_acidents['Visibility(mi)'] = us_acidents['Visibility(mi)'].map(discretize_visibility)
us_acidents['Wind_Speed(mph)'] = us_acidents['Wind_Speed(mph)'].map(discretize_windspeed)
us_acidents['Precipitation(in)'] = us_acidents['Precipitation(in)'].map(discretize_rainfall)

#### LocalTime Dimension


In [56]:
us_acidents['LocalTimeKey'] = pd.RangeIndex(start=1, stop=us_acidents.shape[0]+1)

In [57]:
us_acidents['LocalTimeKey']

0            1
1            2
2            3
3            4
4            5
         ...  
49995    48285
49996    48286
49997    48287
49998    48288
49999    48289
Name: LocalTimeKey, Length: 48289, dtype: int64

In [58]:
us_acidents['StartTime'] = pd.DatetimeIndex(us_acidents['Start_Datetime']).time
us_acidents['EndTime'] = pd.DatetimeIndex(us_acidents['End_Datetime']).time

In [59]:
def define_period_of_day(start_hour, civil_tw, naut_tw, ast_tw, sunrise_sunset) :
    if (civil_tw != naut_tw) or (naut_tw != ast_tw) :
        return "Dusk"
    elif dt.time(8, 0, 0) < start_hour < dt.time(10, 0, 0) :
        return 'Morning Rush Hour'
    elif dt.time(12, 30, 0) < start_hour < dt.time(14, 30, 0) :
        return 'Lunch Time'
    elif dt.time(16, 0, 0) < start_hour < dt.time(18, 0, 0) :
        return 'Evening Rush Hour'
    else:
        return sunrise_sunset

us_acidents['PeriodOfDay'] = us_acidents.apply(lambda x: define_period_of_day(x['StartTime'], x['Civil_Twilight'], x['Nautical_Twilight'], x['Astronomical_Twilight'], x['Sunrise_Sunset']), axis=1)

In [60]:
localtime_dimension = us_acidents[['LocalTimeKey', 'StartTime', 'PeriodOfDay', 'Timezone', 'EndTime']]

In [61]:
#us_acidents.drop(['PeriodOfDay', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'], axis=1, inplace=True)
us_acidents.columns

Index(['ID', 'TMC', 'Severity', 'DateKey', 'StartDay', 'StartMonth',
       'StartYear', 'EndDay', 'EndMonth', 'EndYear', 'DayOfWeek',
       'WeekdayWeekend', 'Quarter', 'WeekNumber', 'Holiday', 'WorkDay',
       'SchoolBreak', 'MoonCalendar', 'Start_Datetime', 'End_Datetime',
       'Distance(mi)', 'LocationKey', 'Street', 'City', 'County', 'State',
       'Region', 'Subregion', 'AlcoholConsumptionRate', 'VehicleRegistrations',
       'UrbanRuralClassification', 'MaximumSpeedLimit', 'Timezone',
       'Temperature(F)', 'Humidity(%)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'POI_Key',
       'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
       'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
       'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight',
       'Nautical_Twilight', 'Astronomical_Twilight', 'LocalTimeKey',
       'StartTime', 'EndTime', 'PeriodOfDay'],
      d

#### StandardTime Dimension

* StandardTimeKey
* StartHour (no timezone Standard)
* EndHour (no timezone Standard)
* StandardTimeZone (GMT?)

In [62]:
us_acidents["Timezone"].value_counts()

US/Eastern     20776
US/Pacific     13651
US/Central     11248
US/Mountain     2614
Name: Timezone, dtype: int64

In [63]:
# Creating the Standard Time Key
us_acidents['StandardTimeKey'] = pd.RangeIndex(start=1, stop=us_acidents.shape[0]+1)

In [None]:
import pytz

def convert_timezone(timestamp, old_timezone, new_timezone) :
    localized_timestamp = pytz.timezone(old_timezone).localize(timestamp)
    new_timezone_timestamp = localized_timestamp.astimezone(pytz.timezone(new_timezone))
    return dt.time(new_timezone_timestamp.hour, new_timezone_timestamp.minute, new_timezone_timestamp.second)

us_acidents['StandardStartTime'] = us_acidents.apply(lambda x: convert_timezone(x['Start_Datetime'], x['Timezone'], "Etc/GMT"), axis=1)
us_acidents['StandardEndTime'] = us_acidents.apply(lambda x: convert_timezone(x['End_Datetime'], x['Timezone'], "Etc/GMT"), axis=1)

In [None]:
us_acidents["Standard_Timezone"] = "GMT"
us_acidents[['StandardTimeKey','StartTime','EndTime','Timezone','StandardStartTime','StandardEndTime', 'Standard_Timezone']]
standardtime_dimension = us_acidents[['StandardTimeKey', 'StandardStartTime','StandardEndTime', 'Standard_Timezone']]
#us_acidents.drop(['Start_Datetime', 'End_Datetime', 'Timezone', 'StartTime', 'EndTime', 'StandardStartTime', 'StandardEndTime', 'Standard_Timezone'], axis=1, inplace=True)
us_acidents.columns

In [None]:
standardtime_dimension