### Star Schema

In [None]:
from PIL import Image
import requests
import matplotlib.pyplot as plt

# TODO: fix the star schema import (the diagram is downloaded from an external URL).
url = 'https://i.imgur.com/1prxQ4H.png'

# A timeout keeps the cell from hanging on a dead connection, and raise_for_status()
# reports an HTTP failure directly instead of letting PIL choke on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
img = Image.open(response.raw)

# Show the downloaded diagram inline.
plt.imshow(img)
plt.show()

### Importação dos datasets relevantes ao problema

Datasets a importar:
* US Accidents
* TMC's e eventos associados
* Calendário com fases da lua (\*)
* Limite máximo de velocidade por estado
* Dados de consumo de álcool por estado
* Quantidade de automóveis registados por estado

In [None]:
import pandas as pd
import re
from datetime import datetime

##### Importação do dataset US Accidents

In [None]:
# Load the US Accidents CSV and preview its first five rows.
us_acidents = pd.read_csv('US_Accidents_Dec19.csv')
us_acidents.head(5)

##### Importação do dataset relativo a TMCs / Criação da Dimensão TMC

In [None]:
# Load the TMC event codes and rename the columns to the names used by the TMC dimension.
event_code = pd.read_csv('event_code_tmc.csv', delimiter=';')
event_code = event_code.rename(columns={"Code": "TMC_Key", "Description": "Event"})

# Add an out-of-range key (-1) labelled 'unidentified'.
# DataFrame.append was removed in pandas 2.0, so the extra row is added with pd.concat.
unidentified_row = pd.DataFrame([{'TMC_Key': -1, 'Event': 'unidentified'}])
event_code = pd.concat([event_code, unidentified_row], ignore_index=True)
event_code[:5]

##### Importação do dataset sobre consumo de álcool

In [None]:
# Read the alcohol consumption rates by state.
alcohol_consumption = pd.read_csv('alcohol_consumption_state.csv', delimiter=';')

# Overwrite the state name stored in row 20.
# NOTE(review): presumably this repairs a malformed name in the source file - confirm against the CSV.
alcohol_consumption.at[20, "State"] = "Massachusetts"

# Discretize the numeric rate into four labelled bands.
consumption_rate = alcohol_consumption['rate']
rate_bands = [
    (consumption_rate <= 2.0, 'Low'),
    ((consumption_rate > 2.0) & (consumption_rate <= 2.5), 'Medium'),
    ((consumption_rate > 2.5) & (consumption_rate <= 3.5), 'High'),
    (consumption_rate > 3.5, 'Very High'),
]
for in_band, band_label in rate_bands:
    alcohol_consumption.loc[in_band, 'discretizeRate'] = band_label

# Highest consumption first; print the top five rows.
alcohol_consumption = alcohol_consumption.sort_values(by=['rate'], ascending=False)
print(alcohol_consumption.head(5))

# Dictionary: state name -> discretized alcohol consumption band.
alcohol_dict = alcohol_consumption.set_index('State')['discretizeRate'].to_dict()
alcohol_dict

##### Importação do dataset sobre registos automóveis

In [None]:
# Load vehicle registrations by state; the 'Dist. of Col.' row is excluded.
vehicle_registration = pd.read_csv('vehicle_registrations_usa.csv',delimiter=';')
vehicle_registration = vehicle_registration[vehicle_registration['State'] != 'Dist. of Col.'].reset_index(drop=True)

# Overwrite the state names stored in rows 4 and 6 (positions after the index reset above).
# NOTE(review): presumably these repair malformed names in the source file - confirm against the CSV.
vehicle_registration.at[4, "State"] = "California"
vehicle_registration.at[6, "State"] = "Connecticut"

# "Total" holds text containing spaces (presumably thousands separators - TODO confirm).
# Strip the spaces and convert the whole column at once, so it gets a real integer dtype
# instead of an object column filled with Python ints one row at a time.
vehicle_registration["Total"] = (
    vehicle_registration["Total"]
    .astype(str)
    .str.replace(' ', '', regex=False)
    .astype(int)
)

# Largest fleets first.
vehicle_registration = vehicle_registration.sort_values(by=['Total'], ascending=False)

# Discretize the number of registered vehicles into four labelled bands.
total_registered = vehicle_registration['Total']
registration_bands = [
    (total_registered <= 5000000, 'Low'),
    ((total_registered > 5000000) & (total_registered <= 10000000), 'Medium'),
    ((total_registered > 10000000) & (total_registered <= 20000000), 'High'),
    (total_registered > 20000000, 'Very High'),
]
for in_band, band_label in registration_bands:
    vehicle_registration.loc[in_band, 'discretizeTotal'] = band_label

print(vehicle_registration[:5])

# Dictionary: state name -> discretized vehicle registration band.
vehicle_dict = pd.Series(vehicle_registration['discretizeTotal'].values,index=vehicle_registration['State']).to_dict()
vehicle_dict

##### Importação do dataset sobre nível de urbanização

In [None]:
# Load the rural/urban classification of counties.
urban = pd.read_excel('NCHSURCodes2013.xlsx')

# Remove the columns that are not needed.
urban = urban.drop(columns=['State Abr.','CBSA title', 'CBSA 2012 pop', 'County 2012 pop'])

# Strip the " County" suffix from every county name in one vectorized step.
urban["County name"] = urban["County name"].str.replace(' County', '', regex=False)

# Human-readable label for each numeric 2013 classification code.
urban_term = {
    1: 'Large Central Metro',
    2: 'Large Fringe Metro',
    3: 'Medium Metro',
    4: 'Small Metro',
    5: 'Micropolitan',
    6: 'Non-Core',
}
urban["2013 code"] = urban["2013 code"].map(urban_term)

# Respell a few county names (exact, whole-value matches only).
county_respelling = {
    'DeSoto': 'De Soto',
    'St. Lucie': 'Saint Lucie',
    'DeKalb': 'Dekalb',
}
urban["County name"] = urban["County name"].replace(county_respelling)

# Dictionary: county name -> classification label.
# NOTE(review): the key is the county name alone, so if the same name occurs in more
# than one state only the last occurrence is kept in the dictionary.
urban_dict = urban.set_index('County name')['2013 code'].to_dict()
urban_dict

##### Importação do dataset sobre limites de velocidade

In [None]:
# Load the speed-limit table by state and discard the road classes that are not used.
speed_limits = pd.read_excel('speed_limit_state.xlsx')
speed_limits = speed_limits.drop(['Freeway (trucks)','Freeway (urban)','Divided (rural)','Undivided (rural)','Residential'], axis=1)

# The rural-freeway limit is used as the state's maximum speed limit.
speed_limits = speed_limits.rename(columns={"State or territory": "State", "Freeway (rural)":"Max Speed Limit (mph)"})

# List of all the US States
us_states_list = ["Alabama","Alaska","Arizona","Arkansas","California","Colorado",
  "Connecticut","Delaware","Florida","Georgia","Hawaii","Idaho","Illinois",
  "Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland",
  "Massachusetts","Michigan","Minnesota","Mississippi","Missouri","Montana",
  "Nebraska","Nevada","New Hampshire","New Jersey","New Mexico","New York",
  "North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania",
  "Rhode Island","South Carolina","South Dakota","Tennessee","Texas","Utah",
  "Vermont","Virginia","Washington","West Virginia","Wisconsin","Wyoming"]

# Clean both text columns in a single vectorized pass (missing values stay missing).
# State: remove every character that is not a letter or a space. Tabs are removed by
# this pattern as well, so no separate tab replacement is required.
speed_limits["State"] = speed_limits["State"].str.replace('[^a-z A-Z]+', '', regex=True)
# Speed limit: cut everything from the first "(" to the end of the line. The pattern is a
# raw string because '\(' in a normal string literal is an invalid escape sequence.
speed_limits["Max Speed Limit (mph)"] = speed_limits["Max Speed Limit (mph)"].str.replace(r'\(.*', '', regex=True)

# Keep only the rows whose cleaned name is a US state, then renumber the index.
# A single boolean filter replaces dropping rows one at a time inside a loop.
speed_limits = speed_limits[speed_limits["State"].isin(us_states_list)].reset_index(drop=True)

# Discard the last four characters, keep the three characters before them, and parse
# the result as an integer (int() tolerates surrounding whitespace).
speed_limits["Max Speed Limit (mph)"] = speed_limits['Max Speed Limit (mph)'].apply(lambda text: int(text[:-4][-3:]))
# Override the value in row 36 (position after the index reset above).
# NOTE(review): presumably the parsed value for that state is wrong - confirm against the spreadsheet.
speed_limits.at[36, "Max Speed Limit (mph)"] = 70

# Highest limits first.
speed_limits = speed_limits.sort_values(by=['Max Speed Limit (mph)'], ascending=False)

# Discretize the limit into three labelled bands.
max_limit = speed_limits['Max Speed Limit (mph)']
limit_bands = [
    (max_limit < 70, 'Low'),
    ((max_limit >= 70) & (max_limit < 80), 'Medium'),
    (max_limit >= 80, 'High'),
]
for in_band, band_label in limit_bands:
    speed_limits.loc[in_band, 'DiscretizeSpeedLimit'] = band_label

# Dictionary: state name -> discretized maximum speed limit band.
speed_dict = pd.Series(speed_limits['DiscretizeSpeedLimit'].values,index=speed_limits['State']).to_dict()
speed_dict

### Tratamento do dataset original

#### Tratamento de dados nulos/incompletos

* As linhas que têm valores relativos ao twilight e período do dia vazios serão removidas; 
* As linhas que não têm informação relativa à cidade na qual o acidente ocorre também serão removidas;
* Os códigos TMC (Traffic Message Channel) que sejam nulos serão identificados com um número fora do domínio dos códigos existentes e não vão ser relevantes para possíveis interrogações;
* Definição de valores para dados meteorológicos nulos será feita com base na média de valores em cada atributo, ao nível da cidade, na respetiva semana em que ocorreu o acidente (\*);
* O preenchimento de dados que não contêm fuso horário será feito através da correspondência entre o estado onde o acidente ocorreu e o fuso horário no qual se encontra.

#### Remoção de colunas
* 'Source',
* 'Start_Lat',
* 'Country',
* 'Start_Lng',
* 'End_Lat',
* 'End_Lng',
* 'Description',
* 'Number',
* 'Side',
* 'Zipcode',
* 'Airport_Code',
* 'Weather_Timestamp',
* 'Pressure(in)', 
* 'Wind_Chill(F)'
 

In [None]:
# Show the column names of the accidents dataframe.
us_acidents.columns

In [None]:
# Remove the columns that carry no useful information for the rest of the notebook.
unused_columns = [
    'Source', 'Start_Lat', 'Country', 'Start_Lng', 'End_Lat', 'End_Lng',
    'Description', 'Number', 'Side', 'Zipcode', 'Airport_Code',
    'Weather_Timestamp', 'Pressure(in)', 'Wind_Chill(F)',
]
us_acidents = us_acidents.drop(columns=unused_columns)

In [None]:
# Discard rows with a missing city or a missing value in any of the three twilight
# columns listed below, then renumber the index.
# NOTE(review): the accompanying text also mentions the period of the day, but no such
# column is part of this subset - confirm whether it should be.
required_columns = ['City', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
us_acidents = us_acidents.dropna(subset=required_columns)
us_acidents = us_acidents.reset_index(drop=True)

In [None]:
# Replace missing TMC values with the out-of-range value -1, which is not used in OLAP.
# The column is reassigned instead of calling fillna(inplace=True) on the column
# selection: that chained call warns in recent pandas and, with Copy-on-Write enabled,
# no longer updates us_acidents at all.
us_acidents['TMC'] = us_acidents['TMC'].fillna(-1)

In [None]:
# Replace empty timezone values with the timezone observed for the same state.
# Rows without a timezone are removed before one representative row per state is kept;
# otherwise a state whose first row has no timezone would map to NaN and stay unfilled.
# NOTE(review): a state is mapped to the first timezone seen for it, even if the state
# spans more than one timezone.
timezone_df = us_acidents[["State", "Timezone"]].dropna(subset=["Timezone"]).drop_duplicates(subset="State")
timezone_dict = pd.Series(timezone_df['Timezone'].values,index=timezone_df['State']).to_dict()

# Reassign the column rather than using fillna(inplace=True) on the column selection,
# which does not update us_acidents when pandas Copy-on-Write is enabled.
us_acidents['Timezone'] = us_acidents['Timezone'].fillna(us_acidents['State'].map(timezone_dict))

#### POI Dimension

In [None]:
# Give every accident row a POI key (1..N), placed immediately before "Amenity".
poi_key_position = us_acidents.columns.get_loc("Amenity")
poi_keys = range(1, 1 + len(us_acidents))
us_acidents.insert(poi_key_position, 'POI_Key', poi_keys)
us_acidents['POI_Key'].unique()

In [None]:
# Create the POI dataset from the columns 'POI_Key' through 'Turning_Loop'
# (a label slice, so both end columns are included).
# .copy() makes the dimension an independent frame, so later writes to it cannot
# alias us_acidents or raise SettingWithCopyWarning.
poi_dimension = us_acidents.loc[:,'POI_Key':'Turning_Loop'].copy()
poi_dimension.columns

In [None]:
# Preview the first five rows of the POI dimension.
poi_dimension.head(5)

In [None]:
# Remove the POI attribute columns from us_acidents; POI_Key stays behind as the link.
poi_attribute_columns = [
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
    'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
]
us_acidents = us_acidents.drop(columns=poi_attribute_columns)

In [None]:
# Current columns of the accidents dataset, which now carries POI_Key in place of
# the individual POI attribute columns.
us_acidents.columns

#### Location Dimension

##### Inserção de Chave

In [None]:
# Add the location key (1..N), placed immediately before the "Street" column.
location_key_position = us_acidents.columns.get_loc("Street")
us_acidents.insert(location_key_position, 'LocationKey', range(1, 1 + len(us_acidents)))

##### Definição dos atributos de região

In [None]:
# List the distinct values found in the State column.
us_acidents['State'].unique()

In [None]:
# Remove these rows??
# Much of the data we have does not include DC - it is not a state.
is_dc = us_acidents["State"] == 'DC'
print(us_acidents.loc[is_dc, "State"])
us_acidents = us_acidents[~is_dc]

In [None]:
# Two-letter state abbreviation -> full state name.
replace_values = {
    'OH': 'Ohio', 'WV': 'West Virginia', 'CA': 'California', 'FL': 'Florida',
    'GA': 'Georgia', 'SC': 'South Carolina', 'NE': 'Nebraska', 'IA': 'Iowa',
    'IL': 'Illinois', 'MO': 'Missouri', 'WI': 'Wisconsin',
    'IN': 'Indiana', 'MI': 'Michigan', 'NJ': 'New Jersey', 'NY': 'New York',
    'CT': 'Connecticut', 'MA': 'Massachusetts', 'RI': 'Rhode Island', 'NH': 'New Hampshire',
    'PA': 'Pennsylvania', 'KY': 'Kentucky', 'MD': 'Maryland',
    'VA': 'Virginia', 'DE': 'Delaware', 'TX': 'Texas', 'WA': 'Washington',
    'OR': 'Oregon', 'AL': 'Alabama', 'NC': 'North Carolina', 'MN': 'Minnesota',
    'OK': 'Oklahoma', 'LA': 'Louisiana',
    'TN': 'Tennessee', 'UT': 'Utah', 'CO': 'Colorado', 'AZ': 'Arizona',
    'NV': 'Nevada', 'KS': 'Kansas', 'MS': 'Mississippi', 'NM': 'New Mexico',
    'ME': 'Maine', 'AR': 'Arkansas', 'WY': 'Wyoming', 'VT': 'Vermont',
    'ID': 'Idaho', 'ND': 'North Dakota', 'MT': 'Montana', 'SD': 'South Dakota',
}

In [None]:
# Swap the state abbreviations in the "State" column for full state names; values
# without an entry in replace_values are left as they are.
full_state_names = us_acidents["State"].replace(replace_values)
us_acidents = us_acidents.assign(State=full_state_names)

In [None]:
# https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States#Census_Bureau-designated_regions_and_divisions
## define subregion
# Census Bureau divisions and the states that belong to each of them.
# 'District of Columbia' is listed under South Atlantic so that sub_region() agrees
# with region(), which places it in the South.
_SUBREGION_STATES = {
    'New England': ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont'],
    'Mid Atlantic': ['New Jersey', 'New York', 'Pennsylvania'],
    'East North Central': ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin'],
    'West North Central': ['Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'],
    'South Atlantic': ['Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Maryland',
                       'North Carolina', 'South Carolina', 'Virginia', 'West Virginia'],
    'East South Central': ['Alabama', 'Kentucky', 'Mississippi', 'Tennessee'],
    'West South Central': ['Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
    'Mountain': ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming'],
    'Pacific': ['Alaska', 'California', 'Hawaii', 'Oregon', 'Washington'],
}

# Reverse lookup (state name -> division name), built once instead of rebuilding the
# membership lists on every call.
_SUBREGION_BY_STATE = {
    state_name: division
    for division, state_names in _SUBREGION_STATES.items()
    for state_name in state_names
}


def sub_region(state):
    """Return the Census Bureau division (subregion) of a state.

    Parameters:
        state: full state name, e.g. 'Ohio'.

    Returns:
        The division name (e.g. 'East North Central'), or the sentinel string
        'algo de errado' when the name is not a known state.
    """
    return _SUBREGION_BY_STATE.get(state, 'algo de errado')

#define region
def region(state):
    """Return the Census Bureau region of a state.

    Parameters:
        state: full state name, e.g. 'Ohio'.

    Returns:
        'Northeast', 'Midwest', 'South' or 'West', or the sentinel string
        'algo de errado' when the name is in none of the four regions.
    """
    states_by_region = (
        ('Northeast', ('Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island',
                       'Vermont', 'New Jersey', 'New York', 'Pennsylvania')),
        ('Midwest', ('Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas',
                     'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota')),
        ('South', ('Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina',
                   'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky',
                   'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas')),
        ('West', ('Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah',
                  'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington')),
    )
    # The first region whose member list contains the state wins.
    for region_name, member_states in states_by_region:
        if state in member_states:
            return region_name
    return 'algo de errado'

In [None]:
# Add Region, then Subregion. Each one is inserted at the current position of
# "Timezone", so the final column order is Region, Subregion, Timezone.
for column_name, classify in (('Region', region), ('Subregion', sub_region)):
    timezone_position = us_acidents.columns.get_loc("Timezone")
    us_acidents.insert(timezone_position, column_name, us_acidents['State'].apply(classify))

##### Inserção de atributo AlcoholConsumptionRate

In [None]:
# Look up each accident's state in alcohol_dict and store the result as a new column;
# states without an entry get NaN.
alcohol_band_by_row = us_acidents["State"].map(alcohol_dict)
us_acidents["AlcoholConsumptionRate"] = alcohol_band_by_row

In [None]:
# Count how many accidents fall into each alcohol consumption band.
us_acidents['AlcoholConsumptionRate'].value_counts()

#### Inserção do atributo NumberVehicleRegistrations

In [None]:
# Look up each accident's state in vehicle_dict and store the result as a new column;
# states without an entry get NaN.
registration_band_by_row = us_acidents["State"].map(vehicle_dict)
us_acidents["VehicleRegistrations"] = registration_band_by_row

# Number of accidents per vehicle registration band.
us_acidents['VehicleRegistrations'].value_counts()

#### Inserção do atributo UrbanRuralClassification

In [None]:
# Look up each accident's county in urban_dict to get its urban/rural class.
# NOTE(review): the lookup uses the county name alone, so same-named counties in
# different states cannot be told apart here - confirm this is acceptable.
us_acidents["CountyUrbanRuralClass"] = us_acidents["County"].map(urban_dict)

# Rows whose county found no match are discarded (~81000 rows; they come from errors
# in County that could only be fixed by manual correction).
us_acidents = us_acidents.dropna(subset=["CountyUrbanRuralClass"])

# Number of accidents per urban/rural class.
us_acidents['CountyUrbanRuralClass'].value_counts()

#### Inserção do atributo MaximumSpeedLimit

In [None]:
# Look up each accident's state in speed_dict (the discretized maximum speed limit).
# vehicle_dict must not be used here: it holds the vehicle registration bands and
# would simply duplicate the VehicleRegistrations column.
us_acidents["StateMaxSpeedLimit"] = us_acidents["State"].map(speed_dict)

# Number of accidents per speed limit band.
us_acidents['StateMaxSpeedLimit'].value_counts()

TBD:
* MoonCalendar


#### Date Dimension

In [None]:
# Convert the start and end time columns to the pandas timestamp data type.
for time_column in ("Start_Time", "End_Time"):
    us_acidents[time_column] = pd.to_datetime(us_acidents[time_column])

In [None]:
# Add the date key (1..N), placed immediately before the "Start_Time" column.
date_key_position = us_acidents.columns.get_loc("Start_Time")
us_acidents.insert(date_key_position, 'date_Key', range(1, 1 + len(us_acidents)))

In [None]:
# Split Start_Time into day, month name and year using the vectorized .dt accessor
# rather than a Python lambda per row. Each column is inserted at the current position
# of "End_Time", so the final order is StartDay, StartMonth, StartYear, End_Time.
start_time = us_acidents['Start_Time']
start_date_parts = [
    ('StartDay', start_time.dt.day),            # day of the month
    ('StartMonth', start_time.dt.month_name()), # month name as text
    ('StartYear', start_time.dt.year),          # calendar year
]
for column_name, values in start_date_parts:
    us_acidents.insert(us_acidents.columns.get_loc("End_Time"), column_name, values)

In [None]:
# Split End_Time into day, month name and year using the vectorized .dt accessor
# rather than a Python lambda per row. Each column is inserted at the current position
# of "Distance(mi)", so the final order is EndDay, EndMonth, EndYear, Distance(mi).
end_time = us_acidents['End_Time']
end_date_parts = [
    ('EndDay', end_time.dt.day),            # day of the month
    ('EndMonth', end_time.dt.month_name()), # month name as text
    ('EndYear', end_time.dt.year),          # calendar year
]
for column_name, values in end_date_parts:
    us_acidents.insert(us_acidents.columns.get_loc("Distance(mi)"), column_name, values)

In [None]:
# Remaining date attributes, all derived from End_Time. Each column is inserted at the
# current position of "Distance(mi)", so they end up in the order listed here.
end_time = us_acidents['End_Time']
placeholder_values = range(1, 1 + len(us_acidents))  # running number used for unfinished attributes
date_attributes = [
    ('WorkDay', end_time.apply(lambda ts: ts.day_name())),     # TODO: partially done - currently just the weekday name
    ('DayofWeek', end_time.apply(lambda ts: ts.day_name())),   # weekday name
    ('Holiday', placeholder_values),                           # TODO: not done yet - placeholder values
    ('SchoolBreak', placeholder_values),                       # TODO: not done yet - placeholder values
    ('Quarter', end_time.apply(lambda ts: ts.quarter)),        # quarter of the year
    ('WeekNumber', end_time.apply(lambda ts: ts.weekofyear)),  # week of the year
    ('MoonCalendar', placeholder_values),                      # TODO: not done yet - placeholder values
]
for column_name, values in date_attributes:
    us_acidents.insert(us_acidents.columns.get_loc("Distance(mi)"), column_name, values)