In [681]:
import datetime as dt
import os
import warnings
from datetime import date, timedelta
from datetime import datetime
from urllib.parse import urlencode

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests # Library to access URLs using Python

# Weather API

## Get forecast weather data

### Get data

In [682]:
def build_query_string(base_url, params_dict):
    """Return ``base_url`` with ``params_dict`` appended as a URL query string.

    Keys and values are percent-encoded, so values containing non-ASCII
    characters, spaces, ``&`` or ``=`` (e.g. the city name "Köln") cannot
    corrupt the query.

    Parameters
    ----------
    base_url : str
        URL without a query part.
    params_dict : dict
        Mapping of parameter names to values; values are converted with ``str``.

    Returns
    -------
    str
        ``base_url?key1=value1&key2=value2...``, or ``base_url`` unchanged when
        there are no parameters.
    """
    if not params_dict:
        # Nothing to append; avoid producing a dangling "?"
        return base_url
    return f"{base_url}?{urlencode(params_dict)}"

In [683]:
# Read the OpenWeatherMap API key from the environment so the secret is never
# stored in the notebook or its version history. Set it before starting Jupyter,
# e.g. `export OPENWEATHER_API_KEY=...`.
# NOTE(review): a key was previously hard-coded in this cell; it should be
# treated as leaked and revoked.
api_key = os.environ.get("OPENWEATHER_API_KEY", "")
if not api_key:
    warnings.warn(
        "OPENWEATHER_API_KEY is not set; the OpenWeatherMap requests below will be rejected.",
        stacklevel=2,
    )

In [684]:
# Step 1: find the latitude and longitude of the city we want a forecast for.

# Base URL of the geocoding API. HTTPS is used so the API key in the query
# string is not sent in clear text.
geo_url = "https://api.openweathermap.org/geo/1.0/direct"

# Query parameters
# (see the documentation: https://openweathermap.org/api/geocoding-api#direct)
geo_params = {
    "q": "Köln",
    "appid": api_key
}

# Create the API query URL
query = build_query_string(geo_url, geo_params)

# Call the API; fail loudly on network problems or HTTP errors (e.g. an invalid
# key) instead of continuing with an error payload
geo_response = requests.get(query, timeout=30)
geo_response.raise_for_status()
geo_data = geo_response.json()

# Later cells read geo_data[0], so an empty match list has to be an error here
if not geo_data:
    raise ValueError(f"Geocoding API returned no match for {geo_params['q']!r}")

In [685]:
def date_to_unix(date_string, date_format="%Y-%m-%d"):
    """Convert a date string to Unix time (whole seconds).

    ``date_string`` is parsed with ``date_format`` (default ``YYYY-MM-DD``).
    The parsed datetime carries no timezone, so Python interprets it in the
    local timezone of the machine running the notebook.
    """
    parsed = dt.datetime.strptime(date_string, date_format)
    return int(parsed.timestamp())

# It will also be useful to transf

In [686]:
# Base URL of the forecast API (single slash before "forecast")
forcast_url = "https://api.openweathermap.org/data/2.5/forecast"

# Query parameters for the forecast API.
# NOTE: the name `airpol_params` is historical; these are forecast parameters.
airpol_params = {
    "lat": geo_data[0]["lat"], # The latitude of our chosen city
    "lon": geo_data[0]["lon"], # The longitude of our chosen city
    "appid": api_key, # Our API key
    "units": "metric" # Request metric units
}

# Call the API; fail loudly on network problems or HTTP errors instead of
# continuing with an error payload that has no "list" entry
forecast_response = requests.get(build_query_string(forcast_url, airpol_params), timeout=30)
forecast_response.raise_for_status()
weather = forecast_response.json()

In [687]:
# Build two record lists from the forecast, one entry per forecast time step:
#   observations_temp    - the "main" measurements (temperature etc.)
#   observations_weather - the first "weather" entry (id, main, description, icon)
# Each record is a *copy* extended by the step's timestamp ("dt") under the key
# "date", so the API response in `weather` itself is left unmodified.
observations_temp = []
observations_weather = []

for datapoint in weather["list"]:
    observations_temp.append({**datapoint["main"], "date": datapoint["dt"]})
    observations_weather.append({**datapoint["weather"][0], "date": datapoint["dt"]})

In [688]:
# Turn each record list into its own DataFrame; both carry a "date" column
# that the next step uses as the merge key
df_t, df_w = (
    pd.DataFrame.from_records(records)
    for records in (observations_temp, observations_weather)
)


In [689]:
# Join the measurement and the weather-description records on their shared
# Unix timestamp, then convert that timestamp to a datetime
df = pd.merge(df_t, df_w, how="right", on="date")
df["date"] = pd.to_datetime(df["date"], unit="s")

# The forecast has one row per 3-hour slot. To get hourly rows, repeat every
# row three times in place (0,0,0,1,1,1,...). This works for any number of
# forecast rows, not only for exactly 40.
HOURS_PER_SLOT = 3
df_repeated = df.loc[df.index.repeat(HOURS_PER_SLOT)].reset_index(drop=True)

# Interpolated values are fractional, so make sure the column can hold them
df_repeated["temp"] = df_repeated["temp"].astype(float)

# Linearly interpolate the temperature between consecutive slots. The step is
# rounded to 2 decimals. The last slot has no successor and keeps its value.
for slot in range(len(df) - 1):
    start_index = slot * HOURS_PER_SLOT
    start = df_repeated.loc[start_index, "temp"]
    end = df_repeated.loc[start_index + HOURS_PER_SLOT, "temp"]
    steps = round((end - start) / HOURS_PER_SLOT, 2)
    for offset in range(1, HOURS_PER_SLOT):
        df_repeated.loc[start_index + offset, "temp"] = start + offset * steps

# Keep only the columns used later, selected by name instead of by position so
# additional fields in the API response cannot shift the selection
df_repeated = df_repeated[["temp", "date", "id", "main", "description", "icon"]].copy()
df_repeated


Unnamed: 0,temp,date,id,main,description,icon
0,6.58,2022-11-30 18:00:00,804,Clouds,overcast clouds,04n
1,6.58,2022-11-30 18:00:00,804,Clouds,overcast clouds,04n
2,6.58,2022-11-30 18:00:00,804,Clouds,overcast clouds,04n
3,6.59,2022-11-30 21:00:00,804,Clouds,overcast clouds,04n
4,6.33,2022-11-30 21:00:00,804,Clouds,overcast clouds,04n
...,...,...,...,...,...,...
115,3.19,2022-12-05 12:00:00,804,Clouds,overcast clouds,04d
116,2.87,2022-12-05 12:00:00,804,Clouds,overcast clouds,04d
117,2.55,2022-12-05 15:00:00,803,Clouds,broken clouds,04d
118,2.55,2022-12-05 15:00:00,803,Clouds,broken clouds,04d


### Preparing hourly data

In [690]:
# Rows come in consecutive triples that share one forecast timestamp: shift the
# second and third row of every triple by +1 h and +2 h respectively
hour_offsets = pd.to_timedelta(df_repeated.index % 3, unit="h")
df_repeated["date"] = df_repeated["date"] + hour_offsets

# Split the timestamp text at its first space into a date part and a time
# part; the date string is the key for the holiday merges further down
date_and_time = df_repeated["date"].astype(str).str.split(" ", n=1, expand=True)
df_repeated["date"] = date_and_time[0]
df_repeated["time"] = date_and_time[1]
df_repeated

Unnamed: 0,temp,date,id,main,description,icon
0,6.58,2022-11-30 18:00:00,804,Clouds,overcast clouds,04n
1,6.58,2022-11-30 19:00:00,804,Clouds,overcast clouds,04n
2,6.58,2022-11-30 20:00:00,804,Clouds,overcast clouds,04n
3,6.59,2022-11-30 21:00:00,804,Clouds,overcast clouds,04n
4,6.33,2022-11-30 22:00:00,804,Clouds,overcast clouds,04n
...,...,...,...,...,...,...
115,3.19,2022-12-05 13:00:00,804,Clouds,overcast clouds,04d
116,2.87,2022-12-05 14:00:00,804,Clouds,overcast clouds,04d
117,2.55,2022-12-05 15:00:00,803,Clouds,broken clouds,04d
118,2.55,2022-12-05 16:00:00,803,Clouds,broken clouds,04d


### School Holiday

In [692]:
#Import datapackages
from datetime import date, timedelta
from datetime import datetime

# Load the upcoming school-holiday periods (one row per period)
future_school_holi = pd.read_csv("future_school_holiday.csv", sep = ";")

# First and last day of every period, parsed strictly as YYYY-MM-DD
first_days = pd.to_datetime(future_school_holi["Erster Ferientag"], format="%Y-%m-%d")
last_days = pd.to_datetime(future_school_holi["Letzter Ferientag"], format="%Y-%m-%d")

# Expand every period into one row per calendar day (both ends inclusive)
days_df_list = [
    pd.DataFrame({"date": pd.date_range(first_day, last_day, freq="D")})
    for first_day, last_day in zip(first_days, last_days)
]

# Combine all periods; an empty holiday file yields an empty table, not a crash
if days_df_list:
    new_holi = pd.concat(days_df_list, ignore_index=True)
else:
    new_holi = pd.DataFrame({"date": pd.to_datetime([])})

# Overlapping periods would list a day twice, and every duplicate would
# duplicate forecast rows in the later left merge, so keep each day once
new_holi = new_holi.drop_duplicates(subset="date").reset_index(drop=True)
new_holi["school holiday"] = 1

# The merge key in the forecast frame is a "YYYY-MM-DD" string
new_holi["date"] = new_holi["date"].dt.strftime("%Y-%m-%d")





In [693]:
# Left-join the school-holiday flag onto the hourly forecast; days without a
# match are not school holidays, so their flag becomes 0 (stored as int64)
df_merge1 = df_repeated.merge(new_holi, how="left", on="date")
df_merge1["school holiday"] = df_merge1["school holiday"].fillna(0).astype("int64")
df_merge1

### National Holiday

In [695]:
from datetime import date, timedelta

def easter_sunday(year):
    """Return the date of Easter Sunday in ``year``.

    Uses an arithmetic Easter formula (a variant of Gauss's algorithm): the
    intermediate values combine to Easter Sunday expressed as a "day of March",
    where 32 means 1 April.
    """
    k = year // 100
    m = 15 + (3 * k + 3) // 4 - (8 * k + 13) // 25
    s = 2 - (3 * k + 3) // 4
    a = year % 19

    d = (19 * a + m) % 30
    r = (d + a // 11) // 29
    og = 21 + d - r
    sz = 7 - (year + year // 4 + s) % 7
    oe = 7 - (og - sz) % 7

    # og + oe is the day of March; 1 March already is day 1, hence the "- 1"
    return date(year, 3, 1) + timedelta(days=og + oe - 1)


def nrw_public_holidays(year):
    """Return the public holidays used here for NRW in ``year`` as "YYYY-MM-DD" strings.

    Strings are returned because the forecast frame stores its dates as
    strings; ``datetime.date`` objects would never match them in a merge.
    """
    easter = easter_sunday(year)
    holidays = [
        date(year, 1, 1),                # New Year
        easter - timedelta(days=2),      # Good Friday
        easter,                          # Easter Sunday
        easter + timedelta(days=1),      # Easter Monday
        date(year, 5, 1),                # Labour Day
        easter + timedelta(days=39),     # Ascension Day
        easter + timedelta(days=50),     # Whit Monday
        easter + timedelta(days=60),     # Corpus Christi
        date(year, 10, 3),               # German Unity Day
        date(year, 11, 1),               # All Saints' Day
        date(year, 12, 25),              # Christmas Day
        date(year, 12, 26),              # Second Christmas Day (December, not November)
    ]
    return [day.isoformat() for day in holidays]


# The forecast can run across New Year, so cover this year and the next
today = dt.date.today()
year = today.year
this_plus_next_year = [year, year + 1]

# One frame per year with the holiday dates and the flag 1
future_holidays_list = [
    pd.DataFrame({"date": nrw_public_holidays(holiday_year), "holiday": 1})
    for holiday_year in this_plus_next_year
]

# Combine both years. Two holidays can fall on the same day (e.g. Ascension Day
# on 1 May); keep each date once so the later merge does not duplicate rows.
df_future_holidays = (
    pd.concat(future_holidays_list)
    .drop_duplicates(subset="date")
    .reset_index(drop=True)
)

In [696]:
# The forecast frame stores "date" as a "YYYY-MM-DD" string. Convert the
# holiday dates to the same text form before merging, because date objects
# never compare equal to strings and every row would end up as "no holiday".
# Duplicate dates are removed so the left merge cannot multiply forecast rows.
holiday_lookup = (
    df_future_holidays
    .assign(date=df_future_holidays["date"].astype(str))
    .drop_duplicates(subset="date")
)

# Left-join the public-holiday flag onto the already merged frame; rows
# without a match are not holidays, so their flag becomes 0 (stored as int64)
df_merge2 = pd.merge(df_merge1, holiday_lookup, how="left", on="date")
df_merge2["holiday"] = df_merge2["holiday"].fillna(0).astype("int64")

df_merge2

### Weather

In [698]:
# Mapping between the two weather vocabularies:
#   key   = condition label used in the pedestrian data set
#   value = OpenWeatherMap "main" category, used as the dummy-column name
# A dict holds one value per key, so the former first entry
# 'rain' -> "Thunderstorm" was silently overwritten by 'rain' -> "Rain" and
# has been removed; thunderstorms are folded into "Rain" below instead.
# TODO(authors): open question from the original note - wind was equated with
# drizzle, and thunderstorm with rain; confirm that this is intended.
dict_weather = {'rain':"Rain",
                'wind':"Drizzle",
                "snow":"Snow",
                "fog":'Atmosphere',
                "clear-night":"Clear",
                "clear-day":"Clear",
                "cloudy":"Clouds",
                "partly-cloudy-night":"Clouds",
                "partly-cloudy-day":"Clouds",
                "nan":"nan"}

# Forecast "main" values that have no dummy column of their own.
# "Atmosphere" is the name of the 7xx condition group in the OpenWeatherMap
# code table; the "main" field itself carries the specific phenomenon.
forecast_aliases = {"Thunderstorm": "Rain"}
forecast_aliases.update({
    phenomenon: "Atmosphere"
    for phenomenon in ("Mist", "Smoke", "Haze", "Dust", "Fog", "Sand", "Ash", "Squall", "Tornado")
})

# Condition of every forecast row expressed in dummy-column names
forecast_condition = df_merge2["main"].replace(forecast_aliases)

# One 0/1 column per category (first-seen order, duplicates removed); exactly
# the matching column is 1. No per-row loop, and the API response stored in
# `weather` is no longer overwritten.
for weather_condition in dict.fromkeys(dict_weather.values()):
    df_merge2[weather_condition] = (forecast_condition == weather_condition).astype("int64")

df_merge2


Unnamed: 0,temp,date,id,main,description,icon,time,school holiday,holiday,Rain,Drizzle,Snow,Atmosphere,Clear,Clouds,nan
0,6.58,2022-11-30,804,Clouds,overcast clouds,04n,18:00:00,0,0,0,0,0,0,0,0,0
1,6.58,2022-11-30,804,Clouds,overcast clouds,04n,19:00:00,0,0,0,0,0,0,0,0,0
2,6.58,2022-11-30,804,Clouds,overcast clouds,04n,20:00:00,0,0,0,0,0,0,0,0,0
3,6.59,2022-11-30,804,Clouds,overcast clouds,04n,21:00:00,0,0,0,0,0,0,0,0,0
4,6.33,2022-11-30,804,Clouds,overcast clouds,04n,22:00:00,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,3.19,2022-12-05,804,Clouds,overcast clouds,04d,13:00:00,0,0,0,0,0,0,0,0,0
116,2.87,2022-12-05,804,Clouds,overcast clouds,04d,14:00:00,0,0,0,0,0,0,0,0,0
117,2.55,2022-12-05,803,Clouds,broken clouds,04d,15:00:00,0,0,0,0,0,0,0,0,0
118,2.55,2022-12-05,803,Clouds,broken clouds,04d,16:00:00,0,0,0,0,0,0,0,0,0


### Months

In [700]:
# Two-character month number taken from positions 5-6 of the date string
df_merge2["months"] = df_merge2["date"].str[5:7]

# Two-character hour taken from the start of the time string
df_merge2["hour"] = df_merge2["time"].str[0:2]

# Names of the month dummy columns, in calendar order
Months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]

# One 0/1 column per month: 1 where the row's zero-padded month number matches
for month_number, month in enumerate(Months, start=1):
    df_merge2[month] = (df_merge2["months"] == f"{month_number:02d}").astype("int64")
df_merge2

### Weekdays

In [703]:
# Names of the weekday dummy columns (also reused for the pedestrian data)
Weekdays = ["Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"]

# English weekday name of every row. dt.day_name() returns English names by
# default, whereas strftime('%A') depends on the process locale and would
# match none of the columns under a non-English locale.
weekday_names = pd.to_datetime(df_merge2["date"], format="%Y-%m-%d").dt.day_name()

# One 0/1 column per weekday: 1 where the row's date falls on that weekday
for weekday in Weekdays:
    df_merge2[weekday] = (weekday_names == weekday).astype("int64")


df_merge2

Unnamed: 0,temp,date,id,main,description,icon,time,school holiday,holiday,Rain,...,October,November,December,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,6.58,2022-11-30,804,Clouds,overcast clouds,04n,18:00:00,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,6.58,2022-11-30,804,Clouds,overcast clouds,04n,19:00:00,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,6.58,2022-11-30,804,Clouds,overcast clouds,04n,20:00:00,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,6.59,2022-11-30,804,Clouds,overcast clouds,04n,21:00:00,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,6.33,2022-11-30,804,Clouds,overcast clouds,04n,22:00:00,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,3.19,2022-12-05,804,Clouds,overcast clouds,04d,13:00:00,0,0,0,...,0,0,1,0,0,0,0,0,0,0
116,2.87,2022-12-05,804,Clouds,overcast clouds,04d,14:00:00,0,0,0,...,0,0,1,0,0,0,0,0,0,0
117,2.55,2022-12-05,803,Clouds,broken clouds,04d,15:00:00,0,0,0,...,0,0,1,0,0,0,0,0,0,0
118,2.55,2022-12-05,803,Clouds,broken clouds,04d,16:00:00,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Hours 

In [705]:
# All distinct time strings that occur in the forecast, in ascending order
hour_in_day = sorted(df_merge2["time"].unique())

# One 0/1 column per time string: 1 where the row's time equals the column name
for hour_label in hour_in_day:
    df_merge2[hour_label] = (df_merge2["time"] == hour_label).astype("int64")
df_merge2

## Preparing the pedestrian data in the same way

### General

In [707]:
import pandas as pd
import numpy as np # Numerical computation package
np.random.seed(1) # Fix NumPy's global random seed for reproducibility

# Load the hourly pedestrian counts and the public-holiday table (both ';'-separated)
ped = pd.read_csv("köln-schildergasse (west)-20180430-20200131-hour.csv", sep = ";")
holi = pd.read_csv("Feiertage_2018.01.01_2020.01.31.csv", sep = ";")
# Disabled debug output: print(ped.head(3))
# Disabled debug output: print(holi.head(3))


#########################################################################
# TODO(authors): Should this go into a separate cell? Otherwise the print
# statements can be removed.
#########################################################################


# Split "time of measurement" at its first space into a date part and a time part
ped[["date", "time"]] = ped["time of measurement"].str.split(" ", n=1, expand=True)
# Drop "time of measurement" (now split into date/time) and the "location" column
ped = ped.drop(["time of measurement", "location"], axis=1)
# Drop every column that contains only missing values (e.g. incidents, if it is all NaN)
ped.dropna(how='all', axis=1, inplace=True)

# Reorder the columns so that "date" and "time" come first
ped = ped[ ["date", "time"] + [ col for col in ped.columns if col != "date" and col!= "time"] ]
# Rename the holiday table's "name" column to "holiday"
holi = holi.rename(columns={"name": "holiday"})
# Left-join the holidays onto the pedestrian rows. No `on=` is given, so pandas
# joins on every column name the two frames share - presumably only "date";
# TODO confirm against the holiday CSV.
ped = pd.merge(ped, holi, how = "left")

# Turn the holiday column into a flag: rows without a match become 0, every other value becomes 1
ped["holiday"] = ped["holiday"].fillna(0)
ped["holiday"] = ped["holiday"].where(ped["holiday"] == 0, 1)

# Remove characters 8-15 of the time string (no replacement text is given), which
# keeps the leading 8 characters - presumably "HH:MM:SS" followed by an offset
# suffix; TODO confirm against the raw CSV
ped["time"] = ped["time"].str.slice_replace(8, 16)

# Disabled debug output for inspecting the weather data and the missing values:
# print(ped["weather condition"].unique())
# print(ped.isnull().sum() / ped.shape[0])

# Rows with at least one missing value, kept for inspection only
null_data = ped[ped.isnull().any(axis=1)]
# Disabled debug output: print(null_data)


#########################################################################
# TODO(authors): Should this go into a separate cell? Otherwise the print
# statements can be removed.
#########################################################################


# Temperature and weather condition are missing together; given the large sample
# size, rows with any missing value are simply dropped
ped.dropna(inplace=True)

#Import datapackages
from datetime import date, timedelta
from datetime import datetime

# Load the historic school-holiday periods (one row per period)
school_holi = pd.read_csv("OpenData_Ferientermine.csv", sep = ";")

# First and last day of every period, parsed strictly as DD.MM.YYYY
first_days = pd.to_datetime(school_holi["ErsterTagDate"], format="%d.%m.%Y")
last_days = pd.to_datetime(school_holi["LetzterTagDate"], format="%d.%m.%Y")

# Expand every period into one row per calendar day (both ends inclusive)
days_df_list = [
    pd.DataFrame({"date": pd.date_range(first_day, last_day, freq="D")})
    for first_day, last_day in zip(first_days, last_days)
]

# Combine all periods; an empty holiday file yields an empty table, not a crash
if days_df_list:
    new_holi = pd.concat(days_df_list, ignore_index=True)
else:
    new_holi = pd.DataFrame({"date": pd.to_datetime([])})

# Overlapping periods would list a day twice, and every duplicate would
# duplicate pedestrian rows in the following left merge, so keep each day once
new_holi = new_holi.drop_duplicates(subset="date").reset_index(drop=True)
new_holi["school holiday"] = 1

# The merge key in the pedestrian frame is a "YYYY-MM-DD" string
new_holi["date"] = new_holi["date"].dt.strftime("%Y-%m-%d")

# Left-join the per-day school-holiday flag onto the pedestrian rows; days
# without a match are not school holidays, so their flag becomes 0 (stored as int64)
ped = ped.merge(new_holi, how="left", on="date")
ped["school holiday"] = ped["school holiday"].fillna(0).astype("int64")

# Two-character month number taken from positions 5-6 of the date string
ped["months"] = ped["date"].str[5:7]

# Two-character hour taken from the start of the time string
ped["hour"] = ped["time"].str[0:2]



### Weekdays

In [708]:
# Add one dummy column per weekday, initialised to 0 for every row
ped = ped.assign(**dict.fromkeys(Weekdays, 0))


In [709]:
# Set the dummy column named like the row's "weekday" value to 1
for weekday in Weekdays:
    ped.loc[ped["weekday"] == weekday, weekday] = 1

ped

Unnamed: 0,date,time,weekday,pedestrians count,temperature in ºc,weather condition,holiday,school holiday,months,hour,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,2018-05-01,00:00:00,Tuesday,0,8.0,partly-cloudy-night,1,0,05,00,0,1,0,0,0,0,0
1,2018-05-01,01:00:00,Tuesday,0,7.0,partly-cloudy-night,1,0,05,01,0,1,0,0,0,0,0
2,2018-05-01,02:00:00,Tuesday,146,7.0,partly-cloudy-night,1,0,05,02,0,1,0,0,0,0,0
3,2018-05-01,03:00:00,Tuesday,125,7.0,partly-cloudy-night,1,0,05,03,0,1,0,0,0,0,0
4,2018-05-01,04:00:00,Tuesday,84,6.0,partly-cloudy-night,1,0,05,04,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15353,2020-01-31,19:00:00,Friday,5422,11.0,partly-cloudy-night,0,0,01,19,0,0,0,0,1,0,0
15354,2020-01-31,20:00:00,Friday,2691,11.0,partly-cloudy-night,0,0,01,20,0,0,0,0,1,0,0
15355,2020-01-31,21:00:00,Friday,1329,10.0,partly-cloudy-night,0,0,01,21,0,0,0,0,1,0,0
15356,2020-01-31,22:00:00,Friday,915,11.0,cloudy,0,0,01,22,0,0,0,0,1,0,0


### Months

In [710]:
# Names of the month dummy columns, in calendar order
Months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
# Add one dummy column per month, initialised to 0 for every row
ped = ped.assign(**dict.fromkeys(Months, 0))


In [711]:
# Set the month dummy to 1 where the row's zero-padded month number
# ("01".."12") matches the month's position in `Months`
for month_number, month_name in enumerate(Months, start=1):
    ped.loc[ped["months"] == f"{month_number:02d}", month_name] = 1
ped

Unnamed: 0,date,time,weekday,pedestrians count,temperature in ºc,weather condition,holiday,school holiday,months,hour,...,March,April,May,June,July,August,September,October,November,December
0,2018-05-01,00:00:00,Tuesday,0,8.0,partly-cloudy-night,1,0,05,00,...,0,0,1,0,0,0,0,0,0,0
1,2018-05-01,01:00:00,Tuesday,0,7.0,partly-cloudy-night,1,0,05,01,...,0,0,1,0,0,0,0,0,0,0
2,2018-05-01,02:00:00,Tuesday,146,7.0,partly-cloudy-night,1,0,05,02,...,0,0,1,0,0,0,0,0,0,0
3,2018-05-01,03:00:00,Tuesday,125,7.0,partly-cloudy-night,1,0,05,03,...,0,0,1,0,0,0,0,0,0,0
4,2018-05-01,04:00:00,Tuesday,84,6.0,partly-cloudy-night,1,0,05,04,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15353,2020-01-31,19:00:00,Friday,5422,11.0,partly-cloudy-night,0,0,01,19,...,0,0,0,0,0,0,0,0,0,0
15354,2020-01-31,20:00:00,Friday,2691,11.0,partly-cloudy-night,0,0,01,20,...,0,0,0,0,0,0,0,0,0,0
15355,2020-01-31,21:00:00,Friday,1329,10.0,partly-cloudy-night,0,0,01,21,...,0,0,0,0,0,0,0,0,0,0
15356,2020-01-31,22:00:00,Friday,915,11.0,cloudy,0,0,01,22,...,0,0,0,0,0,0,0,0,0,0


### Hour

In [712]:
# All distinct time strings in the pedestrian data, in ascending order
hour_in_day = sorted(ped["time"].unique())
# Add one dummy column per time string, initialised to 0 for every row
ped = ped.assign(**dict.fromkeys(hour_in_day, 0))

In [713]:
# Set the dummy column named like the row's "time" value to 1
for hour_label in hour_in_day:
    ped.loc[ped["time"] == hour_label, hour_label] = 1
ped

Unnamed: 0,date,time,weekday,pedestrians count,temperature in ºc,weather condition,holiday,school holiday,months,hour,...,14:00:00,15:00:00,16:00:00,17:00:00,18:00:00,19:00:00,20:00:00,21:00:00,22:00:00,23:00:00
0,2018-05-01,00:00:00,Tuesday,0,8.0,partly-cloudy-night,1,0,05,00,...,0,0,0,0,0,0,0,0,0,0
1,2018-05-01,01:00:00,Tuesday,0,7.0,partly-cloudy-night,1,0,05,01,...,0,0,0,0,0,0,0,0,0,0
2,2018-05-01,02:00:00,Tuesday,146,7.0,partly-cloudy-night,1,0,05,02,...,0,0,0,0,0,0,0,0,0,0
3,2018-05-01,03:00:00,Tuesday,125,7.0,partly-cloudy-night,1,0,05,03,...,0,0,0,0,0,0,0,0,0,0
4,2018-05-01,04:00:00,Tuesday,84,6.0,partly-cloudy-night,1,0,05,04,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15353,2020-01-31,19:00:00,Friday,5422,11.0,partly-cloudy-night,0,0,01,19,...,0,0,0,0,0,1,0,0,0,0
15354,2020-01-31,20:00:00,Friday,2691,11.0,partly-cloudy-night,0,0,01,20,...,0,0,0,0,0,0,1,0,0,0
15355,2020-01-31,21:00:00,Friday,1329,10.0,partly-cloudy-night,0,0,01,21,...,0,0,0,0,0,0,0,1,0,0
15356,2020-01-31,22:00:00,Friday,915,11.0,cloudy,0,0,01,22,...,0,0,0,0,0,0,0,0,1,0


### Weather

In [714]:
# Mapping between the two weather vocabularies:
#   key   = "weather condition" label used in the pedestrian data
#   value = name of the dummy column (the category names of the forecast data)
# A dict holds one value per key, so the former first entry
# 'rain' -> "Thunderstorm" was silently overwritten by 'rain' -> "Rain"; the
# dead entry has been removed, which leaves the resulting mapping unchanged.
# TODO(authors): open question from the original note - wind was equated with
# drizzle, and thunderstorm with rain; confirm that this is intended.
dict_weather = {'rain':"Rain",
                'wind':"Drizzle",
                "snow":"Snow",
                "fog":'Atmosphere',
                "clear-night":"Clear",
                "clear-day":"Clear",
                "cloudy":"Clouds",
                "partly-cloudy-night":"Clouds",
                "partly-cloudy-day":"Clouds",
                "nan":"nan"}

# Add one dummy column per category, initialised to 0 for every row
for weather_condition in dict_weather.values():
    ped[weather_condition] = 0

ped

Unnamed: 0,date,time,weekday,pedestrians count,temperature in ºc,weather condition,holiday,school holiday,months,hour,...,21:00:00,22:00:00,23:00:00,Rain,Drizzle,Snow,Atmosphere,Clear,Clouds,nan
0,2018-05-01,00:00:00,Tuesday,0,8.0,partly-cloudy-night,1,0,05,00,...,0,0,0,0,0,0,0,0,0,0
1,2018-05-01,01:00:00,Tuesday,0,7.0,partly-cloudy-night,1,0,05,01,...,0,0,0,0,0,0,0,0,0,0
2,2018-05-01,02:00:00,Tuesday,146,7.0,partly-cloudy-night,1,0,05,02,...,0,0,0,0,0,0,0,0,0,0
3,2018-05-01,03:00:00,Tuesday,125,7.0,partly-cloudy-night,1,0,05,03,...,0,0,0,0,0,0,0,0,0,0
4,2018-05-01,04:00:00,Tuesday,84,6.0,partly-cloudy-night,1,0,05,04,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15353,2020-01-31,19:00:00,Friday,5422,11.0,partly-cloudy-night,0,0,01,19,...,0,0,0,0,0,0,0,0,0,0
15354,2020-01-31,20:00:00,Friday,2691,11.0,partly-cloudy-night,0,0,01,20,...,0,0,0,0,0,0,0,0,0,0
15355,2020-01-31,21:00:00,Friday,1329,10.0,partly-cloudy-night,0,0,01,21,...,1,0,0,0,0,0,0,0,0,0
15356,2020-01-31,22:00:00,Friday,915,11.0,cloudy,0,0,01,22,...,0,1,0,0,0,0,0,0,0,0


In [715]:
# Translate every row's "weather condition" into its dummy-column name;
# conditions that are not a key of dict_weather map to NaN and set nothing
mapped_condition = ped["weather condition"].map(dict_weather)

# Set the matching dummy column to 1 (each column name handled once)
for column_name in dict.fromkeys(dict_weather.values()):
    ped.loc[mapped_condition == column_name, column_name] = 1


ped

Unnamed: 0,date,time,weekday,pedestrians count,temperature in ºc,weather condition,holiday,school holiday,months,hour,...,21:00:00,22:00:00,23:00:00,Rain,Drizzle,Snow,Atmosphere,Clear,Clouds,nan
0,2018-05-01,00:00:00,Tuesday,0,8.0,partly-cloudy-night,1,0,05,00,...,0,0,0,0,0,0,0,0,1,0
1,2018-05-01,01:00:00,Tuesday,0,7.0,partly-cloudy-night,1,0,05,01,...,0,0,0,0,0,0,0,0,1,0
2,2018-05-01,02:00:00,Tuesday,146,7.0,partly-cloudy-night,1,0,05,02,...,0,0,0,0,0,0,0,0,1,0
3,2018-05-01,03:00:00,Tuesday,125,7.0,partly-cloudy-night,1,0,05,03,...,0,0,0,0,0,0,0,0,1,0
4,2018-05-01,04:00:00,Tuesday,84,6.0,partly-cloudy-night,1,0,05,04,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15353,2020-01-31,19:00:00,Friday,5422,11.0,partly-cloudy-night,0,0,01,19,...,0,0,0,0,0,0,0,0,1,0
15354,2020-01-31,20:00:00,Friday,2691,11.0,partly-cloudy-night,0,0,01,20,...,0,0,0,0,0,0,0,0,1,0
15355,2020-01-31,21:00:00,Friday,1329,10.0,partly-cloudy-night,0,0,01,21,...,1,0,0,0,0,0,0,0,1,0
15356,2020-01-31,22:00:00,Friday,915,11.0,cloudy,0,0,01,22,...,0,1,0,0,0,0,0,0,1,0


### Temperature

In [716]:
def standardize(x):
    """Return ``x`` centred on its own mean and divided by its own standard deviation."""
    return (x - x.mean()) / x.std()


# Remember the statistics of the raw training temperatures; the forecast
# temperatures are scaled with the same two numbers later on.
# NOTE: run this cell only once per kernel - a second run would standardize the
# already standardized column and overwrite mean_used / std_used.
raw_temperature = ped["temperature in ºc"]
mean_used = raw_temperature.mean()
std_used = raw_temperature.std()
ped["temperature in ºc"] = standardize(raw_temperature)
ped

Unnamed: 0,date,time,weekday,pedestrians count,temperature in ºc,weather condition,holiday,school holiday,months,hour,...,21:00:00,22:00:00,23:00:00,Rain,Drizzle,Snow,Atmosphere,Clear,Clouds,nan
0,2018-05-01,00:00:00,Tuesday,0,-0.618529,partly-cloudy-night,1,0,05,00,...,0,0,0,0,0,0,0,0,1,0
1,2018-05-01,01:00:00,Tuesday,0,-0.748505,partly-cloudy-night,1,0,05,01,...,0,0,0,0,0,0,0,0,1,0
2,2018-05-01,02:00:00,Tuesday,146,-0.748505,partly-cloudy-night,1,0,05,02,...,0,0,0,0,0,0,0,0,1,0
3,2018-05-01,03:00:00,Tuesday,125,-0.748505,partly-cloudy-night,1,0,05,03,...,0,0,0,0,0,0,0,0,1,0
4,2018-05-01,04:00:00,Tuesday,84,-0.878482,partly-cloudy-night,1,0,05,04,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15353,2020-01-31,19:00:00,Friday,5422,-0.228598,partly-cloudy-night,0,0,01,19,...,0,0,0,0,0,0,0,0,1,0
15354,2020-01-31,20:00:00,Friday,2691,-0.228598,partly-cloudy-night,0,0,01,20,...,0,0,0,0,0,0,0,0,1,0
15355,2020-01-31,21:00:00,Friday,1329,-0.358575,partly-cloudy-night,0,0,01,21,...,1,0,0,0,0,0,0,0,1,0
15356,2020-01-31,22:00:00,Friday,915,-0.228598,cloudy,0,0,01,22,...,0,1,0,0,0,0,0,0,1,0


## Final

### Standardize Temperature Forecast

In [718]:
# Scale the forecast temperature with the mean and standard deviation taken
# from the training (pedestrian) temperatures, so both use the same scale.
# NOTE(review): the saved output of this cell implies a much smaller change per
# degree than the training table's output does, so mean_used / std_used may
# have held other values when it was last run - re-check after a fresh
# "Restart & Run All".
df_merge2["temp"] = df_merge2["temp"].sub(mean_used).div(std_used)

df_merge2

Unnamed: 0,temp,date,id,main,description,icon,time,school holiday,holiday,Rain,...,14:00:00,15:00:00,16:00:00,17:00:00,18:00:00,19:00:00,20:00:00,21:00:00,22:00:00,23:00:00
0,-1.762728,2022-11-30,804,Clouds,overcast clouds,04n,18:00:00,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,-1.762728,2022-11-30,804,Clouds,overcast clouds,04n,19:00:00,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,-1.762728,2022-11-30,804,Clouds,overcast clouds,04n,20:00:00,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,-1.762559,2022-11-30,804,Clouds,overcast clouds,04n,21:00:00,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-1.766951,2022-11-30,804,Clouds,overcast clouds,04n,22:00:00,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,-1.819998,2022-12-05,804,Clouds,overcast clouds,04d,13:00:00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
116,-1.825404,2022-12-05,804,Clouds,overcast clouds,04d,14:00:00,0,0,0,...,1,0,0,0,0,0,0,0,0,0
117,-1.830811,2022-12-05,803,Clouds,broken clouds,04d,15:00:00,0,0,0,...,0,1,0,0,0,0,0,0,0,0
118,-1.830811,2022-12-05,803,Clouds,broken clouds,04d,16:00:00,0,0,0,...,0,0,1,0,0,0,0,0,0,0
