In [149]:
! pip install --quiet matplotlib numpy pandas scikit-learn seaborn pyarrow


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [150]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Daily weather file
### Extraction

In [151]:
# Load the raw daily weather export.
# Forward slashes are used so the relative path resolves on Windows, Linux and
# macOS alike; a backslash-separated path is only understood on Windows.
df_weather_daily = pd.read_csv('../data/00_raw/weather_daily_darksky.csv')

In [152]:
# Preview the first rows of the freshly loaded daily weather table.
df_weather_daily.head()

Unnamed: 0,temperatureMax,temperatureMaxTime,windBearing,icon,dewPoint,temperatureMinTime,cloudCover,windSpeed,pressure,apparentTemperatureMinTime,...,temperatureHigh,sunriseTime,temperatureHighTime,uvIndexTime,summary,temperatureLowTime,apparentTemperatureMin,apparentTemperatureMaxTime,apparentTemperatureLowTime,moonPhase
0,11.96,2011-11-11 23:00:00,123,fog,9.4,2011-11-11 07:00:00,0.79,3.88,1016.08,2011-11-11 07:00:00,...,10.87,2011-11-11 07:12:14,2011-11-11 19:00:00,2011-11-11 11:00:00,Foggy until afternoon.,2011-11-11 19:00:00,6.48,2011-11-11 23:00:00,2011-11-11 19:00:00,0.52
1,8.59,2011-12-11 14:00:00,198,partly-cloudy-day,4.49,2011-12-11 01:00:00,0.56,3.94,1007.71,2011-12-11 02:00:00,...,8.59,2011-12-11 07:57:02,2011-12-11 14:00:00,2011-12-11 12:00:00,Partly cloudy throughout the day.,2011-12-12 07:00:00,0.11,2011-12-11 20:00:00,2011-12-12 08:00:00,0.53
2,10.33,2011-12-27 02:00:00,225,partly-cloudy-day,5.47,2011-12-27 23:00:00,0.85,3.54,1032.76,2011-12-27 22:00:00,...,10.33,2011-12-27 08:07:06,2011-12-27 14:00:00,2011-12-27 00:00:00,Mostly cloudy throughout the day.,2011-12-27 23:00:00,5.59,2011-12-27 02:00:00,2011-12-28 00:00:00,0.1
3,8.07,2011-12-02 23:00:00,232,wind,3.69,2011-12-02 07:00:00,0.32,3.0,1012.12,2011-12-02 07:00:00,...,7.36,2011-12-02 07:46:09,2011-12-02 12:00:00,2011-12-02 10:00:00,Partly cloudy throughout the day and breezy ov...,2011-12-02 19:00:00,0.46,2011-12-02 12:00:00,2011-12-02 19:00:00,0.25
4,8.22,2011-12-24 23:00:00,252,partly-cloudy-night,2.79,2011-12-24 07:00:00,0.37,4.46,1028.17,2011-12-24 07:00:00,...,7.93,2011-12-24 08:06:15,2011-12-24 15:00:00,2011-12-24 13:00:00,Mostly cloudy throughout the day.,2011-12-24 19:00:00,-0.51,2011-12-24 23:00:00,2011-12-24 20:00:00,0.99


### Preprocessing

The transformations will be done in the preprocessing.py script

In [153]:
# Every column whose name contains "time" (case-insensitive) is parsed as a
# timestamp; this includes the 'time' column itself.
datetime_cols = [name for name in df_weather_daily.columns if 'time' in name.lower()]
df_weather_daily = df_weather_daily.assign(
    **{name: pd.to_datetime(df_weather_daily[name]) for name in datetime_cols}
)

In [154]:
# Order the rows chronologically, then promote 'time' to the index.
df_weather_daily = (
    df_weather_daily
    .sort_values('time')
    .set_index('time')
)

In [155]:
# Check the result: the rows are now sorted and indexed by 'time'.
df_weather_daily.head()

Unnamed: 0_level_0,temperatureMax,temperatureMaxTime,windBearing,icon,dewPoint,temperatureMinTime,cloudCover,windSpeed,pressure,apparentTemperatureMinTime,...,temperatureHigh,sunriseTime,temperatureHighTime,uvIndexTime,summary,temperatureLowTime,apparentTemperatureMin,apparentTemperatureMaxTime,apparentTemperatureLowTime,moonPhase
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-11-01,15.57,2011-11-01 15:00:00,208,partly-cloudy-day,10.13,2011-11-01 22:00:00,0.36,2.45,1009.46,2011-11-01 22:00:00,...,15.57,2011-11-01 06:54:29,2011-11-01 15:00:00,2011-11-01 10:00:00,Partly cloudy until evening.,2011-11-02 03:00:00,9.01,2011-11-01 15:00:00,2011-11-02 03:00:00,0.21
2011-11-02,15.19,2011-11-02 23:00:00,134,partly-cloudy-night,10.23,2011-11-02 03:00:00,0.39,4.39,1004.79,2011-11-02 03:00:00,...,15.06,2011-11-02 06:56:16,2011-11-02 13:00:00,2011-11-02 09:00:00,Partly cloudy throughout the day.,2011-11-02 19:00:00,7.33,2011-11-02 23:00:00,2011-11-02 19:00:00,0.24
2011-11-03,17.41,2011-11-03 14:00:00,154,partly-cloudy-day,13.39,2011-11-03 21:00:00,0.52,3.99,993.4,2011-11-03 21:00:00,...,17.41,2011-11-03 06:58:03,2011-11-03 14:00:00,2011-11-03 10:00:00,Partly cloudy throughout the day.,2011-11-04 07:00:00,12.79,2011-11-03 14:00:00,2011-11-04 07:00:00,0.27
2011-11-04,15.54,2011-11-04 11:00:00,179,fog,12.03,2011-11-04 23:00:00,0.5,2.62,995.54,2011-11-04 23:00:00,...,15.54,2011-11-04 06:59:49,2011-11-04 11:00:00,2011-11-04 10:00:00,Foggy overnight.,2011-11-05 02:00:00,11.53,2011-11-04 11:00:00,2011-11-05 02:00:00,0.31
2011-11-05,13.94,2011-11-05 15:00:00,346,fog,10.96,2011-11-05 02:00:00,0.65,2.7,1007.39,2011-11-05 02:00:00,...,13.94,2011-11-05 07:01:36,2011-11-05 15:00:00,2011-11-05 10:00:00,Foggy in the morning.,2011-11-06 05:00:00,10.17,2011-11-05 15:00:00,2011-11-06 06:00:00,0.34


In [156]:
# Count the missing values in each column.
df_weather_daily.isna().sum()

temperatureMax                 0
temperatureMaxTime             0
windBearing                    0
icon                           0
dewPoint                       0
temperatureMinTime             0
cloudCover                     1
windSpeed                      0
pressure                       0
apparentTemperatureMinTime     0
apparentTemperatureHigh        0
precipType                     0
visibility                     0
humidity                       0
apparentTemperatureHighTime    0
apparentTemperatureLow         0
apparentTemperatureMax         0
uvIndex                        1
sunsetTime                     0
temperatureLow                 0
temperatureMin                 0
temperatureHigh                0
sunriseTime                    0
temperatureHighTime            0
uvIndexTime                    1
summary                        0
temperatureLowTime             0
apparentTemperatureMin         0
apparentTemperatureMaxTime     0
apparentTemperatureLowTime     0
moonPhase 

In [157]:
# Index labels (days) of every row that has at least one missing value.
miss_day = df_weather_daily.index[
    df_weather_daily.isna().any(axis='columns').to_numpy()
]
miss_day

DatetimeIndex(['2014-01-01'], dtype='datetime64[ns]', name='time', freq=None)

In [197]:
# Record the days with missing weather values as datetime.date objects in a
# one-column frame named 'days_missing'.
df_days_missing = pd.Series(miss_day.date, name='days_missing').to_frame()

In [158]:
# Remove the rows (days) that contain missing values.
df_weather_daily = df_weather_daily.drop(index=miss_day)

In [159]:
# Frequency of each weather icon label.
df_weather_daily['icon'].value_counts()

icon
partly-cloudy-day      619
wind                   123
fog                     91
partly-cloudy-night     33
cloudy                   9
clear-day                6
Name: count, dtype: int64

In [160]:
# 'icon' only takes a handful of distinct labels, so store it as a categorical.
df_weather_daily['icon'] = df_weather_daily['icon'].astype('category')

In [161]:
# Frequency of each free-text daily summary.
df_weather_daily['summary'].value_counts()

summary
Mostly cloudy throughout the day.                             174
Partly cloudy throughout the day.                             170
Partly cloudy until evening.                                  133
Mostly cloudy until evening.                                  118
Foggy in the morning.                                          47
                                                             ... 
Mostly cloudy until evening and breezy throughout the day.      1
Overcast until evening and windy starting in the evening.       1
Windy and mostly cloudy until afternoon.                        1
Partly cloudy in the evening.                                   1
Overcast until afternoon.                                       1
Name: count, Length: 87, dtype: int64

In [162]:
# Drop the free-text 'summary' column.
df_weather_daily = df_weather_daily.drop('summary', axis='columns')

In [163]:
# Keep only the auxiliary "*Time" columns: 'time' is now the index, not a column.
# A filtering comprehension is used instead of list.remove so the cell can be
# re-run safely; list.remove raises ValueError once 'time' is already gone.
datetime_cols = [col for col in datetime_cols if col != 'time']
datetime_cols

['temperatureMaxTime',
 'temperatureMinTime',
 'apparentTemperatureMinTime',
 'apparentTemperatureHighTime',
 'sunsetTime',
 'sunriseTime',
 'temperatureHighTime',
 'uvIndexTime',
 'temperatureLowTime',
 'apparentTemperatureMaxTime',
 'apparentTemperatureLowTime']

In [164]:
# Drop the remaining timestamp columns listed in datetime_cols.
df_weather_daily = df_weather_daily.drop(datetime_cols, axis='columns')

In [165]:
# Frequency of each precipitation type.
df_weather_daily['precipType'].value_counts()

precipType
rain    861
snow     20
Name: count, dtype: int64

In [166]:
# 'precipType' has only a couple of distinct values, so store it as a categorical.
df_weather_daily['precipType'] = df_weather_daily['precipType'].astype('category')

In [167]:
# Persist the cleaned daily weather table, keeping the 'time' index.
# Forward slashes are used so the relative path resolves on Windows, Linux and
# macOS alike; a backslash-separated path is only understood on Windows.
df_weather_daily.to_parquet(
    '../data/01_interim/weather_daily_darksky_cleaned.parquet',
    index=True,
)

## Temperature file

### Extraction

In [168]:
# Load the raw temperature readings; the file is semicolon-separated and uses a
# decimal comma.
# Forward slashes are used so the relative path resolves on Windows, Linux and
# macOS alike; a backslash-separated path is only understood on Windows.
df_temperature = pd.read_csv('../data/00_raw/temperatures.csv', sep=';', decimal=',')

In [169]:
# Preview the first rows of the raw temperature table.
df_temperature.head()

Unnamed: 0,DateTime,Temperature
0,2011-11-24,9.44
1,2011-11-24 00:30:00,9.44
2,2011-11-24 01:00:00,9.44
3,2011-11-24 01:30:00,9.165
4,2011-11-24 02:00:00,8.89


In [174]:
# Number of rows and columns in the temperature table.
df_temperature.shape

(39696, 2)

In [170]:
# Count the missing values in each column.
df_temperature.isna().sum()

DateTime         0
Temperature    252
dtype: int64

In [171]:
# Date-only strings such as "2011-11-24" are exactly 10 characters long; give
# them an explicit midnight time (presumably so every value shares one format
# before datetime parsing — TODO confirm).
mask = df_temperature["DateTime"].str.len().eq(10)
df_temperature["DateTime"] = df_temperature["DateTime"].mask(
    mask, df_temperature["DateTime"] + " 00:00:00"
)

In [172]:
# Parse the timestamp strings and use them as the index.
df_temperature = (
    df_temperature
    .assign(DateTime=pd.to_datetime(df_temperature["DateTime"]))
    .set_index("DateTime")
)

In [190]:
# For each calendar day, count how many readings have no temperature value
# (most affected days first).
missing_temp_day = (
    df_temperature.index[df_temperature["Temperature"].isna().to_numpy()]
    .to_series()
    .dt.date
    .value_counts()
)
missing_temp_day

DateTime
2013-09-09    46
2011-12-31    42
2011-12-30    25
2012-01-01    21
2012-01-02    19
2012-01-03    12
2013-11-08    11
2013-08-16     7
2013-06-09     5
2012-05-17     5
2012-10-26     3
2012-05-07     3
2012-06-09     3
2012-07-16     3
2011-12-09     3
2013-05-30     3
2013-02-06     3
2013-01-03     3
2012-12-31     3
2013-06-11     3
2013-06-18     3
2013-06-08     3
2013-06-06     3
2014-01-01     3
2013-12-03     3
2013-09-13     3
2013-08-25     3
2014-01-21     3
2013-08-01     2
2013-09-10     2
2013-07-31     1
Name: count, dtype: int64

In [192]:
# Days with more than 7 missing temperature readings.
missing_temp_day.index[missing_temp_day.gt(7).to_numpy()]

Index([2013-09-09, 2011-12-31, 2011-12-30, 2012-01-01, 2012-01-02, 2012-01-03,
       2013-11-08],
      dtype='object', name='DateTime')

In [199]:
# Stack the incomplete weather days with the days that have more than 7 missing
# temperature readings, renumbering the rows from 0.
# NOTE(review): the combined frame is only displayed here; df_days_missing
# itself is not updated — confirm whether the result should be stored.
pd.concat(
    [
        df_days_missing,
        missing_temp_day.loc[missing_temp_day.gt(7)].index.to_series(name='days_missing'),
    ]
).reset_index(drop=True)

Unnamed: 0,days_missing
0,2014-01-01
1,2013-09-09
2,2011-12-31
3,2011-12-30
4,2012-01-01
5,2012-01-02
6,2012-01-03
7,2013-11-08
