# **Visual Weather Analysis** for Freiburg im Breisgau, Germany

---
### Imports

In [12]:
import numpy as np
import pandas as pd

import plotly_express as px

---
### Data Loading & Wrangling

In [13]:
# Load the raw weather export for Freiburg im Breisgau.
df = pd.read_csv(
    "../data/weather/Freiburg_im_Breisgau_Wetter.csv"
)
# Number of rows in the raw frame (not referenced again in the cells shown here).
old_rows = len(df.index)
df.head(n=2)

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,946684800,2000-01-01 00:00:00 +0000 UTC,3600,Freiburg im Breisgau,47.999008,7.842104,1.54,,0.68,-0.1,...,,,,,,93,804,Clouds,overcast clouds,04n
1,946688400,2000-01-01 01:00:00 +0000 UTC,3600,Freiburg im Breisgau,47.999008,7.842104,1.64,,0.78,1.64,...,,,,,,95,804,Clouds,overcast clouds,04n


In [14]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194326 entries, 0 to 194325
Data columns (total 28 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   dt                   194326 non-null  int64  
 1   dt_iso               194326 non-null  object 
 2   timezone             194326 non-null  int64  
 3   city_name            194326 non-null  object 
 4   lat                  194326 non-null  float64
 5   lon                  194326 non-null  float64
 6   temp                 194326 non-null  float64
 7   visibility           1174 non-null    float64
 8   dew_point            194326 non-null  float64
 9   feels_like           194326 non-null  float64
 10  temp_min             194326 non-null  float64
 11  temp_max             194326 non-null  float64
 12  pressure             194326 non-null  int64  
 13  sea_level            0 non-null       float64
 14  grnd_level           0 non-null       float64
 15  humidity         

In [15]:
# Distinct values recorded in the 'visibility' column (NaN is listed as well).
df['visibility'].unique()

array([   nan,  9000., 10000.,   100.,  8000.,  6000.,  7000.,   600.,
         500.,  4500.,   200.,   400.,  5000.,  2500.])

In [16]:
# How often each non-null 'visibility' value occurs.
df['visibility'].value_counts()

9000.0     965
10000.0    183
100.0        6
8000.0       6
6000.0       4
7000.0       3
600.0        1
500.0        1
4500.0       1
200.0        1
400.0        1
5000.0       1
2500.0       1
Name: visibility, dtype: int64

Drop columns with too many NaN values.

In [17]:
# Count the missing values in every column.
df.isna().sum()

dt                          0
dt_iso                      0
timezone                    0
city_name                   0
lat                         0
lon                         0
temp                        0
visibility             193152
dew_point                   0
feels_like                  0
temp_min                    0
temp_max                    0
pressure                    0
sea_level              194326
grnd_level             194326
humidity                    0
wind_speed                  0
wind_deg                    0
wind_gust              174306
rain_1h                150740
rain_3h                194295
snow_1h                191781
snow_3h                194326
clouds_all                  0
weather_id                  0
weather_main                0
weather_description         0
weather_icon                0
dtype: int64

In [18]:
# Discard the columns that the NaN count showed to be empty or almost empty.
df = df.drop(
    [
        'visibility',
        'sea_level',
        'grnd_level',
        'wind_gust',
        'rain_1h',
        'rain_3h',
        'snow_1h',
        'snow_3h',
    ],
    axis='columns',
)

In [19]:
# Re-check the per-column missing-value counts after dropping the sparse columns.
df.isna().sum()

dt                     0
dt_iso                 0
timezone               0
city_name              0
lat                    0
lon                    0
temp                   0
dew_point              0
feels_like             0
temp_min               0
temp_max               0
pressure               0
humidity               0
wind_speed             0
wind_deg               0
clouds_all             0
weather_id             0
weather_main           0
weather_description    0
weather_icon           0
dtype: int64

Drop or transform the remaining columns.

In [20]:
# List the columns that are still present.
df.columns

Index(['dt', 'dt_iso', 'timezone', 'city_name', 'lat', 'lon', 'temp',
       'dew_point', 'feels_like', 'temp_min', 'temp_max', 'pressure',
       'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id',
       'weather_main', 'weather_description', 'weather_icon'],
      dtype='object')

In [22]:
# Drop identifiers, location metadata and pre-aggregated columns that the analysis does not use.
# Reassign instead of mutating in place, matching how the sparse columns were dropped above.
df = df.drop(columns=['dt', 'timezone', 'city_name', 'lat', 'lon', 'weather_icon', 'temp_min', 'temp_max', 'weather_id'])

In [23]:
# Strip the trailing "+0000 UTC" style suffix so only the bare timestamp text remains.
# The pattern is a raw string: "\s" and "\+" are not valid escapes in a normal
# string literal and make newer Python versions emit a warning.
df['dt_iso'] = df['dt_iso'].replace(r"\s*\+.*UTC", "", regex=True)
df.loc[:3, ['dt_iso']]

Unnamed: 0,dt_iso
0,2000-01-01 00:00:00
1,2000-01-01 01:00:00
2,2000-01-01 02:00:00
3,2000-01-01 03:00:00


Create a datetime column and set it as the index.

In [24]:
# Rename the cleaned timestamp column to 'dt', parse it into datetimes and use it
# as the index; drop=False keeps 'dt' available as a regular column too.
df = (
    df.rename(columns={'dt_iso': 'dt'})
      .assign(dt=lambda frame: pd.to_datetime(frame['dt']))
      .set_index('dt', drop=False)
)

In [25]:
# Preview the frame after the datetime index has been set.
df.head()

Unnamed: 0_level_0,dt,temp,dew_point,feels_like,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-01-01 00:00:00,2000-01-01 00:00:00,1.54,0.68,-0.1,1026,94,1.57,168,93,Clouds,overcast clouds
2000-01-01 01:00:00,2000-01-01 01:00:00,1.64,0.78,1.64,1026,94,1.21,163,95,Clouds,overcast clouds
2000-01-01 02:00:00,2000-01-01 02:00:00,1.14,0.28,-0.62,1026,94,1.61,163,95,Clouds,overcast clouds
2000-01-01 03:00:00,2000-01-01 03:00:00,1.24,0.53,-0.57,1026,95,1.66,163,95,Clouds,overcast clouds
2000-01-01 04:00:00,2000-01-01 04:00:00,1.61,0.9,0.07,1026,95,1.51,159,99,Clouds,overcast clouds


downsampling

In [26]:
# Downsample the observations to one row per calendar day.
freq = '1D'

# Every numeric column is summarised with the same four statistics; the resulting
# columns are named "<column>_<statistic>", e.g. "temp_mean".
numeric_stats = ['min', 'max', 'mean', 'median']
numeric_cols = ['temp', 'dew_point', 'feels_like', 'pressure',
                'humidity', 'wind_speed', 'wind_deg', 'clouds_all']
# NOTE(review): wind_deg is an angle, so its arithmetic mean/median can be
# misleading (350 deg and 10 deg average to 180 deg) - consider a circular mean.

# Text columns keep a single label per day.
# NOTE(review): max() on strings picks the alphabetically last label of the day,
# not the most frequent one - confirm this is intended (a per-day mode may fit better).
label_cols = ['weather_description', 'weather_main']

# Build every per-column summary first and combine them once, instead of testing
# for a None sentinel and joining frame by frame.
daily_parts = []
for col_name in numeric_cols:
    daily_stats = df[col_name].resample(freq).agg(numeric_stats)
    daily_parts.append(daily_stats.add_prefix(col_name + "_"))
for col_name in label_cols:
    daily_parts.append(df[col_name].resample(freq).max())

# All parts share the same daily index, so a column-wise concat aligns them row by row.
df_per_day = pd.concat(daily_parts, axis=1)


In [27]:
# Preview the daily aggregates.
df_per_day.head()

Unnamed: 0_level_0,temp_min,temp_max,temp_mean,temp_median,dew_point_min,dew_point_max,dew_point_mean,dew_point_median,feels_like_min,feels_like_max,...,wind_deg_min,wind_deg_max,wind_deg_mean,wind_deg_median,clouds_all_min,clouds_all_max,clouds_all_mean,clouds_all_median,weather_description,weather_main
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,1.14,4.98,3.060833,3.58,0.28,4.25,2.37,2.9,-0.62,3.84,...,158,226,192.0,199.5,92,100,98.458333,100.0,overcast clouds,Rain
2000-01-02,1.81,7.11,3.48125,2.81,0.45,6.36,2.6725,2.31,-0.16,7.11,...,79,189,120.583333,115.0,2,100,62.375,82.0,sky is clear,Rain
2000-01-03,-1.72,3.64,1.55,2.15,-2.98,1.82,0.105417,0.73,-4.63,3.64,...,135,188,152.25,150.5,3,75,33.375,36.5,sky is clear,Clouds
2000-01-04,-1.56,9.33,5.344167,6.655,-2.95,5.77,3.078333,4.7,-4.3,7.91,...,160,220,186.458333,181.0,5,100,59.75,60.5,sky is clear,Rain
2000-01-05,1.09,11.04,6.444583,6.535,0.67,9.95,5.553333,5.53,-1.78,10.63,...,114,232,176.958333,179.0,52,100,90.958333,100.0,overcast clouds,Rain


---
### Start Exploring

In [28]:
# Line chart of the daily mean temperature across the whole record.
fig = px.line(
    df_per_day,
    x=df_per_day.index,
    y='temp_mean',
    title="Mean Temperature",
)
fig.show()

In [29]:
# Daily mean temperature and daily mean humidity drawn on the same axes.
fig = px.line(df_per_day, x=df_per_day.index, y='temp_mean', title="Mean Temperature and Humidity")
# A single-series px.line has no legend entry; name the trace and show it so the
# two lines can be told apart once the humidity trace is added.
fig.update_traces(name='temp_mean', showlegend=True)
fig.add_scatter(x=df_per_day.index, y=df_per_day['humidity_mean'], mode='lines', cliponaxis=False,
                name='humidity_mean')
# The y axis now carries two different quantities, so give it a neutral title.
fig.update_layout(yaxis_title='Daily mean value', legend_title='Series')
fig.show()

In [62]:
# Daily mean temperature with an ordinary-least-squares trendline.
fig = px.scatter(
    df_per_day,
    x=df_per_day.index,
    y='temp_mean',
    template='plotly_dark',
    color_discrete_sequence=px.colors.qualitative.T10,
    trendline='ols',
    title='Mean Temperature over the years',
)
# Draw every trace as a line instead of markers.
fig.update_traces(mode='lines')
fig.update_layout(
    xaxis_title='Years (in days)',
    yaxis_title='Mean Temp (in Celsius)',
)
# Colour the last trace of the figure red.
fig.data[-1].update(line_color='red')
fig.show()

In [100]:
# Pairwise scatter plots of the daily means, leaving out the columns named in to_remove.
to_remove = ['feels_like_mean', 'wind_deg_mean', 'clouds_all_mean']
features = [
    column
    for column in df_per_day.columns
    if 'mean' in column and column not in to_remove
]
px.scatter_matrix(df_per_day, dimensions=features, color='weather_main')

In [64]:
# Daily mean humidity in 2007: colour encodes the weather situation,
# marker size encodes the mean cloudiness.
df_pd_2007 = df_per_day[df_per_day.index.year == 2007]
fig = px.scatter(df_pd_2007, x=df_pd_2007.index, y='humidity_mean', color='weather_main',
                 size='clouds_all_mean', template='plotly_white',
                 title='Humidity by Weather Situations and Cloudiness by size')
fig.update_layout(xaxis_title='Days (in 2007)',
                  yaxis_title='Mean of Humidity per day (as percentage)',
                  legend_title="Weather Situations")
fig.show()

In [52]:
# Keep the "*_mean" columns plus every column that carries no statistic suffix at all.
[
    column
    for column in df_per_day.columns
    if 'mean' in column
    or not any(stat in column for stat in ('median', 'max', 'min'))
]

['temp_mean',
 'dew_point_mean',
 'feels_like_mean',
 'pressure_mean',
 'humidity_mean',
 'wind_speed_mean',
 'wind_deg_mean',
 'clouds_all_mean',
 'weather_description',
 'weather_main']

In [99]:
# Keep only the days from 2015 onwards.
mask = df_per_day.index.year >= 2015
df_pd_2015 = df_per_day[mask]

# Polar scatter: radius = mean wind speed, angle = mean wind direction,
# colour = mean cloudiness, marker symbol = weather situation.
# clouds_all_mean is numeric, so it is drawn on a continuous colour axis; the
# palette therefore has to be passed as color_continuous_scale
# (color_discrete_sequence only applies to categorical colours).
fig = px.scatter_polar(df_pd_2015, r="wind_speed_mean", theta="wind_deg_mean",
                       color="clouds_all_mean", symbol="weather_main",
                       color_continuous_scale=px.colors.sequential.Plasma_r,
                       template='seaborn')

# Anchor the colour bar at the top-left (x=0, y=1) and label its ticks as percentages.
fig.update_layout(coloraxis_colorbar=dict(yanchor="top", y=1, x=0,
                                          ticks="outside",
                                          ticksuffix=" %",
                                          title='cloudiness'),
                  title='Wind Direction and Weather Situation since 2015',
                  legend_title='Weather Situation')
fig.show()

In [109]:
# Year filter for the plot. A single assignment replaces the earlier pair, where
# a "> 2007" mask was immediately overwritten; the effective "> 1999" filter is kept.
# Raise the threshold to narrow the plot to more recent years.
mask = df_per_day.index.year > 1999
filtered_df = df_per_day[mask]

# Mean wind speed against mean temperature: colour encodes the weather situation,
# marker size encodes the mean cloudiness.
fig = px.scatter(filtered_df, x='temp_mean', y='wind_speed_mean', color='weather_main',
                 size='clouds_all_mean', template='plotly_white',
                 title='Wind Speed and Temperature')
fig.update_layout(xaxis_title='Temperature (in celsius)',
                  yaxis_title='Wind Speed',
                  legend_title="Weather Situations")
fig.show()

In [119]:
# Mean humidity against mean cloudiness, one facet per weather situation,
# animated year by year over the whole record.
# The year is added as an explicit column (on a copy) so the animation slider is
# labelled "year" rather than after an anonymous array.
fig = px.scatter(df_per_day.assign(year=df_per_day.index.year),
                 x='clouds_all_mean', y='humidity_mean', color='weather_main',
                 template='plotly_white',
                 title='Humidity vs. Cloudiness by Weather Situation (per year)',
                 animation_frame='year', facet_col='weather_main',
                 # labels= applies to every facet, whereas update_layout(xaxis_title=...)
                 # would only rename the first facet's axis.
                 labels={'clouds_all_mean': 'Mean cloudiness per day (%)',
                         'humidity_mean': 'Mean humidity per day (%)',
                         'weather_main': 'Weather Situations'},
                 # Fix the axes so points stay visible in every animation frame.
                 # Both quantities are labelled as percentages elsewhere in this
                 # notebook - TODO confirm the 0-100 range.
                 range_x=[-5, 105], range_y=[-5, 105])
fig.show()