# Lab 1

## Imports

In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import altair as alt

In [15]:
# Read the weather CSV and discard the sunrise/sunset columns right away,
# then report the (rows, columns) shape of what is left.
weather_csv_path = "data/van_weather_1990-01-01_2023-11-06.csv"
precipit_df = pd.read_csv(weather_csv_path).drop(columns=["sunrise", "sunset"])
precipit_df.shape

(12363, 17)

### Column description
- `date`: date of the record
- `weather_code`: The most severe weather condition on a given day
- `temperature_2m_max`: Maximum daily air temperature at 2 meters above ground (°C)
- `temperature_2m_min`: Minimum daily air temperature at 2 meters above ground (°C)
- `temperature_2m_mean`: Mean daily air temperature at 2 meters above ground (°C)
- `apparent_temperature_max`: Maximum daily apparent temperature (°C)
- `apparent_temperature_min`: Minimum daily apparent temperature (°C)
- `apparent_temperature_mean`: Mean daily apparent temperature (°C)
- `sunrise`: Sunrise time (dropped when the data is loaded)
- `sunset`: Sunset time (dropped when the data is loaded)
- `precipitation_sum`: Sum of daily precipitation (including rain, showers and snowfall) (mm)
- `rain_sum`: Sum of daily rain (mm)
- `snowfall_sum`: Sum of daily snowfall (mm)
- `precipitation_hours`: The number of hours with rain (h)
- `wind_speed_10m_max`: Maximum wind speed on a day (km/h)
- `wind_gusts_10m_max`: Maximum wind gusts on a day (km/h)
- `wind_direction_10m_dominant`: Dominant wind direction (°)
- `shortwave_radiation_sum`: The sum of solar radiation on a given day in megajoules (MJ/m²)
- `et0_fao_evapotranspiration`: Daily sum of ET₀ Reference Evapotranspiration of a well watered grass field (mm)

In [16]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_frames = train_test_split(
    precipit_df,
    test_size=0.2,
    random_state=522,
)
train_df, test_df = split_frames
train_df.shape

(9890, 17)

In [17]:
# Print each column's dtype and non-null count to check for missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9890 entries, 4428 to 12180
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   date                         9890 non-null   object 
 1   weather_code                 9890 non-null   float64
 2   temperature_2m_max           9890 non-null   float64
 3   temperature_2m_min           9890 non-null   float64
 4   temperature_2m_mean          9890 non-null   float64
 5   apparent_temperature_max     9890 non-null   float64
 6   apparent_temperature_min     9890 non-null   float64
 7   apparent_temperature_mean    9890 non-null   float64
 8   precipitation_sum            9890 non-null   float64
 9   rain_sum                     9890 non-null   float64
 10  snowfall_sum                 9890 non-null   float64
 11  precipitation_hours          9890 non-null   float64
 12  wind_speed_10m_max           9890 non-null   float64
 13  wind_gusts_10m_max 

In [18]:
# Display the training frame to inspect a sample of its rows.
train_df

Unnamed: 0,date,weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
4428,2002-02-15,1.0,9.609500,-0.540500,3.924084,7.568454,-3.726679,1.283807,0.000000,0.000000,0.00,0.0,7.594207,14.759999,7.625911,10.56,1.169531
1182,1993-03-28,3.0,11.409499,0.859500,6.765750,9.362197,-2.034344,4.563612,0.000000,0.000000,0.00,0.0,7.636753,20.880001,144.196550,18.07,2.278618
12221,2023-06-18,53.0,14.380500,10.680500,12.353417,12.619253,7.224102,9.728248,1.200000,1.200000,0.00,5.0,21.398056,37.079998,272.093350,26.07,3.796273
4197,2001-06-29,1.0,18.709501,9.159499,14.947000,19.335688,7.927686,14.373271,0.000000,0.000000,0.00,0.0,8.311245,21.960000,235.980600,31.24,4.915466
7352,2010-02-17,2.0,10.409499,1.059500,5.453250,7.388372,-2.386003,2.310742,0.000000,0.000000,0.00,0.0,12.758432,27.720000,349.729030,11.33,1.357648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8346,2012-11-07,53.0,8.609500,5.359500,6.917832,6.072889,2.397108,4.108495,1.500000,1.500000,0.00,4.0,14.982228,29.160000,177.179060,6.29,0.776031
4426,2002-02-13,3.0,5.659500,1.159500,3.584500,2.348793,-1.580687,0.606601,0.000000,0.000000,0.00,0.0,10.883676,22.319998,183.002850,7.37,0.797646
3360,1999-03-15,73.0,4.709500,1.259500,2.744917,1.333366,-2.197767,-0.306053,6.000000,4.800000,0.84,10.0,10.137692,21.960000,134.169750,7.78,0.952280
1899,1995-03-15,55.0,9.859500,3.709500,6.849083,6.388447,1.092325,3.629564,6.399999,6.399999,0.00,14.0,15.716793,33.120000,175.292250,10.26,1.304759


In [19]:
# Use the VegaFusion data transformer for the charts below.
alt.data_transformers.enable("vegafusion")

# Names of every numeric column in the training set; each one gets a histogram.
numeric_cols = list(train_df.select_dtypes(include="number").columns)

# X channel: whichever column the repeat operator is currently on,
# treated as quantitative and binned into at most 40 bins.
repeated_x = alt.X(alt.repeat()).type("quantitative").bin(maxbins=40)

# Template for a single small histogram (bar height = row count per bin).
hist_template = (
    alt.Chart(train_df)
    .mark_bar()
    .encode(repeated_x, y="count()")
    .properties(height=100, width=200)
)

# Repeat the template once per numeric column, laid out four per row.
numeric_cols_hists = hist_template.repeat(numeric_cols, columns=4)

# Show the plot
numeric_cols_hists

In [20]:
# Pairwise Spearman rank correlations between the numeric columns,
# rendered as a table with a "gist_yarg" background gradient.
spearman_corr = train_df[numeric_cols].corr(method="spearman")
spearman_corr.style.background_gradient(cmap="gist_yarg")

Unnamed: 0,weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
weather_code,1.0,-0.468086,-0.254281,-0.381574,-0.473742,-0.290097,-0.39971,0.915229,0.882139,0.411835,0.890523,0.528646,0.503202,-0.294143,-0.62759,-0.592634
temperature_2m_max,-0.468086,1.0,0.920242,0.983005,0.993839,0.929724,0.980965,-0.407699,-0.371515,-0.367479,-0.420198,-0.258157,-0.138606,0.403543,0.786627,0.85683
temperature_2m_min,-0.254281,0.920242,1.0,0.969825,0.917807,0.992868,0.963089,-0.185242,-0.148832,-0.344227,-0.194671,-0.11359,0.001677,0.342697,0.59703,0.693309
temperature_2m_mean,-0.381574,0.983005,0.969825,1.0,0.979155,0.973975,0.995488,-0.315611,-0.278985,-0.364368,-0.32679,-0.196726,-0.074092,0.398733,0.721798,0.803912
apparent_temperature_max,-0.473742,0.993839,0.917807,0.979155,1.0,0.932888,0.985261,-0.415765,-0.379452,-0.372051,-0.425856,-0.308654,-0.194735,0.394857,0.778129,0.842158
apparent_temperature_min,-0.290097,0.929724,0.992868,0.973975,0.932888,1.0,0.975583,-0.227715,-0.190693,-0.35782,-0.235249,-0.185976,-0.066475,0.363255,0.619747,0.70707
apparent_temperature_mean,-0.39971,0.980965,0.963089,0.995488,0.985261,0.975583,1.0,-0.338296,-0.301414,-0.371942,-0.347147,-0.260338,-0.14019,0.398931,0.721855,0.796812
precipitation_sum,0.915229,-0.407699,-0.185242,-0.315611,-0.415765,-0.227715,-0.338296,1.0,0.987924,0.234016,0.982372,0.564507,0.542141,-0.265462,-0.621872,-0.583486
rain_sum,0.882139,-0.371515,-0.148832,-0.278985,-0.379452,-0.190693,-0.301414,0.987924,1.0,0.136321,0.96948,0.549402,0.530319,-0.244963,-0.59953,-0.563467
snowfall_sum,0.411835,-0.367479,-0.344227,-0.364368,-0.372051,-0.35782,-0.371942,0.234016,0.136321,1.0,0.239328,0.187032,0.157185,-0.172156,-0.257012,-0.25352


Since the temperature columns are all highly correlated with one another (Spearman correlation above 0.9 for every pair in the table above), a single temperature feature is enough for our analysis. Here I chose to use `temperature_2m_mean`.