# Data preprocessing

## Check the dataset

Import the packages

In [49]:
import pandas as pd
import numpy as np

In [50]:
# Load the weather dataset from its CSV file into a DataFrame.
df = pd.read_csv("../OriginalDataset/data_weather.csv")

In [51]:
# Preview the first three rows of the loaded data.
df.head(3)

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,19790101,2.0,7.0,52.0,2.3,-4.1,-7.5,0.4,101900.0,9.0
1,19790102,6.0,1.7,27.0,1.6,-2.6,-7.5,0.0,102530.0,8.0
2,19790103,5.0,0.0,13.0,1.3,-2.8,-7.2,0.0,102050.0,4.0


In [52]:
# Show the dtype pandas inferred for each column.
print(df.dtypes)

date                  int64
cloud_cover         float64
sunshine            float64
global_radiation    float64
max_temp            float64
mean_temp           float64
min_temp            float64
precipitation       float64
pressure            float64
snow_depth          float64
dtype: object


Find the number of null values for each feature

In [53]:
# Per-column count of missing (NaN) entries.
null_values = df.isna().sum()
print(null_values)

date                   0
cloud_cover           19
sunshine               0
global_radiation      19
max_temp               6
mean_temp             36
min_temp               2
precipitation          6
pressure               4
snow_depth          1441
dtype: int64


Convert the `date` column from integer `YYYYMMDD` values to pandas datetimes

In [54]:
# Parse the YYYYMMDD-encoded values into pandas datetimes, then preview
# the converted column.
parsed_dates = pd.to_datetime(df['date'], format='%Y%m%d')
df['date'] = parsed_dates
df['date'].head(3)

0   1979-01-01
1   1979-01-02
2   1979-01-03
Name: date, dtype: datetime64[ns]

## Data imputation

### Impute cloud_cover

In [60]:
# Fill missing cloud_cover values by linear interpolation between
# neighbouring rows (filling in both directions), then round the whole
# column to one decimal place.
df['cloud_cover'] = (
    df['cloud_cover']
    .interpolate(method='linear', limit_direction='both')
    .round(1)
)

# Number of NaNs left in the column after imputation.
print(df['cloud_cover'].isna().sum())

0


### Impute global_radiation

In [61]:
# Fill missing global_radiation values by linear interpolation between
# neighbouring rows (filling in both directions), then round the whole
# column to one decimal place.
df['global_radiation'] = (
    df['global_radiation']
    .interpolate(method='linear', limit_direction='both')
    .round(1)
)

# Number of NaNs left in the column after imputation.
print(df['global_radiation'].isna().sum())


0


### Impute precipitation

In [62]:
# Fill missing precipitation values by linear interpolation between
# neighbouring rows (filling in both directions), then round the whole
# column to one decimal place.
df['precipitation'] = (
    df['precipitation']
    .interpolate(method='linear', limit_direction='both')
    .round(1)
)

# Number of NaNs left in the column after imputation.
print(df['precipitation'].isna().sum())

0


### Impute pressure

In [76]:
# Fill missing pressure values by linear interpolation between
# neighbouring rows (filling in both directions), then round the whole
# column to one decimal place.
df['pressure'] = (
    df['pressure']
    .interpolate(method='linear', limit_direction='both')
    .round(1)
)

# Number of NaNs left in the column after imputation.
print(df['pressure'].isna().sum())

0


### Impute temperature

Impute min_temp

In [63]:
# Fill missing min_temp values by linear interpolation between
# neighbouring rows (filling in both directions), then round the whole
# column to one decimal place.
df['min_temp'] = (
    df['min_temp']
    .interpolate(method='linear', limit_direction='both')
    .round(1)
)

# Number of NaNs left in the column after imputation.
print(df['min_temp'].isna().sum())

0


Impute max_temp

In [64]:
# Fill missing max_temp values by linear interpolation between
# neighbouring rows (filling in both directions), then round the whole
# column to one decimal place.
df['max_temp'] = (
    df['max_temp']
    .interpolate(method='linear', limit_direction='both')
    .round(1)
)

# Number of NaNs left in the column after imputation.
print(df['max_temp'].isna().sum())

0


Impute mean_temp

In [78]:
# Impute each missing mean_temp with the midpoint of that row's min_temp
# and max_temp. The midpoint is computed column-wise and applied only where
# mean_temp is NaN, which replaces a row-by-row iterrows() loop with a
# single vectorised fillna and leaves existing mean_temp values untouched.
temp_midpoint = (df['min_temp'] + df['max_temp']) / 2
df['mean_temp'] = df['mean_temp'].fillna(temp_midpoint)

# Round the whole column to one decimal place, then report how many NaNs
# remain (a row stays NaN only if its min_temp or max_temp is itself NaN).
df['mean_temp'] = df['mean_temp'].round(1)
print(df['mean_temp'].isnull().sum())

0


### Impute snow_depth