# 2.2 Data Cleaning for Keras - SCALED data

## Contents
### 1. Import libraries and data sets
### 2. Data consistency checks
### 3. Handling missing values
### 4. Export cleaned data

## 1. Import libraries and data sets

In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter

In [2]:
# Path to the project folder.
# Set the ML_PROJECT_PATH environment variable to run the notebook on another
# machine; when it is not set, the original local folder is used.
path = os.environ.get('ML_PROJECT_PATH', r'/Users/susanwang/Documents/CF_ML/ML_Project')

### Weather data set (scaled)

In [3]:
# Load the scaled weather observations from the project's 'Data Sets' folder.
# index_col=False tells pandas not to use the first CSV column as the index.
weather_csv = os.path.join(path, 'Data Sets', 'weather_dataset_scaled.csv')
weather_df = pd.read_csv(weather_csv, index_col=False)

In [4]:
weather_df.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,0.660514,-0.02793,0.826097,-0.001949,-1.101066,-0.265148,-0.179228,-0.902918,...,-0.443701,0.761754,-1.299744,-0.806427,-0.088407,-0.024706,0.372147,-0.668215,-0.519743,-0.752237
1,19600102,1,0.244897,-0.02793,0.73576,-0.001949,-1.058108,1.65876,-0.179228,-0.810126,...,0.783085,1.18358,-1.262455,-1.042055,0.503361,-0.024706,-0.829285,-0.548046,-0.629054,-0.407141
2,19600103,1,1.07613,-0.02793,1.277781,-0.001949,-1.25142,0.155707,-0.179228,-1.065304,...,0.783085,1.18358,-0.432779,-1.136306,-0.396127,-0.024706,-1.0095,-0.067372,0.054135,-0.177078
3,19600104,1,-1.001953,-0.02793,1.458455,-0.001949,-0.821838,-0.445514,-0.179228,-0.114186,...,0.783085,0.480538,0.387574,-1.183432,0.669056,-0.024706,-1.039536,-0.998679,-0.164486,-0.838511
4,19600105,1,0.244897,-0.02793,1.729466,-0.001949,-0.746661,-0.164944,-0.179228,0.187388,...,-1.670486,-0.363113,1.72997,-0.794645,-0.49081,-0.024706,0.672505,-1.509396,-1.339569,-1.471186


### Answers data set (pleasant weather)

In [5]:
# Load the per-station "pleasant weather" labels from the project's 'Data Sets' folder.
# index_col=False tells pandas not to use the first CSV column as the index.
answers_csv = os.path.join(path, 'Data Sets', 'Dataset-Answers-Weather_Prediction_Pleasant_Weather.csv')
answers_df = pd.read_csv(answers_csv, index_col=False)

In [6]:
answers_df.head()

Unnamed: 0,DATE,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,19600101,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,19600102,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,19600103,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,19600104,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,19600105,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 2. Data consistency checks

In [7]:
# Drop the DATE and MONTH columns from weather_df.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: on a second run the columns are already gone and the
# drop is a no-op instead of raising KeyError.
weather_df = weather_df.drop(columns=['DATE', 'MONTH'], errors='ignore')
weather_df.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0.660514,-0.02793,0.826097,-0.001949,-1.101066,-0.265148,-0.179228,-0.902918,-0.528623,-0.845652,...,-0.443701,0.761754,-1.299744,-0.806427,-0.088407,-0.024706,0.372147,-0.668215,-0.519743,-0.752237
1,0.244897,-0.02793,0.73576,-0.001949,-1.058108,1.65876,-0.179228,-0.810126,-0.582946,-0.46245,...,0.783085,1.18358,-1.262455,-1.042055,0.503361,-0.024706,-0.829285,-0.548046,-0.629054,-0.407141
2,1.07613,-0.02793,1.277781,-0.001949,-1.25142,0.155707,-0.179228,-1.065304,-0.25701,-0.186545,...,0.783085,1.18358,-0.432779,-1.136306,-0.396127,-0.024706,-1.0095,-0.067372,0.054135,-0.177078
3,-1.001953,-0.02793,1.458455,-0.001949,-0.821838,-0.445514,-0.179228,-0.114186,-0.555784,-0.38581,...,0.783085,0.480538,0.387574,-1.183432,0.669056,-0.024706,-1.039536,-0.998679,-0.164486,-0.838511
4,0.244897,-0.02793,1.729466,-0.001949,-0.746661,-0.164944,-0.179228,0.187388,-1.003946,-1.075573,...,-1.670486,-0.363113,1.72997,-0.794645,-0.49081,-0.024706,0.672505,-1.509396,-1.339569,-1.471186


In [8]:
# Drop the DATE column from answers_df.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: on a second run the column is already gone and the
# drop is a no-op instead of raising KeyError.
answers_df = answers_df.drop(columns=['DATE'], errors='ignore')
answers_df.head()

Unnamed: 0,BASEL_pleasant_weather,BELGRADE_pleasant_weather,BUDAPEST_pleasant_weather,DEBILT_pleasant_weather,DUSSELDORF_pleasant_weather,HEATHROW_pleasant_weather,KASSEL_pleasant_weather,LJUBLJANA_pleasant_weather,MAASTRICHT_pleasant_weather,MADRID_pleasant_weather,MUNCHENB_pleasant_weather,OSLO_pleasant_weather,SONNBLICK_pleasant_weather,STOCKHOLM_pleasant_weather,VALENTIA_pleasant_weather
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Remove columns of the three stations not in answers_df: GDANSK, ROMA, and TOURS

In [9]:
# Working list of weather column names; later cells filter this list.
columns = list(weather_df.columns)

In [10]:
len(columns)

168

In [11]:
# Columns of the three stations that have no counterpart in answers_df,
# grouped per station and then combined in the order GDANSK, ROMA, TOURS.
gdansk_cols = [
    'GDANSK_cloud_cover',
    'GDANSK_humidity',
    'GDANSK_precipitation',
    'GDANSK_snow_depth',
    'GDANSK_temp_mean',
    'GDANSK_temp_min',
    'GDANSK_temp_max',
]
roma_cols = [
    'ROMA_cloud_cover',
    'ROMA_wind_speed',
    'ROMA_humidity',
    'ROMA_pressure',
    'ROMA_sunshine',
    'ROMA_temp_mean',
]
tours_cols = [
    'TOURS_wind_speed',
    'TOURS_humidity',
    'TOURS_pressure',
    'TOURS_global_radiation',
    'TOURS_precipitation',
    'TOURS_temp_mean',
    'TOURS_temp_min',
    'TOURS_temp_max',
]
to_remove = gdansk_cols + roma_cols + tours_cols

In [12]:
# Take the columns listed in to_remove out of the working column list.
# The list is rebuilt rather than mutated with list.remove(), so re-running this
# cell is a no-op instead of raising ValueError for names that are already gone.
# A misspelled name in to_remove is still reported, checked against the frame itself.
unknown = [col for col in to_remove if col not in weather_df.columns]
if unknown:
    raise ValueError(f'columns not found in weather_df: {unknown}')

excluded = set(to_remove)
columns = [col for col in columns if col not in excluded]

In [13]:
len(columns)

147

In [14]:
# Independent copy of weather_df restricted to the columns still in the working list.
df2 = weather_df.loc[:, columns].copy()

In [15]:
df2.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0.660514,-0.02793,0.826097,-0.001949,-1.101066,-0.265148,-0.179228,-0.902918,-0.528623,-0.845652,...,-0.443701,0.761754,-1.299744,-0.806427,-0.088407,-0.024706,0.372147,-0.668215,-0.519743,-0.752237
1,0.244897,-0.02793,0.73576,-0.001949,-1.058108,1.65876,-0.179228,-0.810126,-0.582946,-0.46245,...,0.783085,1.18358,-1.262455,-1.042055,0.503361,-0.024706,-0.829285,-0.548046,-0.629054,-0.407141
2,1.07613,-0.02793,1.277781,-0.001949,-1.25142,0.155707,-0.179228,-1.065304,-0.25701,-0.186545,...,0.783085,1.18358,-0.432779,-1.136306,-0.396127,-0.024706,-1.0095,-0.067372,0.054135,-0.177078
3,-1.001953,-0.02793,1.458455,-0.001949,-0.821838,-0.445514,-0.179228,-0.114186,-0.555784,-0.38581,...,0.783085,0.480538,0.387574,-1.183432,0.669056,-0.024706,-1.039536,-0.998679,-0.164486,-0.838511
4,0.244897,-0.02793,1.729466,-0.001949,-0.746661,-0.164944,-0.179228,0.187388,-1.003946,-1.075573,...,-1.670486,-0.363113,1.72997,-0.794645,-0.49081,-0.024706,0.672505,-1.509396,-1.339569,-1.471186


## 3. Handling missing values

In [16]:
# Show up to 200 rows so the per-column summaries below are not truncated.
pd.options.display.max_rows = 200

In [17]:
df2.isna().sum()

BASEL_cloud_cover              0
BASEL_wind_speed               0
BASEL_humidity                 0
BASEL_pressure                 0
BASEL_global_radiation         0
BASEL_precipitation            0
BASEL_snow_depth               0
BASEL_sunshine                 0
BASEL_temp_mean                0
BASEL_temp_min                 0
BASEL_temp_max                 0
BELGRADE_cloud_cover           0
BELGRADE_humidity              0
BELGRADE_pressure              0
BELGRADE_global_radiation      0
BELGRADE_precipitation         0
BELGRADE_sunshine              0
BELGRADE_temp_mean             0
BELGRADE_temp_min              0
BELGRADE_temp_max              0
BUDAPEST_cloud_cover           0
BUDAPEST_humidity              0
BUDAPEST_pressure              0
BUDAPEST_global_radiation      0
BUDAPEST_precipitation         0
BUDAPEST_sunshine              0
BUDAPEST_temp_mean             0
BUDAPEST_temp_min              0
BUDAPEST_temp_max              0
DEBILT_cloud_cover             0
DEBILT_win

In [18]:
answers_df.isna().sum()

BASEL_pleasant_weather         0
BELGRADE_pleasant_weather      0
BUDAPEST_pleasant_weather      0
DEBILT_pleasant_weather        0
DUSSELDORF_pleasant_weather    0
HEATHROW_pleasant_weather      0
KASSEL_pleasant_weather        0
LJUBLJANA_pleasant_weather     0
MAASTRICHT_pleasant_weather    0
MADRID_pleasant_weather        0
MUNCHENB_pleasant_weather      0
OSLO_pleasant_weather          0
SONNBLICK_pleasant_weather     0
STOCKHOLM_pleasant_weather     0
VALENTIA_pleasant_weather      0
dtype: int64

In [19]:
# Collect the observation type of every column (the part of the name after the
# first underscore, i.e. after the station prefix). Counting these shows which
# observation types are missing from some stations.
obs = [name.split('_', 1)[1] for name in columns]
obs

['cloud_cover',
 'wind_speed',
 'humidity',
 'pressure',
 'global_radiation',
 'precipitation',
 'snow_depth',
 'sunshine',
 'temp_mean',
 'temp_min',
 'temp_max',
 'cloud_cover',
 'humidity',
 'pressure',
 'global_radiation',
 'precipitation',
 'sunshine',
 'temp_mean',
 'temp_min',
 'temp_max',
 'cloud_cover',
 'humidity',
 'pressure',
 'global_radiation',
 'precipitation',
 'sunshine',
 'temp_mean',
 'temp_min',
 'temp_max',
 'cloud_cover',
 'wind_speed',
 'humidity',
 'pressure',
 'global_radiation',
 'precipitation',
 'sunshine',
 'temp_mean',
 'temp_min',
 'temp_max',
 'cloud_cover',
 'wind_speed',
 'humidity',
 'pressure',
 'global_radiation',
 'precipitation',
 'snow_depth',
 'sunshine',
 'temp_mean',
 'temp_min',
 'temp_max',
 'cloud_cover',
 'humidity',
 'pressure',
 'global_radiation',
 'precipitation',
 'snow_depth',
 'sunshine',
 'temp_mean',
 'temp_min',
 'temp_max',
 'wind_speed',
 'humidity',
 'pressure',
 'global_radiation',
 'precipitation',
 'sunshine',
 'temp_mean',

In [20]:
Counter(obs)

Counter({'global_radiation': 15,
         'precipitation': 15,
         'sunshine': 15,
         'temp_mean': 15,
         'temp_min': 15,
         'temp_max': 15,
         'cloud_cover': 14,
         'humidity': 14,
         'pressure': 14,
         'wind_speed': 9,
         'snow_depth': 6})

In [21]:
# wind_speed and snow_depth are missing for many weather stations, so these observations will be removed.
# three stations are each missing one of cloud_cover, humidity, or pressure. These will be filled in from nearby stations.

In [22]:
# Remove the wind_speed and snow_depth columns from the working column list.
# The list is rebuilt with a comprehension instead of calling columns.remove()
# inside a loop over columns: removing from a list while iterating over it skips
# the element that follows each removal, so two adjacent matches would leave the
# second one behind.
sparse_obs = ('wind_speed', 'snow_depth')
columns = [col for col in columns if col.split('_', 1)[1] not in sparse_obs]

columns

['BASEL_cloud_cover',
 'BASEL_humidity',
 'BASEL_pressure',
 'BASEL_global_radiation',
 'BASEL_precipitation',
 'BASEL_sunshine',
 'BASEL_temp_mean',
 'BASEL_temp_min',
 'BASEL_temp_max',
 'BELGRADE_cloud_cover',
 'BELGRADE_humidity',
 'BELGRADE_pressure',
 'BELGRADE_global_radiation',
 'BELGRADE_precipitation',
 'BELGRADE_sunshine',
 'BELGRADE_temp_mean',
 'BELGRADE_temp_min',
 'BELGRADE_temp_max',
 'BUDAPEST_cloud_cover',
 'BUDAPEST_humidity',
 'BUDAPEST_pressure',
 'BUDAPEST_global_radiation',
 'BUDAPEST_precipitation',
 'BUDAPEST_sunshine',
 'BUDAPEST_temp_mean',
 'BUDAPEST_temp_min',
 'BUDAPEST_temp_max',
 'DEBILT_cloud_cover',
 'DEBILT_humidity',
 'DEBILT_pressure',
 'DEBILT_global_radiation',
 'DEBILT_precipitation',
 'DEBILT_sunshine',
 'DEBILT_temp_mean',
 'DEBILT_temp_min',
 'DEBILT_temp_max',
 'DUSSELDORF_cloud_cover',
 'DUSSELDORF_humidity',
 'DUSSELDORF_pressure',
 'DUSSELDORF_global_radiation',
 'DUSSELDORF_precipitation',
 'DUSSELDORF_sunshine',
 'DUSSELDORF_temp_mean',


In [23]:
len(columns)

132

In [24]:
# New frame without the wind_speed and snow_depth observations (independent copy of df2).
df3 = df2.loc[:, columns].copy()

In [25]:
df3.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,STOCKHOLM_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0.660514,0.826097,-0.001949,-1.101066,-0.265148,-0.902918,-0.528623,-0.845652,-0.478356,-1.206433,...,-0.639538,-0.443701,0.761754,-1.299744,-0.806427,-0.088407,0.372147,-0.668215,-0.519743,-0.752237
1,0.244897,0.73576,-0.001949,-1.058108,1.65876,-0.810126,-0.582946,-0.46245,-0.569988,0.652846,...,-0.62855,0.783085,1.18358,-1.262455,-1.042055,0.503361,-0.829285,-0.548046,-0.629054,-0.407141
2,1.07613,1.277781,-0.001949,-1.25142,0.155707,-1.065304,-0.25701,-0.186545,-0.592896,0.652846,...,-0.727444,0.783085,1.18358,-0.432779,-1.136306,-0.396127,-1.0095,-0.067372,0.054135,-0.177078
3,-1.001953,1.458455,-0.001949,-0.821838,-0.445514,-0.114186,-0.555784,-0.38581,-0.512718,1.396557,...,-0.925231,0.783085,0.480538,0.387574,-1.183432,0.669056,-1.039536,-0.998679,-0.164486,-0.838511
4,0.244897,1.729466,-0.001949,-0.746661,-0.164944,0.187388,-1.003946,-1.075573,-1.039603,1.396557,...,-0.705467,-1.670486,-0.363113,1.72997,-0.794645,-0.49081,0.672505,-1.509396,-1.339569,-1.471186


In [26]:
df3.shape

(22950, 132)

In [27]:
# Print every remaining column that measures cloud_cover, humidity or pressure,
# so the stations lacking one of these observations can be spotted in the listing.
partial_obs = ('cloud_cover', 'humidity', 'pressure')
for col in columns:
    if col.split('_', 1)[1] not in partial_obs:
        continue
    print(col)

BASEL_cloud_cover
BASEL_humidity
BASEL_pressure
BELGRADE_cloud_cover
BELGRADE_humidity
BELGRADE_pressure
BUDAPEST_cloud_cover
BUDAPEST_humidity
BUDAPEST_pressure
DEBILT_cloud_cover
DEBILT_humidity
DEBILT_pressure
DUSSELDORF_cloud_cover
DUSSELDORF_humidity
DUSSELDORF_pressure
HEATHROW_cloud_cover
HEATHROW_humidity
HEATHROW_pressure
KASSEL_humidity
KASSEL_pressure
LJUBLJANA_cloud_cover
LJUBLJANA_humidity
LJUBLJANA_pressure
MAASTRICHT_cloud_cover
MAASTRICHT_humidity
MAASTRICHT_pressure
MADRID_cloud_cover
MADRID_humidity
MADRID_pressure
MUNCHENB_cloud_cover
MUNCHENB_humidity
OSLO_cloud_cover
OSLO_humidity
OSLO_pressure
SONNBLICK_cloud_cover
SONNBLICK_humidity
SONNBLICK_pressure
STOCKHOLM_cloud_cover
STOCKHOLM_pressure
VALENTIA_cloud_cover
VALENTIA_humidity
VALENTIA_pressure


In [28]:
# Kassel is missing cloud_cover. MunchenB is missing pressure. Stockholm is missing humidity.
# Fill in each gap with the same observation from a nearby station: Ljubljana (for Kassel), Sonnblick (for MunchenB), Oslo (for Stockholm).

In [29]:
# Fill each station's missing observation by copying the same observation from
# another station: target column -> donor column.
# NOTE(review): the new columns are appended at the end of df3 rather than next
# to their station's other columns — confirm downstream code does not rely on
# station-grouped column order.
donor_columns = {
    'KASSEL_cloud_cover': 'LJUBLJANA_cloud_cover',
    'MUNCHENB_pressure': 'SONNBLICK_pressure',
    'STOCKHOLM_humidity': 'OSLO_humidity',
}
for target, donor in donor_columns.items():
    df3[target] = df3[donor]

In [30]:
df3.head()

Unnamed: 0,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,...,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max,KASSEL_cloud_cover,MUNCHENB_pressure,STOCKHOLM_humidity
0,0.660514,0.826097,-0.001949,-1.101066,-0.265148,-0.902918,-0.528623,-0.845652,-0.478356,-1.206433,...,-1.299744,-0.806427,-0.088407,0.372147,-0.668215,-0.519743,-0.752237,1.205492,0.095234,1.544023
1,0.244897,0.73576,-0.001949,-1.058108,1.65876,-0.810126,-0.582946,-0.46245,-0.569988,0.652846,...,-1.262455,-1.042055,0.503361,-0.829285,-0.548046,-0.629054,-0.407141,0.371461,0.067319,-0.712374
2,1.07613,1.277781,-0.001949,-1.25142,0.155707,-1.065304,-0.25701,-0.186545,-0.592896,0.652846,...,-0.432779,-1.136306,-0.396127,-1.0095,-0.067372,0.054135,-0.177078,1.205492,0.132454,-0.27363
3,-1.001953,1.458455,-0.001949,-0.821838,-0.445514,-0.114186,-0.555784,-0.38581,-0.512718,1.396557,...,0.387574,-1.183432,0.669056,-1.039536,-0.998679,-0.164486,-0.838511,0.371461,0.418586,1.544023
4,0.244897,1.729466,-0.001949,-0.746661,-0.164944,0.187388,-1.003946,-1.075573,-1.039603,1.396557,...,1.72997,-0.794645,-0.49081,0.672505,-1.509396,-1.339569,-1.471186,0.788477,0.388345,1.418668


In [31]:
df3.shape

(22950, 135)

In [32]:
answers_df.shape

(22950, 15)

## 4. Export cleaned data

In [33]:
# Write the cleaned, scaled weather data to the project's 'Data Sets' folder,
# leaving the DataFrame index out of the file.
export_path = os.path.join(path, 'Data Sets', 'scaled_cleaned_weather.csv')
df3.to_csv(export_path, index=False)