# 1.2 Cleaning and Scaling Data Set

## Contents
### 1. Import libraries and data set
### 2. Understand data set
### 3. Clean data set
### 4. Scaling
### 5. Export scaled data set

## 1. Import libraries and data set

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import os
import sklearn
from sklearn.preprocessing import StandardScaler

In [2]:
# create path to project folder
path = r'/Users/susanwang/Documents/CF_ML/ML_Project'

In [3]:
# open data set
df = pd.read_csv(os.path.join(path, 'Data Sets', 'Dataset-weather-prediction-dataset-processed.csv'), index_col=False)

### 2. Understand data set

In [4]:
df.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [5]:
df.tail()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
22945,20221027,10,1,2.1,0.79,1.0248,1.34,0.22,0,7.7,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5
22946,20221028,10,6,2.1,0.77,1.0244,1.34,0.22,0,5.4,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5
22947,20221029,10,4,2.1,0.76,1.0227,1.34,0.22,0,6.1,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5
22948,20221030,10,5,2.1,0.8,1.0212,1.34,0.22,0,5.8,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5
22949,20221031,10,5,2.1,0.84,1.0193,1.34,0.22,0,3.2,...,5,0.82,1.0142,1.13,0.41,0,3.4,10.7,7.9,13.5


In [6]:
df.columns

Index(['DATE', 'MONTH', 'BASEL_cloud_cover', 'BASEL_wind_speed',
       'BASEL_humidity', 'BASEL_pressure', 'BASEL_global_radiation',
       'BASEL_precipitation', 'BASEL_snow_depth', 'BASEL_sunshine',
       ...
       'VALENTIA_cloud_cover', 'VALENTIA_humidity', 'VALENTIA_pressure',
       'VALENTIA_global_radiation', 'VALENTIA_precipitation',
       'VALENTIA_snow_depth', 'VALENTIA_sunshine', 'VALENTIA_temp_mean',
       'VALENTIA_temp_min', 'VALENTIA_temp_max'],
      dtype='object', length=170)

170 columns, presumably the weather variables for the 18 different weather stations in Europe. Date ranges from 1960 to 2022.

In [7]:
df.shape

(22950, 170)

## 3. Clean data set

### Check for missing values

In [8]:
df.isnull().sum()

DATE                   0
MONTH                  0
BASEL_cloud_cover      0
BASEL_wind_speed       0
BASEL_humidity         0
                      ..
VALENTIA_snow_depth    0
VALENTIA_sunshine      0
VALENTIA_temp_mean     0
VALENTIA_temp_min      0
VALENTIA_temp_max      0
Length: 170, dtype: int64

In [9]:
# try to display all answers
pd.options.display.max_rows = 200
df.isnull().sum()

DATE                           0
MONTH                          0
BASEL_cloud_cover              0
BASEL_wind_speed               0
BASEL_humidity                 0
BASEL_pressure                 0
BASEL_global_radiation         0
BASEL_precipitation            0
BASEL_snow_depth               0
BASEL_sunshine                 0
BASEL_temp_mean                0
BASEL_temp_min                 0
BASEL_temp_max                 0
BELGRADE_cloud_cover           0
BELGRADE_humidity              0
BELGRADE_pressure              0
BELGRADE_global_radiation      0
BELGRADE_precipitation         0
BELGRADE_sunshine              0
BELGRADE_temp_mean             0
BELGRADE_temp_min              0
BELGRADE_temp_max              0
BUDAPEST_cloud_cover           0
BUDAPEST_humidity              0
BUDAPEST_pressure              0
BUDAPEST_global_radiation      0
BUDAPEST_precipitation         0
BUDAPEST_sunshine              0
BUDAPEST_temp_mean             0
BUDAPEST_temp_min              0
BUDAPEST_t

No missing values.

### Check for duplicates

In [10]:
df_dups = df[df.duplicated()]
df_dups

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max


No duplicates.

## Checking for outliers

In [12]:
pd.options.display.max_columns = None

In [13]:
df.describe()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,BASEL_temp_max,BELGRADE_cloud_cover,BELGRADE_humidity,BELGRADE_pressure,BELGRADE_global_radiation,BELGRADE_precipitation,BELGRADE_sunshine,BELGRADE_temp_mean,BELGRADE_temp_min,BELGRADE_temp_max,BUDAPEST_cloud_cover,BUDAPEST_humidity,BUDAPEST_pressure,BUDAPEST_global_radiation,BUDAPEST_precipitation,BUDAPEST_sunshine,BUDAPEST_temp_mean,BUDAPEST_temp_min,BUDAPEST_temp_max,DEBILT_cloud_cover,DEBILT_wind_speed,DEBILT_humidity,DEBILT_pressure,DEBILT_global_radiation,DEBILT_precipitation,DEBILT_sunshine,DEBILT_temp_mean,DEBILT_temp_min,DEBILT_temp_max,DUSSELDORF_cloud_cover,DUSSELDORF_wind_speed,DUSSELDORF_humidity,DUSSELDORF_pressure,DUSSELDORF_global_radiation,DUSSELDORF_precipitation,DUSSELDORF_snow_depth,DUSSELDORF_sunshine,DUSSELDORF_temp_mean,DUSSELDORF_temp_min,DUSSELDORF_temp_max,GDANSK_cloud_cover,GDANSK_humidity,GDANSK_precipitation,GDANSK_snow_depth,GDANSK_temp_mean,GDANSK_temp_min,GDANSK_temp_max,HEATHROW_cloud_cover,HEATHROW_humidity,HEATHROW_pressure,HEATHROW_global_radiation,HEATHROW_precipitation,HEATHROW_snow_depth,HEATHROW_sunshine,HEATHROW_temp_mean,HEATHROW_temp_min,HEATHROW_temp_max,KASSEL_wind_speed,KASSEL_humidity,KASSEL_pressure,KASSEL_global_radiation,KASSEL_precipitation,KASSEL_sunshine,KASSEL_temp_mean,KASSEL_temp_min,KASSEL_temp_max,LJUBLJANA_cloud_cover,LJUBLJANA_wind_speed,LJUBLJANA_humidity,LJUBLJANA_pressure,LJUBLJANA_global_radiation,LJUBLJANA_precipitation,LJUBLJANA_sunshine,LJUBLJANA_temp_mean,LJUBLJANA_temp_min,LJUBLJANA_temp_max,MAASTRICHT_cloud_cover,MAASTRICHT_wind_speed,MAASTRICHT_humidity,MAASTRICHT_pressure,MAASTRICHT_global_radiation,MAASTRICHT_precipitation,MAASTRICHT_sunshine,MAASTRICHT_temp_mean,MAASTRICHT_temp_min,MAASTRICHT_temp_max,MADRID_cloud_cover,MADRID_wind_speed,MADRID_humidity,MADRID_pressure,MADRID_global_radiation,MADRID_precipitation,MADRID_sunshine,MADRID_temp_mean,MADRID_temp_min,MADRID_temp_max,MUNCHENB_cloud_cover,MUNCHENB_humidity,MUNCHENB_global_radiation,MUNCHENB_precipitation,MUNCHENB_snow_depth,MUNCHENB_sunshine,MUNCHENB_temp_mean,MUNCHENB_temp_min,MUNCHENB_temp_max,OSLO_cloud_cover,OSLO_wind_speed,OSLO_humidity,OSLO_pressure,OSLO_global_radiation,OSLO_precipitation,OSLO_snow_depth,OSLO_sunshine,OSLO_temp_mean,OSLO_temp_min,OSLO_temp_max,ROMA_cloud_cover,ROMA_wind_speed,ROMA_humidity,ROMA_pressure,ROMA_sunshine,ROMA_temp_mean,SONNBLICK_cloud_cover,SONNBLICK_wind_speed,SONNBLICK_humidity,SONNBLICK_pressure,SONNBLICK_global_radiation,SONNBLICK_precipitation,SONNBLICK_sunshine,SONNBLICK_temp_mean,SONNBLICK_temp_min,SONNBLICK_temp_max,STOCKHOLM_cloud_cover,STOCKHOLM_pressure,STOCKHOLM_global_radiation,STOCKHOLM_precipitation,STOCKHOLM_sunshine,STOCKHOLM_temp_mean,STOCKHOLM_temp_min,STOCKHOLM_temp_max,TOURS_wind_speed,TOURS_humidity,TOURS_pressure,TOURS_global_radiation,TOURS_precipitation,TOURS_temp_mean,TOURS_temp_min,TOURS_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
count,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0,22950.0
mean,19909840.0,6.50963,5.410763,2.120462,0.758554,1.018013,1.345244,0.222305,0.359564,4.592222,10.392471,6.317015,15.076322,4.244357,0.683002,1.017073,1.57336,0.189349,5.77078,12.659935,8.521094,17.421516,4.516383,0.671107,1.017037,1.417872,0.149238,5.416889,11.743922,7.971168,16.225569,5.455948,3.412349,0.814856,1.015293,1.13802,0.22738,4.398658,10.022754,5.875198,13.999904,5.241786,3.918022,0.757502,1.016148,1.135907,0.207578,0.170109,4.186519,10.718671,6.626824,14.754052,4.793115,0.707963,0.129617,1.485054,8.053481,4.66315,11.665163,5.304096,0.771252,1.015389,1.182957,0.166267,0.043529,4.25044,11.209394,7.30573,15.110401,2.4002,0.771186,1.016101,1.143839,0.193428,4.026928,9.138636,5.411198,13.116192,5.109237,1.419752,0.75942,1.017367,1.355344,0.377832,5.046667,10.720702,6.366423,15.619508,5.410109,4.197538,0.800047,1.016009,1.171199,0.208844,4.28417,10.147098,6.205342,14.110597,3.69146,2.233839,0.585625,1.017486,1.894844,0.120644,7.71041,14.931952,10.052318,19.812536,5.227451,0.762837,1.374096,0.262379,1.208192,4.906118,9.01454,4.751996,13.613843,5.410588,2.733556,0.733657,1.011602,1.067922,0.218228,5.657211,4.692235,6.437512,3.061041,10.297634,3.090588,2.617346,0.731188,1.015203,7.110601,15.404431,5.101046,7.128392,0.841363,1.026306,1.568325,0.476219,4.720449,-5.049882,-7.311686,-2.767651,5.170719,1.011493,1.116964,0.149542,5.056972,7.343521,4.37498,10.720227,3.85051,0.765129,1.016905,1.544676,0.185527,11.640279,7.319233,15.962784,5.723355,0.825824,1.014242,1.13449,0.414698,0.00122,3.460989,10.724257,7.901904,13.515752
std,181383.3,3.443672,2.406115,0.732625,0.110699,0.006543,0.931158,0.498995,2.006231,4.310808,7.363575,6.524121,8.730759,2.689274,0.14029,0.007543,0.933023,0.509952,4.328599,8.811427,7.721044,10.031285,2.023171,0.10965,0.006254,0.874261,0.432604,4.030315,8.501127,7.478217,9.786618,2.163247,1.541611,0.098213,0.009812,0.892257,0.455877,4.083084,6.320359,5.790673,7.298353,2.237902,1.81754,0.115862,0.00853,0.877636,0.417016,1.295166,4.125114,6.790568,6.162739,7.92871,2.433263,0.305789,0.371517,5.52507,7.819785,7.431544,8.68736,2.034997,0.104123,0.010453,0.886817,0.381006,0.589519,3.992156,5.723807,5.322685,6.558114,1.188282,0.10851,0.008359,0.807578,0.383457,3.747028,6.772219,5.999503,7.96624,2.398047,0.832357,0.129023,0.007706,0.959996,0.929656,4.391513,8.258498,7.347372,9.579308,2.259805,1.927422,0.111872,0.009301,0.903829,0.440237,4.050447,6.665457,5.99298,7.760105,2.699564,1.574226,0.177192,0.00741,0.958658,0.382177,4.146869,7.41906,6.439893,8.626836,2.424283,0.124359,0.943718,0.573925,3.942367,4.468089,7.957285,7.221321,9.160389,2.329198,1.550512,0.15955,0.012082,0.962612,0.478747,12.034542,4.606306,8.240906,7.7821,9.261002,0.934629,1.448022,0.051115,0.002809,1.66699,5.939003,2.567581,3.962556,0.17693,0.042988,0.874569,0.717434,4.398295,6.837712,6.985092,6.827926,2.840534,0.026884,0.988751,0.355772,4.990732,8.038386,7.488771,9.100872,1.69488,0.173448,0.007986,1.381986,0.410645,6.374588,5.632276,7.595673,1.630313,0.071121,0.010727,0.848813,0.844943,0.049383,3.329432,3.328727,3.659393,3.477373
min,19600100.0,1.0,0.0,0.0,0.35,0.9747,0.01,0.0,-2.0,0.0,-18.5,-23.3,-14.0,0.0,0.23,0.9787,0.12,0.0,0.0,-17.3,-21.0,-14.3,0.0,0.24,0.9765,0.12,0.0,0.0,-16.5,-18.1,-13.2,0.0,0.0,0.31,0.9621,0.01,0.0,0.0,-13.2,-18.9,-10.6,0.0,0.2,0.26,0.9647,0.0,0.0,0.0,0.0,-14.6,-20.8,-10.9,0.0,0.0,0.0,0.0,-23.8,-27.4,-19.0,0.0,0.33,0.9596,0.02,0.0,0.0,0.0,-8.9,-13.2,-6.2,0.0,0.31,0.9664,0.12,0.0,0.0,-15.8,-19.9,-12.9,0.0,0.0,0.27,0.9728,0.0,0.0,0.0,-15.5,-20.3,-12.1,0.0,0.0,0.29,0.9661,0.06,0.0,0.0,-14.9,-19.3,-12.3,0.0,0.0,0.14,0.9842,0.12,0.0,0.0,-4.5,-9.2,-0.8,0.0,0.2,0.17,0.0,0.0,0.0,-22.5,-27.6,-17.8,0.0,0.0,0.19,0.9545,-0.1,0.0,0.0,0.0,-22.0,-24.9,-20.5,0.0,0.0,0.18,0.9829,0.0,-5.6,0.0,0.0,0.07,0.6698,-0.12,0.0,0.0,-31.8,-34.3,-31.2,-99.0,-0.099,0.02,0.0,-1.7,-23.9,-25.5,-22.1,0.0,0.0,0.9657,0.0,0.0,-12.6,-18.5,-10.7,0.0,0.38,0.9551,0.02,0.0,0.0,0.0,-3.5,-7.3,-1.5
25%,19750920.0,4.0,4.0,2.1,0.68,1.0158,0.54,0.0,0.0,0.4,4.7,1.4,8.3,2.0,0.58,1.0124,0.75,0.0,1.3,5.7,2.5,9.5,4.0,0.63,1.015,0.64,0.0,1.6,4.9,2.1,8.0,4.0,2.3,0.76,1.0094,0.37,0.0,0.5,5.5,1.8,8.6,4.0,2.5,0.69,1.0118,0.37,0.0,0.0,0.3,5.7,2.3,8.7,3.0,0.72,0.0,0.0,2.2,-0.1,4.7,4.0,0.7,1.0091,0.41,0.0,0.0,0.4,6.8,3.3,10.2,1.6,0.71,1.0117,0.43,0.0,0.5,4.6,1.6,7.5,3.0,0.9,0.66,1.0127,0.47,0.0,0.3,3.9,0.5,7.9,4.0,2.7,0.73,1.0104,0.39,0.0,0.5,5.3,2.0,8.3,1.0,1.1,0.44,1.0128,1.09,0.0,4.7,8.7,5.0,12.4,4.0,0.68,0.56,0.0,0.0,0.4,2.9,-0.3,6.5,4.0,1.7,0.62,1.0041,0.2,0.0,0.0,0.0,0.4,-2.1,2.8,3.0,1.8,0.73,1.0152,7.1,11.1,3.0,4.5,0.8,1.0263,0.85,0.0,0.0,-9.8,-12.0,-7.4,4.0,1.0051,0.21,0.0,0.1,1.3,-0.7,3.4,2.6,0.71,1.013,0.58,0.0,7.1,3.225,10.5,5.0,0.79,1.0094,0.4,0.01,0.0,0.5,8.8,6.1,11.3
50%,19910600.0,7.0,6.0,2.1,0.77,1.018,1.13,0.0,0.0,3.6,10.7,6.5,15.4,4.0,0.68,1.0166,1.57,0.0,5.7,13.4,9.2,18.5,4.0,0.67,1.017,1.41,0.0,5.4,11.8,8.0,16.4,6.0,3.1,0.83,1.0159,0.93,0.01,3.5,10.2,6.1,14.0,6.0,3.6,0.77,1.0161,0.91,0.01,0.0,3.1,10.9,6.9,14.8,5.0,0.82,0.0,0.0,8.0,4.8,11.6,6.0,0.78,1.0163,0.95,0.0,0.0,3.4,11.2,7.5,14.8,2.4,0.77,1.0161,1.14,0.04,4.0,9.1,5.4,13.1,6.0,1.3,0.77,1.0172,1.15,0.0,4.7,11.2,7.0,16.3,6.0,3.7,0.81,1.0164,0.97,0.01,3.3,10.3,6.5,14.2,4.0,1.9,0.57,1.0167,1.89,0.0,8.6,14.0,9.4,18.8,6.0,0.78,1.18,0.01,0.0,4.0,9.4,5.0,14.1,6.0,2.4,0.76,1.0121,0.8,0.0,0.0,4.0,6.4,3.0,10.2,3.0,2.6,0.73,1.0152,7.1,15.4,6.0,5.9,0.91,1.0315,1.46,0.12,4.0,-4.7,-6.8,-2.6,5.0,1.0121,0.85,0.0,3.9,7.0,4.2,10.2,3.6,0.8,1.0169,1.33,0.0,11.6,7.4,15.9,6.0,0.82,1.0142,0.97,0.28,0.0,3.4,10.7,7.9,13.5
75%,20070210.0,9.0,7.0,2.1,0.84,1.0201,2.07,0.22,0.0,7.9,16.2,11.5,21.9,7.0,0.79,1.0214,2.36,0.1,9.5,19.8,14.8,25.6,6.0,0.71,1.0183,2.05,0.08,8.4,18.6,14.1,24.3,7.0,4.1,0.89,1.0218,1.76,0.25,7.3,14.9,10.3,19.4,7.0,5.0,0.84,1.0211,1.77,0.24,0.0,7.1,15.9,11.4,20.7,7.0,0.89,0.08,0.0,14.6,10.5,18.8,7.0,0.85,1.0225,1.84,0.16,0.0,7.1,15.8,11.5,20.0,2.8,0.85,1.0208,1.63,0.19,6.1,13.975,9.7,18.8,7.0,1.9,0.86,1.0218,2.16,0.24,8.6,17.4,12.4,23.5,7.0,5.1,0.88,1.0221,1.8,0.22,7.2,15.2,10.9,19.875,6.0,2.8,0.73,1.0216,2.78,0.01,11.0,21.2,15.4,27.2,7.0,0.86,2.09,0.27,0.0,8.5,15.3,10.5,20.8,8.0,3.6,0.86,1.0197,1.76,0.2,4.0,8.0,13.4,9.6,18.0,3.0,2.9,0.73,1.0152,7.1,19.7,7.0,9.2,0.96,1.0367,2.17,0.71,8.5,0.1,-1.9,2.1,7.0,1.0195,1.85,0.13,8.8,14.1,10.6,18.3,4.8,0.87,1.0214,2.21,0.18,16.5,11.6,21.7,7.0,0.87,1.0209,1.7,0.41,0.0,4.8,13.0,10.3,15.7
max,20221030.0,12.0,8.0,16.3,1.0,1.0452,4.56,8.5,49.0,16.8,29.2,22.4,38.6,10.0,0.99,1.046,3.81,10.98,14.3,34.6,27.7,43.6,8.0,1.0,1.0454,3.79,11.54,15.2,33.1,26.0,40.1,8.0,12.9,1.0,1.0467,10.4,6.39,15.8,29.7,22.4,37.5,8.0,29.5,1.0,1.0473,4.19,5.74,27.0,16.0,31.2,24.9,40.7,8.0,1.0,8.13,95.0,27.6,24.2,36.0,9.0,1.0,1.0477,4.02,6.18,22.0,15.8,30.9,22.3,40.2,33.0,1.0,1.0461,3.54,8.23,15.5,28.4,21.4,36.7,8.0,10.3,1.0,1.0504,4.3,13.96,15.0,30.4,23.8,40.2,8.0,13.4,1.0,1.0472,10.39,8.72,15.5,30.9,23.2,39.6,8.0,15.3,0.99,1.076,3.73,8.7,14.6,33.4,26.2,40.7,8.0,1.0,3.73,9.79,50.0,15.7,29.5,22.0,37.0,8.0,14.1,1.0,1.0534,4.46,7.28,79.0,24.0,26.4,21.7,34.6,8.0,16.4,0.99,1.0391,13.8,31.5,9.0,29.6,1.0,1.062,4.62,10.22,15.9,12.7,10.4,15.3,9.0,1.0543,3.54,5.98,17.9,28.3,22.8,35.4,13.9,1.0,1.045,14.24,6.2,31.2,23.5,40.8,8.0,1.0,1.0463,3.98,90.0,3.0,15.8,23.6,19.5,28.4


In [22]:
# using IQR method
for col in df.columns:
    Q1 = np.percentile(df[col], 25, method='midpoint')
    Q3 = np.percentile(df[col], 75, method='midpoint')
    IQR = Q3 - Q1
    
    upper = Q3 + 1.5*IQR
    lower = Q1 - 1.5*IQR
    upper_array = np.array(df[col] >= upper)
    lower_array = np.array(df[col] <= lower)
    # print if the number of outliers exceed 5% of the total number of observations:
    if upper_array.sum() > 1147 or lower_array.sum() > 1147:
        print(f'lower {lower} upper {upper}') 
        print(f'{col} has {upper_array.sum()} upper outliers and {lower_array.sum()} lower outliers.')

lower 2.1 upper 2.1
BASEL_wind_speed has 18035 upper outliers and 19917 lower outliers.
lower 1.00935 upper 1.0265499999999999
BASEL_pressure has 2119 upper outliers and 1875 lower outliers.
lower -0.33 upper 0.55
BASEL_precipitation has 3046 upper outliers and 0 lower outliers.
lower 0.0 upper 0.0
BASEL_snow_depth has 22946 upper outliers and 21429 lower outliers.
lower -0.15000000000000002 upper 0.25
BELGRADE_precipitation has 4156 upper outliers and 0 lower outliers.
lower 1.0 upper 9.0
BUDAPEST_cloud_cover has 0 upper outliers and 1896 lower outliers.
lower 0.51 upper 0.8299999999999998
BUDAPEST_humidity has 2244 upper outliers and 2037 lower outliers.
lower 1.0100499999999997 upper 1.02325
BUDAPEST_pressure has 2841 upper outliers and 2429 lower outliers.
lower -0.12 upper 0.2
BUDAPEST_precipitation has 3704 upper outliers and 0 lower outliers.
lower -0.375 upper 0.625
DEBILT_precipitation has 2816 upper outliers and 0 lower outliers.
lower -0.36 upper 0.6
DUSSELDORF_precipitation

There is a disparate amount of outliers for each variable, some with a very large number of outliers. 
There are too many to consider handling, but this would be a good reference when trying machine learning later.
Snow depth and precipitation seem to have consistently high numbers of outliers. 

## 4. Scaling

In [11]:
df.dtypes

DATE                             int64
MONTH                            int64
BASEL_cloud_cover                int64
BASEL_wind_speed               float64
BASEL_humidity                 float64
BASEL_pressure                 float64
BASEL_global_radiation         float64
BASEL_precipitation            float64
BASEL_snow_depth                 int64
BASEL_sunshine                 float64
BASEL_temp_mean                float64
BASEL_temp_min                 float64
BASEL_temp_max                 float64
BELGRADE_cloud_cover             int64
BELGRADE_humidity              float64
BELGRADE_pressure              float64
BELGRADE_global_radiation      float64
BELGRADE_precipitation         float64
BELGRADE_sunshine              float64
BELGRADE_temp_mean             float64
BELGRADE_temp_min              float64
BELGRADE_temp_max              float64
BUDAPEST_cloud_cover             int64
BUDAPEST_humidity              float64
BUDAPEST_pressure              float64
BUDAPEST_global_radiation

All columns are numeric, but the date and month may not be useful anymore when they are scaled. 
But, looking ahead at the next tasks, I realize we do need the dates to filter the year for analysis. 

### Scale

In [12]:
# create a list of num columns
columns = df.columns.tolist()
num_col = columns[2:]
print(len(num_col))
num_col

168


['BASEL_cloud_cover',
 'BASEL_wind_speed',
 'BASEL_humidity',
 'BASEL_pressure',
 'BASEL_global_radiation',
 'BASEL_precipitation',
 'BASEL_snow_depth',
 'BASEL_sunshine',
 'BASEL_temp_mean',
 'BASEL_temp_min',
 'BASEL_temp_max',
 'BELGRADE_cloud_cover',
 'BELGRADE_humidity',
 'BELGRADE_pressure',
 'BELGRADE_global_radiation',
 'BELGRADE_precipitation',
 'BELGRADE_sunshine',
 'BELGRADE_temp_mean',
 'BELGRADE_temp_min',
 'BELGRADE_temp_max',
 'BUDAPEST_cloud_cover',
 'BUDAPEST_humidity',
 'BUDAPEST_pressure',
 'BUDAPEST_global_radiation',
 'BUDAPEST_precipitation',
 'BUDAPEST_sunshine',
 'BUDAPEST_temp_mean',
 'BUDAPEST_temp_min',
 'BUDAPEST_temp_max',
 'DEBILT_cloud_cover',
 'DEBILT_wind_speed',
 'DEBILT_humidity',
 'DEBILT_pressure',
 'DEBILT_global_radiation',
 'DEBILT_precipitation',
 'DEBILT_sunshine',
 'DEBILT_temp_mean',
 'DEBILT_temp_min',
 'DEBILT_temp_max',
 'DUSSELDORF_cloud_cover',
 'DUSSELDORF_wind_speed',
 'DUSSELDORF_humidity',
 'DUSSELDORF_pressure',
 'DUSSELDORF_global_

In [13]:
# create scaler object
scaler = StandardScaler()

In [14]:
# create a copy of original df
df_scaled = df.copy()
df_scaled.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [15]:
# scale num columns on copied df
df_scaled[num_col] = scaler.fit_transform(df_scaled[num_col])

In [16]:
df_scaled.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,0.660514,-0.02793,0.826097,-0.001949,-1.101066,-0.265148,-0.179228,-0.902918,...,-0.443701,0.761754,-1.299744,-0.806427,-0.088407,-0.024706,0.372147,-0.668215,-0.519743,-0.752237
1,19600102,1,0.244897,-0.02793,0.73576,-0.001949,-1.058108,1.65876,-0.179228,-0.810126,...,0.783085,1.18358,-1.262455,-1.042055,0.503361,-0.024706,-0.829285,-0.548046,-0.629054,-0.407141
2,19600103,1,1.07613,-0.02793,1.277781,-0.001949,-1.25142,0.155707,-0.179228,-1.065304,...,0.783085,1.18358,-0.432779,-1.136306,-0.396127,-0.024706,-1.0095,-0.067372,0.054135,-0.177078
3,19600104,1,-1.001953,-0.02793,1.458455,-0.001949,-0.821838,-0.445514,-0.179228,-0.114186,...,0.783085,0.480538,0.387574,-1.183432,0.669056,-0.024706,-1.039536,-0.998679,-0.164486,-0.838511
4,19600105,1,0.244897,-0.02793,1.729466,-0.001949,-0.746661,-0.164944,-0.179228,0.187388,...,-1.670486,-0.363113,1.72997,-0.794645,-0.49081,-0.024706,0.672505,-1.509396,-1.339569,-1.471186


In [17]:
# compare to original
df.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


## 5. Export scaled data set

In [18]:
df_scaled.to_csv(os.path.join(path, 'Data Sets', 'weather_dataset_scaled.csv'), index=False)