# Exploración - Step 2: Transformación

In [1]:
import os
from math import cos, radians, sin
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)

In [2]:
# Move one level up so that relative paths such as "data/train_val.csv" resolve.
# The guard keeps this cell idempotent: re-running it (or starting the kernel in a
# directory that already contains the data file) must not climb another level.
if not os.path.exists(os.path.join("data", "train_val.csv")):
    os.chdir("..")
os.getcwd()

'/home/troonies/repos/itba-aws-mle'

## Data

Daily **weather observations recorded at weather stations**, used to predict whether it will rain the next day.

It includes measurements such as minimum and maximum temperature, rainfall, evaporation, sunshine, wind gust direction and speed, and wind, humidity, pressure, cloud cover and temperature readings at 9am and 3pm, plus whether it rained today (`RainToday`) and the target variable (`RainTomorrow`).

In [3]:
# Human-readable description of every column of the raw dataset, keyed by column name.
cols_descriptions = {
    "Date":          "The date of observation",
    "Location":      "The common name of the location of the weather station",
    "MinTemp":       "The minimum temperature in degrees celsius",
    "MaxTemp":       "The maximum temperature in degrees celsius",
    "Rainfall":      "The amount of rainfall recorded for the day in mm",
    "Evaporation":   "The so-called Class A pan evaporation (mm) in the 24 hours to 9am",
    "Sunshine":      "The number of hours of bright sunshine in the day.",
    "WindGustDir":   "The direction of the strongest wind gust in the 24 hours to midnight",
    "WindGustSpeed": "The speed (km/h) of the strongest wind gust in the 24 hours to midnight",
    "WindDir9am":    "Direction of the wind at 9am",
    "WindDir3pm":    "Direction of the wind at 3pm",
    "WindSpeed9am":  "Wind speed (km/hr) averaged over 10 minutes prior to 9am",
    "WindSpeed3pm":  "Wind speed (km/hr) averaged over 10 minutes prior to 3pm",
    "Humidity9am":   "Humidity (percent) at 9am",
    "Humidity3pm":   "Humidity (percent) at 3pm",
    "Pressure9am":   "Atmospheric pressure (hpa) reduced to mean sea level at 9am",
    "Pressure3pm":   "Atmospheric pressure (hpa) reduced to mean sea level at 3pm",
    "Cloud9am":      "Fraction of sky obscured by cloud at 9am. This is measured in 'oktas', "
                     "which are a unit of eighths. It records how many eighths of the sky are obscured by cloud. "
                     "A 0 measure indicates completely clear sky whilst an 8 indicates that it is completely overcast.",
    "Cloud3pm":      "Fraction of sky obscured by cloud (in 'oktas': eighths) at 3pm. See Cloud9am for a description of the values",
    "Temp9am":       "Temperature (degrees C) at 9am",
    "Temp3pm":       "Temperature (degrees C) at 3pm",
    "RainToday":     "Boolean: 1 if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 0",
    # The raw column holds "Yes"/"No" labels, not an amount in mm.
    "RainTomorrow":  "Target variable: whether it rained the next day ('Yes' or 'No')",
}

In [4]:
# Dtype family of every feature column, raw and engineered.
# "RainTomorrow" is not included because it's the target.
cols_types = dict(
    int=[
        "WindGustSpeed",
        "WindSpeed9am",
        "WindSpeed3pm",
        "Humidity9am",
        "Humidity3pm",
        "Cloud9am",
        "Cloud3pm",
    ],
    float=[
        "MinTemp",
        "MaxTemp",
        "Rainfall",
        "Evaporation",
        "Sunshine",
        # NOTE(review): the three raw wind-direction columns hold compass labels in the
        # CSV; presumably they are listed here because they end up as float features.
        "WindGustDir",
        "WindDir9am",
        "WindDir3pm",
        "Pressure9am",
        "Pressure3pm",
        "Temp9am",
        "Temp3pm",
        # engineered features
        "WindGustDir_east",
        "WindGustDir_north",
        "WindDir9am_east",
        "WindDir9am_north",
        "WindDir3pm_east",
        "WindDir3pm_north",
    ],
    bool=["RainToday"],
    cat=["Location"],
)

# Sanity check: a column may be listed only once across all the type groups.
assert sum(map(len, cols_types.values())) == len({c for cts in cols_types.values() for c in cts})

In [5]:
# Candidate columns that are deliberately left out, with the reason for each.
_EXCLUDED_COLS = {
    "Location",     # not needed for this use case
    "Evaporation",  # too many nulls
    "Sunshine",     # too many nulls
    "Cloud9am",     # too many nulls
    "Cloud3pm",     # too many nulls
}

# Every candidate column, in the order the selection should keep.
_CANDIDATE_COLS = [
    "Date",
    "Location",
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    "WindGustDir",  # will be transformed
    "WindGustSpeed",
    "WindDir9am",  # will be transformed
    "WindDir3pm",  # will be transformed
    "WindSpeed9am",
    "WindSpeed3pm",
    "Humidity9am",
    "Humidity3pm",
    "Pressure9am",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    "Temp9am",
    "Temp3pm",
    "RainToday",  # will be transformed
    "RainTomorrow",  # target variable (will be transformed)
]

# Columns actually read from the CSV.
SELECTED_COLS = [col for col in _CANDIDATE_COLS if col not in _EXCLUDED_COLS]

Use training and validation splits only for the EDA.

In [6]:
# Load only the selected columns of the train/validation CSV (path is relative to
# the current working directory).
df = pd.read_csv("data/train_val.csv", usecols=SELECTED_COLS)
df

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2012-01-26,20.0,31.6,0.4,ESE,39.0,E,ENE,17.0,9.0,69.0,55.0,1007.9,1005.5,25.2,29.9,No,No
1,2010-07-12,11.2,18.2,0.0,WNW,28.0,SW,SE,7.0,11.0,58.0,62.0,1023.3,1020.7,14.3,17.3,No,No
2,2015-06-08,11.8,24.5,0.2,ENE,15.0,SW,NE,6.0,6.0,71.0,56.0,1028.0,1024.2,17.1,21.1,No,No
3,2012-04-05,14.6,19.1,9.8,SSW,50.0,SW,S,22.0,24.0,1.0,1.0,1020.9,1023.2,17.9,16.8,Yes,No
4,2016-03-05,15.4,19.5,3.8,ESE,30.0,SE,SSE,13.0,17.0,97.0,80.0,1021.6,1020.1,16.9,18.2,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127968,2009-01-18,10.1,29.7,0.0,ENE,33.0,WSW,NW,4.0,13.0,63.0,26.0,1020.2,1016.1,14.4,26.8,No,No
127969,2016-09-28,6.2,12.9,14.0,SSW,59.0,S,S,17.0,15.0,75.0,71.0,1021.5,1023.1,10.2,10.6,Yes,No
127970,2016-11-20,18.0,30.0,1.8,NE,28.0,N,ENE,9.0,13.0,83.0,47.0,1021.5,1017.0,19.9,28.8,Yes,No
127971,2012-11-04,9.5,32.9,0.0,NW,46.0,E,N,2.0,20.0,84.0,24.0,1013.0,1007.2,15.6,31.2,No,No


## Transformations

In [7]:
# Preview the first rows of the raw frame before any transformation is applied.
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2012-01-26,20.0,31.6,0.4,ESE,39.0,E,ENE,17.0,9.0,69.0,55.0,1007.9,1005.5,25.2,29.9,No,No
1,2010-07-12,11.2,18.2,0.0,WNW,28.0,SW,SE,7.0,11.0,58.0,62.0,1023.3,1020.7,14.3,17.3,No,No
2,2015-06-08,11.8,24.5,0.2,ENE,15.0,SW,NE,6.0,6.0,71.0,56.0,1028.0,1024.2,17.1,21.1,No,No
3,2012-04-05,14.6,19.1,9.8,SSW,50.0,SW,S,22.0,24.0,1.0,1.0,1020.9,1023.2,17.9,16.8,Yes,No
4,2016-03-05,15.4,19.5,3.8,ESE,30.0,SE,SSE,13.0,17.0,97.0,80.0,1021.6,1020.1,16.9,18.2,Yes,No


### Functions

In [8]:
def cast_everything_to_float(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with every known int, float and bool column converted to float.

    The column names come from the module-level ``cols_types`` mapping. Names that
    are not present in ``df`` are skipped, and columns not listed there are left
    untouched. The input frame itself is not modified.
    """
    for kind in ("int", "float", "bool"):
        present = [col for col in cols_types[kind] if col in df.columns]
        df = df.astype(dict.fromkeys(present, float))
    return df

def drop_null_rows(df: pd.DataFrame, aceptable_nulls: int = 2) -> pd.DataFrame:
    """Keep only the rows whose number of null cells is at most ``aceptable_nulls``.

    The surviving rows are returned with a fresh 0..n-1 index.
    """
    nulls_per_row = df.isnull().sum(axis=1)
    keep = nulls_per_row <= aceptable_nulls
    kept_rows = df[keep]
    return kept_rows.reset_index(drop=True)

def standardize_cols(df: pd.DataFrame, df_desc: pd.DataFrame) -> pd.DataFrame:
    """Z-score the known int, float and bool columns of ``df`` and return it.

    For each column listed in the module-level ``cols_types`` that is present in
    ``df``, the value at ``df_desc.at["mean", col]`` is subtracted and the result is
    divided by ``df_desc.at["std", col]``. The columns are overwritten on the frame
    that was passed in, which is also the object returned.
    """
    numeric_like = [*cols_types["int"], *cols_types["float"], *cols_types["bool"]]
    for col in numeric_like:
        if col not in df.columns:
            continue
        df[col] = df[col].sub(df_desc.at["mean", col])
        df[col] = df[col].div(df_desc.at["std", col])
    return df

def limit_outliers(df: pd.DataFrame, df_whiskers: pd.DataFrame) -> pd.DataFrame:
    """Limit outlier values by capping them at the corresponding boxplot whisker value.

    Args:
        df: Frame to process. The known int and float columns (from the module-level
            ``cols_types``) that are present in ``df`` are overwritten on this frame.
        df_whiskers: Frame indexed by column name with the columns
            ``"lower_whisker"`` and ``"upper_whisker"``. It must contain a row for
            every processed column (``.at`` raises ``KeyError`` otherwise).

    Returns:
        The same ``df`` object, with values below the lower whisker replaced by the
        lower whisker and values above the upper whisker replaced by the upper one.
        Nulls never compare as outliers, so they are left as nulls.
    """
    for c in (cols_types["int"] + cols_types["float"]):  # boolean is not necessary
        if c not in df.columns:
            continue

        low_w = df_whiskers.at[c, "lower_whisker"]
        high_w = df_whiskers.at[c, "upper_whisker"]

        # Both masks are computed on the untouched values before anything is replaced.
        values = df[c]
        too_low = values < low_w
        too_high = high_w < values

        # Vectorized replacement: no per-row Python calls and no temporary flag
        # columns written into the caller's frame.
        df[c] = values.mask(too_low, low_w).mask(too_high, high_w)

    return df

In [9]:
ANGLE_STEP = 22.5  # degrees between two neighbouring points of the 16-point compass

# Compass points listed counter-clockwise starting at East, so East maps to
# 0 degrees and North to 90 degrees.
_COMPASS_POINTS = (
    "E", "ENE", "NE", "NNE",
    "N", "NNW", "NW", "WNW",
    "W", "WSW", "SW", "SSW",
    "S", "SSE", "SE", "ESE",
)

# Compass label -> angle in radians.
wind_dir_to_radians = {
    point: radians(step * ANGLE_STEP) for step, point in enumerate(_COMPASS_POINTS)
}
wind_dir_to_radians

{'E': 0.0,
 'ENE': 0.39269908169872414,
 'NE': 0.7853981633974483,
 'NNE': 1.1780972450961724,
 'N': 1.5707963267948966,
 'NNW': 1.9634954084936207,
 'NW': 2.356194490192345,
 'WNW': 2.748893571891069,
 'W': 3.141592653589793,
 'WSW': 3.5342917352885173,
 'SW': 3.9269908169872414,
 'SSW': 4.319689898685966,
 'S': 4.71238898038469,
 'SSE': 5.105088062083414,
 'SE': 5.497787143782138,
 'ESE': 5.8904862254808625}

In [11]:
def fill_nulls(df: pd.DataFrame, df_desc: pd.DataFrame) -> pd.DataFrame:
    """Fill nulls programmatically.

    Args:
        df: Frame to process. Columns are overwritten on this frame.
        df_desc: Frame whose ``"mean"`` row holds the fill value of every int and
            float column that contains nulls.

    Returns:
        The same ``df`` object. Nulls in the known int and float columns (from the
        module-level ``cols_types``) are replaced by the column mean taken from
        ``df_desc``. Nulls in the known bool columns are replaced by the most
        frequent non-null value of that column.
    """
    null_counts = df.isnull().sum()

    for c in (cols_types["int"] + cols_types["float"]):
        if (c not in df.columns) or (null_counts.at[c] == 0):
            continue
        df[c] = df[c].fillna(df_desc.at["mean", c])

    for c in cols_types["bool"]:
        if (c not in df.columns) or (null_counts.at[c] == 0):
            continue

        # value_counts() is indexed by the distinct values and sorted by descending
        # count, so the most repeated value is the first index label. The first
        # element of the Series itself would be its count, not the value.
        col_vc = df[c].value_counts()
        if col_vc.empty:
            continue  # the column is entirely null: there is no mode to fill with
        df[c] = df[c].fillna(col_vc.index[0])

    return df

### Apply transformations

In [12]:
# Work on a copy so that the raw frame `df` is not modified by the transformations.
df_new = df.copy()

In [13]:
# Parse the observation date.
df_new["Date"] = pd.to_datetime(df_new["Date"])


def _yes_to_bool(label):
    """Map "Yes" to True and any other label to False, keeping nulls as NaN."""
    if pd.isna(label):
        return np.nan
    return label == "Yes"


df_new["RainToday"] = df_new["RainToday"].map(_yes_to_bool)
# NOTE(review): a null target compares unequal to "Yes" and therefore becomes False;
# confirm that the CSV has no null RainTomorrow values.
df_new["RainTomorrow"] = df_new["RainTomorrow"] == "Yes"

# Put the target and the date first, keeping the other columns in their current order.
leading_cols = ["RainTomorrow", "Date"]
df_new = df_new[leading_cols + [col for col in df_new.columns if col not in leading_cols]]

# Replace every compass-label column with its east (cosine) and north (sine) components.
wind_cols = ["WindGustDir", "WindDir9am", "WindDir3pm"]
for c in wind_cols:
    angle = df_new[c].map(wind_dir_to_radians)
    df_new[f"{c}_east"] = angle.map(cos)
    df_new[f"{c}_north"] = angle.map(sin)
    df_new = df_new.drop(columns=c)

In [14]:
# Cast the numeric/boolean columns to float, then discard the rows that have more
# nulls than drop_null_rows() accepts by default.
df_new = df_new.pipe(cast_everything_to_float).pipe(drop_null_rows)

In [15]:
# Summary statistics of the data before standardization; the "mean" and "std" rows
# are what standardize_cols() reads.
df_desc_orig = df_new.describe()
df_desc_orig

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,WindGustDir_east,WindGustDir_north,WindDir9am_east,WindDir9am_north,WindDir3pm_east,WindDir3pm_north
count,117590,117542.0,117574.0,116795.0,117590.0,117590.0,117590.0,116946.0,116389.0,109633.0,109681.0,117515.0,117005.0,116795.0,117579.0,117579.0,112120.0,112120.0,117232.0,117232.0
mean,2013-04-16 16:35:27.609490944,12.19223,23.352066,2.305087,40.086232,14.300272,18.843728,68.451482,50.984414,1017.621875,1015.21241,17.030144,21.786366,0.220549,-0.01953347,-0.050254,0.02703677,0.009904,-0.02380697,-0.042755
min,2007-11-01 00:00:00,-8.5,-4.8,0.0,6.0,0.0,0.0,0.0,0.0,982.0,977.1,-7.2,-5.4,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,2011-01-31 00:00:00,7.5,17.9,0.0,31.0,7.0,13.0,56.0,36.0,1012.9,1010.4,12.2,16.6,0.0,-0.7071068,-0.707107,-0.7071068,-0.707107,-0.7071068,-0.707107
50%,2013-06-16 00:00:00,12.0,22.8,0.0,39.0,13.0,19.0,69.0,51.0,1017.6,1015.2,16.7,21.3,0.0,-1.83697e-16,0.0,6.123234000000001e-17,0.0,-1.83697e-16,0.0
75%,2015-06-21 00:00:00,16.9,28.5,0.6,48.0,20.0,24.0,83.0,65.0,1022.4,1020.0,21.7,26.6,0.0,0.7071068,0.707107,0.7071068,0.707107,0.7071068,0.707107
max,2017-06-25 00:00:00,33.9,48.1,367.6,135.0,87.0,87.0,100.0,100.0,1041.0,1039.6,40.2,46.7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,,6.445237,7.155305,8.318535,13.490377,8.715115,8.620551,19.20653,20.838074,7.089881,7.020179,6.550717,6.981686,0.414619,0.7120533,0.700058,0.6922264,0.721112,0.710009,0.702496


In [16]:
# Z-score the numeric/boolean columns using the statistics computed before standardization.
df_new = standardize_cols(df_new, df_desc=df_desc_orig)

In [17]:
# Preview the first rows after standardization.
df_new.head()

Unnamed: 0,RainTomorrow,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,WindGustDir_east,WindGustDir_north,WindDir9am_east,WindDir9am_north,WindDir3pm_east,WindDir3pm_north
0,False,2012-01-26,1.211401,1.152702,-0.229017,-0.080519,0.309775,-1.141891,0.028559,0.192704,-1.371232,-1.383499,1.24717,1.162131,-0.531932,1.324919,-0.474859,1.405556,-0.013734,1.334753,0.60561
1,False,2010-07-12,-0.153948,-0.720034,-0.277102,-0.895915,-0.837656,-0.909887,-0.544163,0.528628,0.800877,0.781688,-0.41677,-0.642591,-0.531932,-1.270054,0.618431,-1.060554,-0.994312,1.029443,-0.945701
2,False,2015-06-08,-0.060856,0.160431,-0.25306,-1.859565,-0.9524,-1.489897,0.13269,0.240693,1.463794,1.280251,0.010664,-0.09831,-0.531932,1.324919,0.618431,-1.060554,-0.994312,1.029443,1.067425
3,False,2012-04-05,0.373574,-0.594254,0.90099,0.734877,0.883491,0.598137,-3.511904,-2.398706,0.462367,1.137804,0.132788,-0.714207,1.879924,-0.510004,-1.247932,-1.060554,-0.994312,0.033531,-1.362633
4,False,2016-03-05,0.497696,-0.538351,0.179709,-0.747661,-0.149197,-0.213876,1.486396,1.392431,0.561099,0.69622,-0.019867,-0.513682,1.879924,1.324919,-0.474859,0.982439,-0.994312,0.572514,-1.254276


In [18]:
# Summary statistics of the standardized data, extended with the boxplot statistics:
# the inter-quartile range (Q3 - Q1) and the whisker limits Q1 - 1.5*IQR and Q3 + 1.5*IQR.
df_desc = df_new.describe()
df_desc.loc["iqr", :] = df_desc.loc["75%", :] -  df_desc.loc["25%", :]
df_desc.loc["iqr_x_1_5", :] = 1.5 * df_desc.loc["iqr", :]
df_desc.loc["lower_whisker", :] = df_desc.loc["25%", :] - df_desc.loc["iqr_x_1_5", :]
df_desc.loc["upper_whisker", :] = df_desc.loc["75%", :] + df_desc.loc["iqr_x_1_5", :]
df_desc

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,WindGustDir_east,WindGustDir_north,WindDir9am_east,WindDir9am_north,WindDir3pm_east,WindDir3pm_north
count,117590,117542.0,117574.0,116795.0,117590.0,117590.0,117590.0,116946.0,116389.0,109633.0,109681.0,117515.0,117005.0,116795.0,117579.0,117579.0,112120.0,112120.0,117232.0,117232.0
mean,2013-04-16 16:35:27.609490944,-5.3165880000000005e-17,8.29452e-16,-1.2289020000000001e-17,-1.1239130000000001e-17,-5.861268e-17,1.556559e-16,-2.875078e-16,1.225863e-16,1.562537e-14,-3.844028e-14,1.510088e-16,-3.333943e-16,-9.320189000000001e-17,-2.23595e-18,7.856042e-18,4.689633e-17,2.3067920000000003e-17,-1.3122060000000001e-17,1.3091750000000001e-17
min,2007-11-01 00:00:00,-3.210468,-3.934433,-0.2771025,-2.526707,-1.640859,-2.185908,-3.563969,-2.446695,-5.024326,-5.42898,-3.698854,-3.893954,-0.5319319,-1.376957,-1.356667,-1.483672,-1.400481,-1.374902,-1.362633
25%,2011-01-31 00:00:00,-0.7280151,-0.7619614,-0.2771025,-0.6735343,-0.8376564,-0.6778834,-0.6482942,-0.7190883,-0.666002,-0.6855109,-0.7373458,-0.742853,-0.5319319,-0.9656205,-0.9382828,-1.060554,-0.994312,-0.9623819,-0.9457009
50%,2013-06-16 00:00:00,-0.02982513,-0.07715478,-0.2771025,-0.08051901,-0.1491974,0.01812782,0.02855894,0.0007479418,-0.003085355,-0.001767708,-0.05039818,-0.06966317,-0.5319319,0.02743259,0.07178576,-0.0390577,-0.01373366,0.03353052,0.0608622
75%,2015-06-21 00:00:00,0.7304262,0.719457,-0.2049744,0.5866232,0.6540049,0.5981372,0.7574777,0.6725951,0.6739359,0.6819755,0.712877,0.6894657,-0.5319319,1.020486,1.081854,0.9824387,0.9668447,1.029443,1.067425
max,2017-06-25 00:00:00,3.368033,3.458684,43.91337,7.035664,8.341798,7.906255,1.642593,2.352213,3.297393,3.473927,3.536995,3.568426,1.879924,1.431822,1.500238,1.405556,1.373014,1.441963,1.484357
std,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
iqr,1602 days 00:00:00,1.458441,1.481418,0.07212808,1.260157,1.491661,1.276021,1.405772,1.391683,1.339938,1.367486,1.450223,1.432319,0.0,1.986106,2.020137,2.042993,1.961157,1.991825,2.013126
iqr_x_1_5,2403 days 00:00:00,2.187662,2.222128,0.1081921,1.890236,2.237492,1.914031,2.108658,2.087525,2.009907,2.05123,2.175334,2.148478,0.0,2.979159,3.030206,3.064489,2.941735,2.987737,3.019689


In [19]:
# Transpose so that there is one row per column with its lower/upper whisker limit,
# the layout limit_outliers() looks values up in.
df_whiskers = df_desc.loc[["lower_whisker", "upper_whisker"],:].T
df_whiskers

Unnamed: 0,lower_whisker,upper_whisker
Date,2004-07-03 00:00:00,2022-01-18 00:00:00
MinTemp,-2.915677,2.918088
MaxTemp,-2.984089,2.941585
Rainfall,-0.385295,-0.096782
WindGustSpeed,-2.56377,2.476859
WindSpeed9am,-3.075148,2.891497
WindSpeed3pm,-2.591914,2.512168
Humidity9am,-2.756952,2.866135
Humidity3pm,-2.806613,2.76012
Pressure9am,-2.675909,2.683843


In [20]:
# Cap the values that fall outside the whisker limits computed on the standardized data.
df_new = limit_outliers(df_new, df_whiskers=df_whiskers)

In [21]:
# Inspect the frame after limiting the outliers.
df_new

Unnamed: 0,RainTomorrow,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,WindGustDir_east,WindGustDir_north,WindDir9am_east,WindDir9am_north,WindDir3pm_east,WindDir3pm_north
0,False,2012-01-26,1.211401,1.152702,-0.229017,-0.080519,0.309775,-1.141891,0.028559,0.192704,-1.371232,-1.383499,1.247170,1.162131,-0.531932,1.324919,-0.474859,1.405556,-0.013734,1.334753,0.605610
1,False,2010-07-12,-0.153948,-0.720034,-0.277102,-0.895915,-0.837656,-0.909887,-0.544163,0.528628,0.800877,0.781688,-0.416770,-0.642591,-0.531932,-1.270054,0.618431,-1.060554,-0.994312,1.029443,-0.945701
2,False,2015-06-08,-0.060856,0.160431,-0.253060,-1.859565,-0.952400,-1.489897,0.132690,0.240693,1.463794,1.280251,0.010664,-0.098310,-0.531932,1.324919,0.618431,-1.060554,-0.994312,1.029443,1.067425
3,False,2012-04-05,0.373574,-0.594254,-0.096782,0.734877,0.883491,0.598137,-2.756952,-2.398706,0.462367,1.137804,0.132788,-0.714207,1.879924,-0.510004,-1.247932,-1.060554,-0.994312,0.033531,-1.362633
4,False,2016-03-05,0.497696,-0.538351,-0.096782,-0.747661,-0.149197,-0.213876,1.486396,1.392431,0.561099,0.696220,-0.019867,-0.513682,1.879924,1.324919,-0.474859,0.982439,-0.994312,0.572514,-1.254276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117585,False,2009-01-18,-0.324616,0.887165,-0.277102,-0.525280,-1.181886,-0.677883,-0.283835,-1.198979,0.363634,0.126434,-0.401505,0.718112,-0.531932,1.324919,0.618431,-1.373707,-0.544419,-0.962382,1.067425
117586,False,2016-09-28,-0.929714,-1.460744,-0.096782,1.402019,0.309775,-0.445880,0.340953,0.960530,0.546994,1.123560,-1.042656,-1.602244,1.879924,-0.510004,-1.247932,-0.039058,-1.400481,0.033531,-1.362633
117587,False,2016-11-20,0.901095,0.929092,-0.096782,-0.895915,-0.608170,-0.677883,0.757478,-0.191208,0.546994,0.254636,0.438098,1.004576,1.879924,1.020486,1.081854,-0.039058,1.373014,1.334753,0.605610
117588,False,2012-11-04,-0.417708,1.334385,-0.277102,0.438369,-1.411372,0.134130,0.809543,-1.294957,-0.651897,-1.141340,-0.218319,1.348332,-0.531932,-0.965621,1.081854,1.405556,-0.013734,0.033531,1.484357


Ideally, after limiting the outliers we would standardize again, starting from the original data and using a freshly computed mean and std, but we skip that step here.

In [22]:
# Fill the remaining nulls; `df_desc` holds the statistics of the standardized data,
# computed before the outliers were limited.
df_new = fill_nulls(df_new, df_desc=df_desc)

In [23]:
# Count the nulls left in every column after filling.
df_new.isnull().sum()

RainTomorrow         0
Date                 0
MinTemp              0
MaxTemp              0
Rainfall             0
WindGustSpeed        0
WindSpeed9am         0
WindSpeed3pm         0
Humidity9am          0
Humidity3pm          0
Pressure9am          0
Pressure3pm          0
Temp9am              0
Temp3pm              0
RainToday            0
WindGustDir_east     0
WindGustDir_north    0
WindDir9am_east      0
WindDir9am_north     0
WindDir3pm_east      0
WindDir3pm_north     0
dtype: int64

In [24]:
# Inspect the frame after all the transformations.
df_new

Unnamed: 0,RainTomorrow,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainToday,WindGustDir_east,WindGustDir_north,WindDir9am_east,WindDir9am_north,WindDir3pm_east,WindDir3pm_north
0,False,2012-01-26,1.211401,1.152702,-0.229017,-0.080519,0.309775,-1.141891,0.028559,0.192704,-1.371232,-1.383499,1.247170,1.162131,-0.531932,1.324919,-0.474859,1.405556,-0.013734,1.334753,0.605610
1,False,2010-07-12,-0.153948,-0.720034,-0.277102,-0.895915,-0.837656,-0.909887,-0.544163,0.528628,0.800877,0.781688,-0.416770,-0.642591,-0.531932,-1.270054,0.618431,-1.060554,-0.994312,1.029443,-0.945701
2,False,2015-06-08,-0.060856,0.160431,-0.253060,-1.859565,-0.952400,-1.489897,0.132690,0.240693,1.463794,1.280251,0.010664,-0.098310,-0.531932,1.324919,0.618431,-1.060554,-0.994312,1.029443,1.067425
3,False,2012-04-05,0.373574,-0.594254,-0.096782,0.734877,0.883491,0.598137,-2.756952,-2.398706,0.462367,1.137804,0.132788,-0.714207,1.879924,-0.510004,-1.247932,-1.060554,-0.994312,0.033531,-1.362633
4,False,2016-03-05,0.497696,-0.538351,-0.096782,-0.747661,-0.149197,-0.213876,1.486396,1.392431,0.561099,0.696220,-0.019867,-0.513682,1.879924,1.324919,-0.474859,0.982439,-0.994312,0.572514,-1.254276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117585,False,2009-01-18,-0.324616,0.887165,-0.277102,-0.525280,-1.181886,-0.677883,-0.283835,-1.198979,0.363634,0.126434,-0.401505,0.718112,-0.531932,1.324919,0.618431,-1.373707,-0.544419,-0.962382,1.067425
117586,False,2016-09-28,-0.929714,-1.460744,-0.096782,1.402019,0.309775,-0.445880,0.340953,0.960530,0.546994,1.123560,-1.042656,-1.602244,1.879924,-0.510004,-1.247932,-0.039058,-1.400481,0.033531,-1.362633
117587,False,2016-11-20,0.901095,0.929092,-0.096782,-0.895915,-0.608170,-0.677883,0.757478,-0.191208,0.546994,0.254636,0.438098,1.004576,1.879924,1.020486,1.081854,-0.039058,1.373014,1.334753,0.605610
117588,False,2012-11-04,-0.417708,1.334385,-0.277102,0.438369,-1.411372,0.134130,0.809543,-1.294957,-0.651897,-1.141340,-0.218319,1.348332,-0.531932,-0.965621,1.081854,1.405556,-0.013734,0.033531,1.484357
