# Preprocessing

## Load Data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib 
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import seaborn as sns
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder
import datetime
import pickle
import math
from tqdm import tqdm
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import cross_val_score,KFold

pd.set_option('display.max_columns', None) # no truncate columns

In [2]:
# Environment settings
data_path_out = 'Data/output/'
    
# Deserialize previously saved data from "data-visualization"
df = {}
for df_name in ['train', 'test']:
    with open(data_path_out+df_name+'_dv.obj', 'rb') as file:
        df[df_name] = pickle.load(file)

In [3]:
X = df['train'].copy()
X = X.drop(columns=['NumberOfCustomers', 'NumberOfSales'], axis=1)
y = df['train'].loc[:, 'NumberOfCustomers':'NumberOfSales']

all_data = pd.concat([X, df['test']], axis=0).reset_index()

#to restore the original column order
all_data = all_data[list(df['test'].columns.values)]

## Drop columns

### Max_Gust_SpeedKm_h

In [4]:
all_data = all_data.drop('Max_Gust_SpeedKm_h', axis = 1)
df['train'] = df['train'].drop('Max_Gust_SpeedKm_h', axis = 1)
df['test'] = df['test'].drop('Max_Gust_SpeedKm_h', axis = 1)

## Fill missing values

### Events

In [5]:
all_data.Events = all_data.Events.fillna('Normal')
print ("There are {} missing values of Events left".format(all_data['Events'].isna().sum(),
                                                                  all_data['Events'].isna().sum()/
                                                                  all_data.shape[0]*100))

There are 0 missing values of Events left


### Visibility

Since the Visibility has a high negative correlation  with Humidity, a good imputation method could be to group data by Humidity and use the median value to fill in missing values.

Intuitively the minimum visibility corresponds to the maximum humidity and viceversa.

Some humidity levels could not have visiblity values, so a good solution could be to take the mean between the two adiacent values

In [6]:
min_vis_dict=all_data.groupby('Max_Humidity')['Min_VisibilitykM'].apply(lambda x: x.median()).to_dict()
max_vis_dict=all_data.groupby('Min_Humidity')['Max_VisibilityKm'].apply(lambda x: x.median()).to_dict()
mean_vis_dict=all_data.groupby('Mean_Humidity')['Mean_VisibilityKm'].apply(lambda x: x.median()).to_dict()

#Function that replaces missing values of the median with the mean of next and previous values
def replace_nan_median(visdict_list):
    for visdict in visdict_list:
        item_list=[visdict[x] for x in visdict.keys()]
        for val in range(len(item_list)):
            if(math.isnan(item_list[val])):
                prev_val = (val-1) < 0 and item_list[val+1]  or item_list[val-1]
                next_val = (val+1) > len(item_list) and item_list[val-1] or item_list[val+1]
                item_list[val] = float(int((prev_val + next_val)/2))
        iterator = 0
        for k,i in visdict.items():
            visdict[k]=item_list[iterator]
            iterator+=1

#Function that replaces nan in the dataframe with the dictionary
#value corresponding to the samples humidity level

def replace_nan_df(col_name1,col_name2,dictionary):
    nan_index=all_data[all_data[col_name1].isnull()].index.tolist()
    for index in tqdm(nan_index):
        humidty_lvl = all_data.loc[index,col_name2] 
        all_data.loc[index,col_name1] = dictionary[humidty_lvl]

    
        
replace_nan_median([min_vis_dict,max_vis_dict,mean_vis_dict])

#Replacing nan in dataframe 
replace_nan_df('Min_VisibilitykM','Max_Humidity',min_vis_dict)
replace_nan_df('Max_VisibilityKm','Min_Humidity',max_vis_dict)
replace_nan_df('Mean_VisibilityKm','Mean_Humidity',mean_vis_dict)

for col in ["Min_VisibilitykM","Max_VisibilityKm","Mean_VisibilityKm"]:
    print ("There are {} missing values of {} left".
           format(all_data[col].isna().sum(),col))
os.system('say "Visibility Done."')

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
100%|██████████| 13959/13959 [00:26<00:00, 530.76it/s]
100%|██████████| 13959/13959 [00:29<00:00, 476.73it/s]
100%|██████████| 13959/13959 [00:27<00:00, 514.91it/s]

There are 0 missing values of Min_VisibilitykM left
There are 0 missing values of Max_VisibilityKm left
There are 0 missing values of Mean_VisibilityKm left





32512

### Cloud Cover [ERROR]

In [7]:
#substitute with regression below
all_data['CloudCover'] = all_data['CloudCover'].fillna(all_data['CloudCover'].median())

In [8]:
rf =  RandomForestClassifier(n_estimators=50,verbose=True)

cloud_data = all_data.copy()
cloud_data = pd.get_dummies(cloud_data)
cloud_data['Date_delta'] = (cloud_data['Date'] - cloud_data['Date'].min())  / np.timedelta64(1,'D')
cloud_data=cloud_data.drop('Date',axis=1)
nan_indexes=cloud_data[cloud_data['CloudCover'].isnull()].index.tolist()
good_indexes =cloud_data[~cloud_data['CloudCover'].isnull()].index.tolist()

train_cloud = cloud_data.loc[good_indexes]
test_cloud = cloud_data.loc[nan_indexes]
print(train_cloud.shape)
print(test_cloud.shape)

X_train_cloud = train_cloud.copy()
X_train_cloud = X_train_cloud.drop(['CloudCover'], axis=1)
y_train_cloud = train_cloud['CloudCover']
test_cloud = test_cloud.drop('CloudCover',axis=1)

scaler = RobustScaler()
X_train_cloud = scaler.fit_transform(X_train_cloud)
test_cloud = scaler.transform(test_cloud)

# kf = KFold(5, shuffle=True, random_state=42).get_n_splits(X_train_cloud)
# score_CV= cross_val_score(rf, X_train_cloud, y_train_cloud, cv = kf)

# print("The 10-fold crossvalidation of RF is {:.5f} +/- {:.3f}".format(score_CV.mean(),score_CV.std()))
# os.system('say "Crossvalidation done."')

(568710, 69)
(0, 69)


ValueError: Found array with 0 sample(s) (shape=(0, 68)) while a minimum of 1 is required by RobustScaler.

In [None]:
# rf.fit(X_train_cloud,y_train_cloud)
# pred_results =rf.predict(test_cloud)
nan_index=cloud_data[cloud_data['CloudCover'].isnull()].index.tolist()
result_df = pd.DataFrame(data={' Index' : nan_index,
                               'CloudCover': pred_results})

## Region 2 missing values

### Number of sales

In [None]:
df['test']