- Correlation
- Correlation at other (closest) places
- Median at other (closest) places

In [2]:
from scipy.stats import spearmanr

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from geopy.geocoders import Nominatim
import folium
from folium.plugins import HeatMap
import re
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from pandas_profiling import ProfileReport
from category_encoders.target_encoder import TargetEncoder
import umap
from sklearn.manifold import TSNE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import shap
import pickle

  from pandas_profiling import ProfileReport


In [40]:
class OutlierDetection:
    def __init__(self):
        pass
    
    def z_score(self, X, threshold=3):
        numerical = [column for column in train.columns if train.dtypes[column] == float]
        outliers = {}
        for col in numerical:
            series_ = X[col].dropna()
            mean = np.mean(series_)
            std = np.std(series_)
            outliers_ = series_.apply(lambda x: (x-mean)/std > threshold)
            outliers[col] = series_[outliers_].index.values
        return outliers

In [4]:
standard_roll_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                      'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
                      'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am',
                      'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday']
standard_roll_strategies = ['mean'] * (len(standard_roll_cols) - 1) + ['sum']
standard_roll_period = 7

In [41]:
class Preprocessor:
    def __init__(self, numerical_impute_strategy, categorical_impute_strategy, roll_cols, roll_strategies, roll_period):
        if len(roll_cols) != len(roll_strategies):
            raise ValueError('Value Error: len(roll_cols) != len(roll_strategies).')
        self.locations = None
        self.numerical_impute_strategy = numerical_impute_strategy
        self.categorical_impute_strategy = categorical_impute_strategy
        self.roll_cols = roll_cols
        self.roll_strategies = roll_strategies
        self.roll_period = roll_period

    def preprocess(self, _data, visualize=False):
        data = _data.copy(deep=True)
        
        detector = OutlierDetection()
        outliers = detector.z_score(data)
        outlier_indeces = set()
        for k in outliers.keys():
            outlier_indeces.update(outliers[k])
        
        data['is_outlier'] = 0
        data.loc[list(outlier_indeces), 'is_outlier'] = 1

        for column in ['RainToday', 'RainTomorrow']:
            data.loc[data[column] == 'Yes', column] = 1
            data.loc[data[column] == 'No', column] = 0

        data['Year'] = pd.DatetimeIndex(data['Date']).year
        data['Month'] = pd.DatetimeIndex(data['Date']).month

        if not visualize:
            self.impute(data)

        data['Location'] = data['Location'].apply(lambda loc: ' '.join(re.findall('[A-Z][^A-Z]+|[A-Z]+', loc)))
        data.loc[data.Location == 'Portland', 'Location'] = 'Portland, Victoria'
        data.loc[data.Location == 'Dartmoor', 'Location'] = 'Dartmoor, Victoria'
        data.loc[data.Location == 'Perth', 'Location'] = 'Perth, Western Australia'
        data.loc[data.Location == 'Richmond', 'Location'] = 'Richmond, New South Wales'
        data['Location'] = data.Location + ', Australia'

        if self.locations is None:
            geolocator = Nominatim(user_agent="rain-in-australia-app")
            locations = {'Location_reduced': [], 'Location': [], 'Address': [], 'Latitude': [], 'Longitude': []}
            for location in data.Location.unique().tolist() + ['Australia']:
                location_enc = geolocator.geocode(location, language='en')
                if location_enc is None:
                    raise ValueError(f'Location not found: {location}')
                locations['Location_reduced'] += [location.split(', ')[0]]
                locations['Location'] += [location]
                locations['Address'] += [location_enc.address]
                locations['Latitude'] += [location_enc.latitude]
                locations['Longitude'] += [location_enc.longitude]
            self.locations = pd.DataFrame(locations)
        data = data.merge(self.locations[['Location', 'Latitude', 'Longitude']], left_on='Location',
                          right_on='Location')

        if visualize:
            data['Location'] = data['Location'].apply(lambda loc: loc.split(', ')[0])

        data = self.RainToday_Locations(data)

        if not visualize:
            data = self.target_encoding(data)

        for i, col in enumerate(self.roll_cols):
            data[f'{col}_{self.roll_period}days'] = Preprocessor.rolling_features_for_all_locations(data[col],
                                                                                                    data.Location,
                                                                                                    period=self.roll_period,
                                                                                                    shift=1,
                                                                                                    strategy=
                                                                                                    self.roll_strategies[
                                                                                                        i],
                                                                                                    corr=False)
            data[f'{col}_{self.roll_period}days_corr'] = Preprocessor.rolling_features_for_all_locations(data[col],
                                                                                                    data.Location,
                                                                                                    period=4,
                                                                                                    shift=0,
                                                                                                    strategy=
                                                                                                    self.roll_strategies[
                                                                                                        i],
                                                                                                    corr=True)

        acc_col = [column for column in data.columns if 'RainToday_' in column and 'days' not in column or column == 'Month_Location']
        data['Accumulated_probabilities'] = data.loc[:, acc_col].sum(axis=1)
        return data

    def impute(self, data):
        columns = data.columns[2:-2]
        dtypes = data.dtypes[2:-2]
        numerical = [column for idx, column in enumerate(columns) if dtypes[idx] == float]
        categorical = [column for idx, column in enumerate(columns) if dtypes[idx] == object]
        if self.numerical_impute_strategy == 'mean':
            data[numerical] = data.groupby(['Month', 'Location'])[numerical].transform(lambda x: x.fillna(x.mean()))
        elif self.numerical_impute_strategy == 'median':
            data[numerical] = data.groupby(['Month', 'Location'])[numerical].transform(lambda x: x.fillna(x.median()))
        else:
            raise ValueError('Wrong numerical impute strategy.')

        if self.categorical_impute_strategy == 'mode':
            data[categorical] = data.groupby(['Month', 'Location'])[categorical].transform(lambda x: x.fillna(x.mode()))
        else:
            raise ValueError('Wrong categorical impute strategy.')

    def RainToday_Locations(self, data):
        grouped = data.groupby('Date')[['Location', 'RainToday']].apply(lambda r: r.set_index('Location').T)
        grouped = grouped.reset_index().drop('level_1', axis=1)
        if ',' in grouped.columns[1]:
            grouped.columns = ['Date'] + ['RainToday_' + col[:col.index(',')] for col in grouped.columns[1:]]
        else:
            grouped.columns = ['Date'] + ['RainToday_' + col for col in grouped.columns[1:]]
        return data.merge(grouped, left_on='Date', right_on='Date')

    def target_encoding(self, _data):
        data = _data.copy(deep=True)
        data['Month_Location'] = data['Month']
        location_cols = [column for column in data.columns if 'RainToday_' in column] + ['WindGustDir', 'WindDir9am',
                                                                                         'WindDir3pm', 'Month_Location']

        for location in data.Location.unique():
            location_encoder = TargetEncoder(cols=location_cols, handle_missing=0)
            location_df = data.loc[data.Location == location]
            data.loc[data.Location == location] = location_encoder.fit_transform(location_df, location_df.RainTomorrow)

        location_encoder = TargetEncoder(cols=['Location', 'Month'])
        encoded = location_encoder.fit_transform(data, data.RainTomorrow)

        for c in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
            encoded[c] = encoded[c].astype(float)
        return encoded

    @staticmethod
    def rolling_features_for_all_locations(series_, locations, period, shift, strategy, corr=False):

        def rolling_features_for_location(series_, period, shift, strategy, corr=False):
            if not corr:
                first_n = []
                for i in range(period):
                    if i == 0:
                        first_n += [series_[i]]
                    else:
                        if strategy == 'mean':
                            first_n += [series_[:i + 1 - shift].mean()]
                        elif strategy == 'median':
                            first_n += [series_[:i + 1 - shift].median()]
                        elif strategy == 'sum':
                            first_n += [series_[:i + 1 - shift].sum()]
                        else:
                            raise ValueError('Wrong strategy.')

                if strategy == 'mean':
                    new_series = series_.shift(shift).rolling(period).mean()
                elif strategy == 'median':
                    new_series = series_.shift(shift).rolling(period).median()
                elif strategy == 'sum':
                    new_series = series_.shift(shift).rolling(period).sum()
                else:
                    raise ValueError('Wrong strategy.')

                new_series[:period] = first_n

                return new_series
            else:
                return series_.shift(shift).rolling(period).corr(other=series_.index.to_series())

        all_locations = []
        for location in locations.unique():
            one_location, index = series_[locations == location].reset_index(drop=True, inplace=False), series_[
                locations == location].index.to_series()
            new_series_for_location = rolling_features_for_location(one_location, period, shift, strategy, corr=corr)
            new_series_for_location.index = index
            all_locations += [new_series_for_location]
        return pd.concat(all_locations, axis=0)

    @staticmethod
    def load_and_split():
        dataset = pd.read_csv('data/weatherAUS.csv')
        dataset.sort_values(['Date', 'Location'], inplace=True)
        dataset.reset_index(drop=True, inplace=True)
        train, test = dataset.iloc[:109103], dataset.iloc[109103:]
        return train, test


In [48]:
preprocessor = Preprocessor('median', 'mode', standard_roll_cols, standard_roll_strategies, standard_roll_period)
train, test = preprocessor.load_and_split()
train_prep = preprocessor.preprocess(train, visualize=False)

Mean of empty slice
Mean of empty slice


In [49]:
corrs = [column for column in train_prep.columns if '_corr' in column]

In [51]:
train_prep['Location_str'] = train.Location

In [93]:
grouped = train_prep.groupby(['Date'])[['Location_str']+corrs].apply(lambda r: r.set_index('Location_str').T)

In [98]:
grouped.index.names = ['Date', 'Features']

In [101]:
grouped = grouped.unstack(level='Features')

In [110]:
grouped.columns = ['_'.join(tupl) for tupl in grouped.columns.values]

In [112]:
grouped.reset_index(inplace=True)

In [116]:
train_prep = train_prep.merge(grouped, left_on='Date', right_on='Date')

In [114]:
train_prep

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Cloud3pm_7days,Cloud3pm_7days_corr,Temp9am_7days,Temp9am_7days_corr,Temp3pm_7days,Temp3pm_7days_corr,RainToday_7days,RainToday_7days_corr,Accumulated_probabilities,Location_str
0,2007-11-01,0.179851,8.0,24.3,0.0,3.4,6.3,0.180401,30.0,0.213197,...,7.000000,,14.400000,,23.600000,,0.0,,9.631074,Canberra
1,2007-11-02,0.179851,14.0,26.9,3.6,4.4,9.7,0.123810,39.0,0.198196,...,7.000000,,14.400000,,23.600000,,0.0,,9.837981,Canberra
2,2007-11-03,0.179851,13.7,23.4,3.6,5.8,3.3,0.180401,85.0,0.254777,...,5.000000,,15.950000,,24.650000,,1.0,,9.837981,Canberra
3,2007-11-04,0.179851,13.3,15.5,39.8,7.2,9.1,0.180401,54.0,0.116789,...,5.666667,0.258199,15.766667,-0.360586,23.166667,-0.866063,2.0,0.774597,9.837981,Canberra
4,2007-11-05,0.179851,7.6,16.1,2.8,5.6,10.6,0.159013,50.0,0.198347,...,6.000000,0.774597,15.200000,-0.998946,20.900000,-0.906983,3.0,,9.837981,Canberra
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109098,2015-06-14,0.296763,4.2,15.6,0.8,,,0.174999,37.0,0.244444,...,,,14.200000,-0.962547,17.128571,-0.727366,,-0.258199,14.544409,Watsonia
109099,2015-06-14,0.067316,13.3,16.2,0.0,3.2,7.5,0.083045,33.0,0.039773,...,2.571429,0.698430,10.557143,0.923192,17.342857,-0.025497,0.0,,3.371986,Williamtown
109100,2015-06-14,0.157641,18.9,33.2,0.0,4.0,,0.118566,28.0,0.157859,...,3.571429,0.182574,23.342857,-0.155941,30.657143,0.059689,0.0,,8.063371,Witchcliffe
109101,2015-06-14,0.153197,5.5,16.6,0.0,,,0.247560,20.0,0.126616,...,,,7.328571,0.777015,15.114286,0.188730,0.0,,7.457778,Wollongong


In [119]:
# train_prep.drop('Date', axis=1, inplace=True)
train_prep.drop('Location_str', axis=1, inplace=True)
train_X, train_y = train_prep.drop('RainTomorrow', axis=1), train_prep.RainTomorrow
fixed_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [120]:
train_X, train_y = train_prep.drop('RainTomorrow', axis=1), train_prep.RainTomorrow
lgb_params = {

    "objective": "binary",
    "learning_rate": 0.05,
    "num_threads": 10,
    "metric": "AUC",
    "seed": 42,
   "verbose":-1,
    'class_weight': 'balanced',
    
     #regularization
    "colsample_bytree": 0.7,
    "subsample": 0.8,

    "subsample_freq": 1,
    "min_data_in_leaf": 300,

    "num_leaves":10,
    
    "n_estimators":10_000
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X, label=train_y.fillna(0), free_raw_data=False)
result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(15), lgb.log_evaluation(15)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = result["cvbooster"].best_iteration

Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 15 rounds
[15]	cv_agg's train auc: 0.862555 + 0.000782989	cv_agg's valid auc: 0.859619 + 0.00338007
[30]	cv_agg's train auc: 0.87009 + 0.000701646	cv_agg's valid auc: 0.867158 + 0.00244312
[45]	cv_agg's train auc: 0.876265 + 0.000393783	cv_agg's valid auc: 0.872893 + 0.00261023
[60]	cv_agg's train auc: 0.8811 + 0.000409208	cv_agg's valid auc: 0.877364 + 0.00253749
[75]	cv_agg's train auc: 0.885245 + 0.000505933	cv_agg's valid auc: 0.880966 + 0.00234018
[90]	cv_agg's train auc: 0.888763 + 0.000497559	cv_agg's valid auc: 0.884004 + 0.00229734
[105]	cv_agg's train auc: 0.891792 + 0.000508048	cv_agg's valid auc: 0.886465 + 0.00233728
[120]	cv_agg's train auc: 0.894441 + 0.000514297	cv_agg's valid auc: 0.888526 + 0.00230676
[135]	cv_agg's train auc: 0.896835 + 0.000488865	cv_agg's valid auc: 0.890212 + 0.00230726
[150]	cv_agg's train auc: 0.899017 + 0.000506604	cv_agg's valid auc: 0.891786 + 0.00227571
[165]	cv_agg's train auc: 0.900951 + 0

[1365]	cv_agg's train auc: 0.95506 + 0.000315412	cv_agg's valid auc: 0.915146 + 0.00221023
[1380]	cv_agg's train auc: 0.955432 + 0.000330465	cv_agg's valid auc: 0.91521 + 0.00220347
[1395]	cv_agg's train auc: 0.955802 + 0.000331877	cv_agg's valid auc: 0.915298 + 0.00220892
[1410]	cv_agg's train auc: 0.956167 + 0.000315236	cv_agg's valid auc: 0.915405 + 0.00222527
[1425]	cv_agg's train auc: 0.956519 + 0.0003077	cv_agg's valid auc: 0.915459 + 0.00225168
[1440]	cv_agg's train auc: 0.956857 + 0.000318101	cv_agg's valid auc: 0.915561 + 0.00226452
[1455]	cv_agg's train auc: 0.957224 + 0.000306451	cv_agg's valid auc: 0.915653 + 0.00229074
[1470]	cv_agg's train auc: 0.957572 + 0.000295263	cv_agg's valid auc: 0.915746 + 0.00229983
[1485]	cv_agg's train auc: 0.95792 + 0.000284453	cv_agg's valid auc: 0.915849 + 0.00228134
[1500]	cv_agg's train auc: 0.958255 + 0.000293015	cv_agg's valid auc: 0.915904 + 0.0022824
[1515]	cv_agg's train auc: 0.958596 + 0.000293644	cv_agg's valid auc: 0.916014 + 0.002

[2715]	cv_agg's train auc: 0.978753 + 0.000208806	cv_agg's valid auc: 0.920399 + 0.00228851
[2730]	cv_agg's train auc: 0.978921 + 0.000200503	cv_agg's valid auc: 0.920416 + 0.00227129
[2745]	cv_agg's train auc: 0.9791 + 0.000186867	cv_agg's valid auc: 0.920448 + 0.00227855
[2760]	cv_agg's train auc: 0.979278 + 0.000193503	cv_agg's valid auc: 0.920511 + 0.00225816
[2775]	cv_agg's train auc: 0.979464 + 0.000191412	cv_agg's valid auc: 0.920564 + 0.00225861
[2790]	cv_agg's train auc: 0.979645 + 0.000185622	cv_agg's valid auc: 0.920599 + 0.00228555
[2805]	cv_agg's train auc: 0.979815 + 0.00019092	cv_agg's valid auc: 0.920651 + 0.00230193
[2820]	cv_agg's train auc: 0.979991 + 0.000198478	cv_agg's valid auc: 0.920699 + 0.00230112
[2835]	cv_agg's train auc: 0.980163 + 0.000200858	cv_agg's valid auc: 0.920727 + 0.00230919
[2850]	cv_agg's train auc: 0.98034 + 0.000196988	cv_agg's valid auc: 0.920766 + 0.00231914
[2865]	cv_agg's train auc: 0.980516 + 0.000192915	cv_agg's valid auc: 0.920811 + 0.0

In [123]:
train_X, train_y = train_prep.drop('RainTomorrow', axis=1), train_prep.RainTomorrow
lgb_params = {

    "objective": "binary",
    "learning_rate": 0.05,
    "num_threads": 10,
    "metric": "AUC",
    "seed": 42,
   "verbose":-1,
    'class_weight': 'balanced',
    
     #regularization
    "colsample_bytree": 0.7,
    "subsample": 0.8,

    "subsample_freq": 1,
    "min_data_in_leaf": 400,

    "num_leaves":7,
    
    "n_estimators":10_000
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X, label=train_y.fillna(0), free_raw_data=False)
result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(15), lgb.log_evaluation(15)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = result["cvbooster"].best_iteration

Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 15 rounds
[15]	cv_agg's train auc: 0.857029 + 0.000996535	cv_agg's valid auc: 0.85495 + 0.00398202
[30]	cv_agg's train auc: 0.864365 + 0.000622935	cv_agg's valid auc: 0.86209 + 0.00237375
[45]	cv_agg's train auc: 0.870827 + 0.000575431	cv_agg's valid auc: 0.868293 + 0.0022728
[60]	cv_agg's train auc: 0.87547 + 0.000388953	cv_agg's valid auc: 0.872596 + 0.00244203
[75]	cv_agg's train auc: 0.879561 + 0.000392204	cv_agg's valid auc: 0.876374 + 0.00244055
[90]	cv_agg's train auc: 0.882924 + 0.000386577	cv_agg's valid auc: 0.879447 + 0.00230438
[105]	cv_agg's train auc: 0.885661 + 0.00045675	cv_agg's valid auc: 0.881816 + 0.00218791
[120]	cv_agg's train auc: 0.888076 + 0.000375932	cv_agg's valid auc: 0.883845 + 0.0022024
[135]	cv_agg's train auc: 0.890251 + 0.000348242	cv_agg's valid auc: 0.885649 + 0.00223971
[150]	cv_agg's train auc: 0.892188 + 0.000386163	cv_agg's valid auc: 0.887177 + 0.00226257
[165]	cv_agg's train auc: 0.893886 + 0.00

[1365]	cv_agg's train auc: 0.939564 + 0.000218438	cv_agg's valid auc: 0.91105 + 0.00214456
[1380]	cv_agg's train auc: 0.939881 + 0.000221021	cv_agg's valid auc: 0.911148 + 0.00215827
[1395]	cv_agg's train auc: 0.940196 + 0.000216258	cv_agg's valid auc: 0.911218 + 0.00215275
[1410]	cv_agg's train auc: 0.940519 + 0.000220539	cv_agg's valid auc: 0.911327 + 0.0021583
[1425]	cv_agg's train auc: 0.940817 + 0.000246061	cv_agg's valid auc: 0.911391 + 0.00216969
[1440]	cv_agg's train auc: 0.941125 + 0.000252972	cv_agg's valid auc: 0.911468 + 0.00219124
[1455]	cv_agg's train auc: 0.941442 + 0.00026071	cv_agg's valid auc: 0.91156 + 0.00219459
[1470]	cv_agg's train auc: 0.941756 + 0.000266481	cv_agg's valid auc: 0.911656 + 0.00222442
[1485]	cv_agg's train auc: 0.942028 + 0.000250533	cv_agg's valid auc: 0.911719 + 0.00222492
[1500]	cv_agg's train auc: 0.942315 + 0.000246988	cv_agg's valid auc: 0.911768 + 0.00221969
[1515]	cv_agg's train auc: 0.942603 + 0.000263761	cv_agg's valid auc: 0.911864 + 0.0

[2715]	cv_agg's train auc: 0.961389 + 0.000213523	cv_agg's valid auc: 0.916791 + 0.00229136
[2730]	cv_agg's train auc: 0.96157 + 0.000212574	cv_agg's valid auc: 0.916834 + 0.00229093
[2745]	cv_agg's train auc: 0.961754 + 0.00019682	cv_agg's valid auc: 0.916866 + 0.00227712
[2760]	cv_agg's train auc: 0.961947 + 0.000201005	cv_agg's valid auc: 0.916916 + 0.00228502
[2775]	cv_agg's train auc: 0.962124 + 0.00020847	cv_agg's valid auc: 0.916954 + 0.00229492
[2790]	cv_agg's train auc: 0.962308 + 0.000207975	cv_agg's valid auc: 0.916993 + 0.00232611
[2805]	cv_agg's train auc: 0.962479 + 0.000216647	cv_agg's valid auc: 0.917042 + 0.00230651
[2820]	cv_agg's train auc: 0.962684 + 0.000201814	cv_agg's valid auc: 0.917079 + 0.00232329
[2835]	cv_agg's train auc: 0.962871 + 0.000206125	cv_agg's valid auc: 0.917125 + 0.00232634
[2850]	cv_agg's train auc: 0.963054 + 0.000203145	cv_agg's valid auc: 0.917154 + 0.00231071
[2865]	cv_agg's train auc: 0.963238 + 0.000207885	cv_agg's valid auc: 0.917192 + 0.

[4065]	cv_agg's train auc: 0.975189 + 0.000217811	cv_agg's valid auc: 0.919391 + 0.00230179
[4080]	cv_agg's train auc: 0.975305 + 0.000215405	cv_agg's valid auc: 0.919409 + 0.00229563
[4095]	cv_agg's train auc: 0.975429 + 0.000217403	cv_agg's valid auc: 0.919425 + 0.00230526
Early stopping, best iteration is:
[4089]	cv_agg's train auc: 0.975379 + 0.000213692	cv_agg's valid auc: 0.919433 + 0.00230683
