In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from geopy.geocoders import Nominatim
import folium
from folium.plugins import HeatMap
import re
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from pandas_profiling import ProfileReport
from category_encoders.target_encoder import TargetEncoder
import umap
from sklearn.manifold import TSNE
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import shap
import pickle

  from pandas_profiling import ProfileReport


In [2]:
dataset = pd.read_csv('data/weatherAUS.csv')

In [3]:
dataset.sort_values(['Date', 'Location'], inplace=True)
dataset.reset_index(drop=True, inplace=True)

In [4]:
train, test = dataset.iloc[:109103], dataset.iloc[109103:]

In [7]:
numerical = [column for column in train.columns if train.dtypes[column] == float]

In [10]:
class OutlierDetection:
    def __init__(self):
        pass
    
    def z_score(self, X, threshold=3):
        numerical = [column for column in train.columns if train.dtypes[column] == float]
        outliers = {}
        for col in numerical:
            series_ = X[col].dropna()
            mean = np.mean(series_)
            std = np.std(series_)
            outliers_ = series_.apply(lambda x: (x-mean)/std > threshold)
            outliers[col] = series_[outliers_].index.values
        return outliers
    
#     #possible outliers
#     def frequency(self, X, first_n=2, return_counts=False):
#         categorical = [column for column in train.columns if train.dtypes[column] == object]
#         outliers = {}
#         counts = {}
#         for col in categorical:
#             outliers_ = X[col].value_counts().sort_values(ascending=True)
#             outliers[col] = outliers_.index[:first_n].values
#             if return_counts:
#                 counts[col] = outliers_[:first_n]
#         if return_counts:
#             return outliers, counts
#         else:
#             return outliers

In [42]:
detector = OutlierDetection()
outliers = detector.z_score(train, threshold=3.0)

In [43]:
outlier_indeces = set()
for k in outliers.keys():
    outlier_indeces.update(outliers[k])

In [44]:
train.loc[list(outlier_indeces), 'RainTomorrow'].value_counts()

No     2253
Yes    2130
Name: RainTomorrow, dtype: int64

In [82]:
class Preprocessor:
    def __init__(self, numerical_impute_strategy, categorical_impute_strategy, roll_cols, roll_strategies, roll_period):
        if len(roll_cols) != len(roll_strategies):
            raise ValueError('Value Error: len(roll_cols) != len(roll_strategies).')
        self.locations = None
        self.numerical_impute_strategy = numerical_impute_strategy
        self.categorical_impute_strategy = categorical_impute_strategy
        self.roll_cols = roll_cols
        self.roll_strategies = roll_strategies
        self.roll_period = roll_period

    def preprocess(self, _data, visualize=False):
        data = _data.copy(deep=True)
        
        detector = OutlierDetection()
        outliers = detector.z_score(data)
        outlier_indeces = set()
        for k in outliers.keys():
            outlier_indeces.update(outliers[k])
        
        data['is_outlier'] = 0
        data.loc[list(outlier_indeces), 'is_outlier'] = 1

        for column in ['RainToday', 'RainTomorrow']:
            data.loc[data[column] == 'Yes', column] = 1
            data.loc[data[column] == 'No', column] = 0

        data['Year'] = pd.DatetimeIndex(data['Date']).year
        data['Month'] = pd.DatetimeIndex(data['Date']).month

        if not visualize:
            self.impute(data)

        data['Location'] = data['Location'].apply(lambda loc: ' '.join(re.findall('[A-Z][^A-Z]+|[A-Z]+', loc)))
        data.loc[data.Location == 'Portland', 'Location'] = 'Portland, Victoria'
        data.loc[data.Location == 'Dartmoor', 'Location'] = 'Dartmoor, Victoria'
        data.loc[data.Location == 'Perth', 'Location'] = 'Perth, Western Australia'
        data.loc[data.Location == 'Richmond', 'Location'] = 'Richmond, New South Wales'
        data['Location'] = data.Location + ', Australia'

        if self.locations is None:
            geolocator = Nominatim(user_agent="rain-in-australia-app")
            locations = {'Location_reduced': [], 'Location': [], 'Address': [], 'Latitude': [], 'Longitude': []}
            for location in data.Location.unique().tolist() + ['Australia']:
                location_enc = geolocator.geocode(location, language='en')
                if location_enc is None:
                    raise ValueError(f'Location not found: {location}')
                locations['Location_reduced'] += [location.split(', ')[0]]
                locations['Location'] += [location]
                locations['Address'] += [location_enc.address]
                locations['Latitude'] += [location_enc.latitude]
                locations['Longitude'] += [location_enc.longitude]
            self.locations = pd.DataFrame(locations)
        data = data.merge(self.locations[['Location', 'Latitude', 'Longitude']], left_on='Location',
                          right_on='Location')

        if visualize:
            data['Location'] = data['Location'].apply(lambda loc: loc.split(', ')[0])

        data = self.RainToday_Locations(data)

        if not visualize:
            data = self.target_encoding(data)

        for i, col in enumerate(self.roll_cols):
            data[f'{col}_{self.roll_period}days'] = Preprocessor.rolling_features_for_all_locations(data[col],
                                                                                                    data.Location,
                                                                                                    period=self.roll_period,
                                                                                                    shift=1,
                                                                                                    strategy=
                                                                                                    self.roll_strategies[
                                                                                                        i])

        acc_col = [column for column in data.columns if 'RainToday_' in column and 'days' not in column or column == 'Month_Location']
        data['Accumulated_probabilities'] = data.loc[:, acc_col].sum(axis=1)
        if not visualize:
            data = self.__relevant_locations(data=data, _data=_data)
        return data
    
    def __relevant_locations(self, data, _data):
        
        def cond(x):
            return x[1] > 0 and ('RainToday_' in x[0] and 'days' not in x[0])
        
        with open('relevant_locations.pickle', 'rb') as handle:
            relevant_locations = pickle.load(handle)
        
        # Top 5 locations with highest shap-values for each location:
        _relevant_locations = {}
        for loc in relevant_locations.keys():
            l = list(relevant_locations[loc][np.apply_along_axis(cond, 1, relevant_locations[loc])])
            l.sort(key=lambda x: x[1], reverse=True)
            _relevant_locations[loc] = np.array(l)[:5, 0]
                
        data['Relevant_locations_probabilities'] = None
        for loc in _relevant_locations.keys():
            data.loc[_data.Location == loc, 'Relevant_locations_probabilities'] = data.loc[_data.Location == loc, _relevant_locations[loc]].sum(axis=1)
        
        raintodaycols = [column for column in data.columns if 'RainToday_' in column and 'days' not in column]
        train_raintoday = self.preprocess(_data, visualize=True)[raintodaycols]
        closest_locations = {}
        for loc in relevant_locations.keys():
            selected = train_raintoday.loc[(_data.Location == loc) & (_data.RainTomorrow == 'Yes'), raintodaycols].sum(axis=0).sort_values(ascending=False).index[:7]
            closest_locations[loc] = selected
        
        data['Closest_locations_probabilities'] = None
        for loc in closest_locations.keys():
            data.loc[_data.Location == loc, 'Closest_locations_probabilities'] = data.loc[_data.Location == loc, closest_locations[loc]].sum(axis=1)
        
        data['Closest_locations_probabilities'] = data['Closest_locations_probabilities'].astype(float)
        data['Relevant_locations_probabilities'] = data['Relevant_locations_probabilities'].astype(float)
        
        return data


    def impute(self, data):
        columns = data.columns[2:-2]
        dtypes = data.dtypes[2:-2]
        numerical = [column for idx, column in enumerate(columns) if dtypes[idx] == float]
        categorical = [column for idx, column in enumerate(columns) if dtypes[idx] == object]
        if self.numerical_impute_strategy == 'mean':
            data[numerical] = data.groupby(['Month', 'Location'])[numerical].transform(lambda x: x.fillna(x.mean()))
        elif self.numerical_impute_strategy == 'median':
            data[numerical] = data.groupby(['Month', 'Location'])[numerical].transform(lambda x: x.fillna(x.median()))
        else:
            raise ValueError('Wrong numerical impute strategy.')

        if self.categorical_impute_strategy == 'mode':
            data[categorical] = data.groupby(['Month', 'Location'])[categorical].transform(lambda x: x.fillna(x.mode()))
        else:
            raise ValueError('Wrong categorical impute strategy.')

    def RainToday_Locations(self, data):
        grouped = data.groupby('Date')[['Location', 'RainToday']].apply(lambda r: r.set_index('Location').T)
        grouped = grouped.reset_index().drop('level_1', axis=1)
        if ',' in grouped.columns[1]:
            grouped.columns = ['Date'] + ['RainToday_' + col[:col.index(',')] for col in grouped.columns[1:]]
        else:
            grouped.columns = ['Date'] + ['RainToday_' + col for col in grouped.columns[1:]]
        return data.merge(grouped, left_on='Date', right_on='Date')

    def target_encoding(self, _data):
        data = _data.copy(deep=True)
        data['Month_Location'] = data['Month']
        location_cols = [column for column in data.columns if 'RainToday_' in column] + ['WindGustDir', 'WindDir9am',
                                                                                         'WindDir3pm', 'Month_Location']

        for location in data.Location.unique():
            location_encoder = TargetEncoder(cols=location_cols, handle_missing=0)
            location_df = data.loc[data.Location == location]
            data.loc[data.Location == location] = location_encoder.fit_transform(location_df, location_df.RainTomorrow)

        location_encoder = TargetEncoder(cols=['Location', 'Month'])
        encoded = location_encoder.fit_transform(data, data.RainTomorrow)

        for c in ['WindGustDir', 'WindDir9am', 'WindDir3pm']:
            encoded[c] = encoded[c].astype(float)
        return encoded

    @staticmethod
    def rolling_features_for_all_locations(series_, locations, period, shift, strategy):

        def rolling_features_for_location(series_, period, shift, strategy):
            first_n = []
            for i in range(period):
                if i == 0:
                    first_n += [series_[i]]
                else:
                    if strategy == 'mean':
                        first_n += [series_[:i + 1 - shift].mean()]
                    elif strategy == 'median':
                        first_n += [series_[:i + 1 - shift].median()]
                    elif strategy == 'sum':
                        first_n += [series_[:i + 1 - shift].sum()]
                    else:
                        raise ValueError('Wrong strategy.')

            if strategy == 'mean':
                new_series = series_.shift(shift).rolling(period).mean()
            elif strategy == 'median':
                new_series = series_.shift(shift).rolling(period).median()
            elif strategy == 'sum':
                new_series = series_.shift(shift).rolling(period).sum()
            else:
                raise ValueError('Wrong strategy.')

            new_series[:period] = first_n
            return new_series

        all_locations = []
        for location in locations.unique():
            one_location, index = series_[locations == location].reset_index(drop=True, inplace=False), series_[
                locations == location].index.to_series()
            new_series_for_location = rolling_features_for_location(one_location, period, shift, strategy)
            new_series_for_location.index = index
            all_locations += [new_series_for_location]
        return pd.concat(all_locations, axis=0)

    @staticmethod
    def load_and_split():
        dataset = pd.read_csv('data/weatherAUS.csv')
        dataset.sort_values(['Date', 'Location'], inplace=True)
        dataset.reset_index(drop=True, inplace=True)
        train, test = dataset.iloc[:109103], dataset.iloc[109103:]
        return train, test


In [83]:
standard_roll_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
                      'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
                      'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am',
                      'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday']
standard_roll_strategies = ['mean'] * (len(standard_roll_cols) - 1) + ['sum']
standard_roll_period = 7

In [84]:
preprocessor = Preprocessor('median', 'mode', standard_roll_cols, standard_roll_strategies, standard_roll_period)
train, test = preprocessor.load_and_split()
train_prep = preprocessor.preprocess(train, visualize=False)

Mean of empty slice
Mean of empty slice


In [87]:
raintodaycols = [column for column in train_prep.columns if 'RainToday_' in column and 'days' not in column]
train_prep = train_prep.drop(raintodaycols, axis=1)

In [90]:
train_prep.drop('Date', axis=1, inplace=True)
train_X, train_y = train_prep.drop('RainTomorrow', axis=1), train_prep.RainTomorrow
fixed_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [91]:
lgb_params = {

    "objective": "binary",
    "learning_rate": 0.05,
    "num_threads": 10,
    "metric": "AUC",
    "seed": 42,
   "verbose":-1,
    "class_weight": 'balanced',
    
     #regularization
    "colsample_bytree": 0.6,
    "subsample": 0.8,

    "subsample_freq": 1,
    "min_data_in_leaf": 300,

    "num_leaves":10,
    
    "n_estimators":10_000
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X, label=train_y.fillna(0), free_raw_data=False)
result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(15), lgb.log_evaluation(15)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = result["cvbooster"].best_iteration

Found `n_estimators` in params. Will use it instead of argument


Training until validation scores don't improve for 15 rounds
[15]	cv_agg's train auc: 0.862568 + 0.000484577	cv_agg's valid auc: 0.85999 + 0.00314946
[30]	cv_agg's train auc: 0.869261 + 0.000560715	cv_agg's valid auc: 0.866452 + 0.00284573
[45]	cv_agg's train auc: 0.875386 + 0.000479688	cv_agg's valid auc: 0.872191 + 0.00271171
[60]	cv_agg's train auc: 0.879677 + 0.000501246	cv_agg's valid auc: 0.87617 + 0.002485
[75]	cv_agg's train auc: 0.883302 + 0.000449105	cv_agg's valid auc: 0.879502 + 0.0025331
[90]	cv_agg's train auc: 0.886566 + 0.000390567	cv_agg's valid auc: 0.882426 + 0.00256822
[105]	cv_agg's train auc: 0.889097 + 0.000396595	cv_agg's valid auc: 0.884639 + 0.00256166
[120]	cv_agg's train auc: 0.89112 + 0.000439255	cv_agg's valid auc: 0.886264 + 0.00245596
[135]	cv_agg's train auc: 0.892837 + 0.00046287	cv_agg's valid auc: 0.887666 + 0.00239201
[150]	cv_agg's train auc: 0.894342 + 0.000452479	cv_agg's valid auc: 0.888874 + 0.00239297
[165]	cv_agg's train auc: 0.895604 + 0.000

[1365]	cv_agg's train auc: 0.932779 + 0.000482478	cv_agg's valid auc: 0.904227 + 0.00209373
[1380]	cv_agg's train auc: 0.93306 + 0.000464325	cv_agg's valid auc: 0.904265 + 0.00209926
[1395]	cv_agg's train auc: 0.933353 + 0.000465648	cv_agg's valid auc: 0.904313 + 0.00210579
[1410]	cv_agg's train auc: 0.933649 + 0.000456441	cv_agg's valid auc: 0.904353 + 0.0021244
[1425]	cv_agg's train auc: 0.933923 + 0.000459408	cv_agg's valid auc: 0.904397 + 0.00214363
[1440]	cv_agg's train auc: 0.934179 + 0.000445413	cv_agg's valid auc: 0.904422 + 0.00215086
[1455]	cv_agg's train auc: 0.934455 + 0.000445095	cv_agg's valid auc: 0.904427 + 0.00215028
[1470]	cv_agg's train auc: 0.934759 + 0.000429006	cv_agg's valid auc: 0.904478 + 0.00214985
[1485]	cv_agg's train auc: 0.935028 + 0.000432734	cv_agg's valid auc: 0.904508 + 0.00214636
[1500]	cv_agg's train auc: 0.935294 + 0.000415123	cv_agg's valid auc: 0.904557 + 0.00216972
[1515]	cv_agg's train auc: 0.93555 + 0.000415075	cv_agg's valid auc: 0.904587 + 0.