In [1]:
import os
import holidays
import numpy as np
import pandas as pd

from datetime import datetime, timedelta
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

In [2]:
DATA_DIR = '5001'

In [3]:
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), index_col=[0])
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), index_col=[0])
submission = pd.read_csv(os.path.join(DATA_DIR, 'sampleSubmission.csv'))

In [4]:
train['date'] = pd.to_datetime(train['date'], format="%d/%m/%Y %H:%M")
test['date'] = pd.to_datetime(test['date'], format="%d/%m/%Y %H:%M")

In [5]:
typhoon_dates = [
  '2017-06-12',
  '2017-07-23',
  '2017-08-23',
  '2017-08-27',
  '2017-10-15',
  '2018-09-16',
  '2018-09-17',
]

In [6]:
school_holidays_dates = [
    '2017-01-01',
    '2017-01-02',
    '2017-01-26',
    '2017-01-27',
    '2017-01-28',
    '2017-01-29',
    '2017-01-30',
    '2017-01-31',
    '2017-02-01',
    '2017-02-02',
    '2017-02-03',
    '2017-04-04',
    '2017-04-14',
    '2017-04-15',
    '2017-04-16',
    '2017-04-17',
    '2017-04-18',
    '2017-04-19',
    '2017-04-20',
    '2017-04-21',
    '2017-05-01',
    '2017-05-03',
    '2017-05-30',
    '2017-05-30',
    '2017-07-17',
    '2017-07-18',
    '2017-07-19',
    '2017-07-20',
    '2017-07-21',
    '2017-07-22',
    '2017-07-23',
    '2017-07-24',
    '2017-07-25',
    '2017-07-26',
    '2017-07-27',
    '2017-07-28',
    '2017-07-29',
    '2017-07-30',
    '2017-07-31',
    '2017-08-01',
    '2017-08-02',
    '2017-08-03',
    '2017-08-04',
    '2017-08-05',
    '2017-08-06',
    '2017-08-07',
    '2017-08-08',
    '2017-08-09',
    '2017-08-10',
    '2017-08-11',
    '2017-08-12',
    '2017-08-13',
    '2017-08-14',
    '2017-08-15',
    '2017-08-16',
    '2017-08-17',
    '2017-08-18',
    '2017-08-19',
    '2017-08-20',
    '2017-08-21',
    '2017-08-22',
    '2017-08-23',
    '2017-08-24',
    '2017-08-25',
    '2017-08-26',
    '2017-08-27',
    '2017-08-28',
    '2017-08-29',
    '2017-08-30',
    '2017-08-31',
    '2017-10-02',
    '2017-10-05',
    '2017-10-28',
    '2017-12-22',
    '2017-12-23',
    '2017-12-24',
    '2017-12-25',
    '2017-12-26',
    '2017-12-27',
    '2017-12-28',
    '2017-12-29',
    '2017-12-30',
    '2017-12-31',
    '2018-01-01',
    '2018-02-13',
    '2018-02-14',
    '2018-02-15',
    '2018-02-16',
    '2018-02-17',
    '2018-02-18',
    '2018-02-19',
    '2018-02-20',
    '2018-02-21',
    '2018-02-22',
    '2018-02-23',
    '2018-02-24',
    '2018-03-29',
    '2018-03-30',
    '2018-03-31',
    '2018-04-01',
    '2018-04-02',
    '2018-04-03',
    '2018-04-04',
    '2018-04-05',
    '2018-04-06',
    '2018-04-07',
    '2018-05-01',
    '2018-05-22',
    '2018-06-18',
    '2018-07-02',
    '2018-07-13',
    '2018-07-14',
    '2018-07-15',
    '2018-07-16',
    '2018-07-17',
    '2018-07-18',
    '2018-07-19',
    '2018-07-20',
    '2018-07-21',
    '2018-07-22',
    '2018-07-23',
    '2018-07-24',
    '2018-07-25',
    '2018-07-26',
    '2018-07-27',
    '2018-07-28',
    '2018-07-29',
    '2018-07-30',
    '2018-07-31',
    '2018-08-01',
    '2018-08-02',
    '2018-08-03',
    '2018-08-04',
    '2018-08-05',
    '2018-08-06',
    '2018-08-07',
    '2018-08-08',
    '2018-08-09',
    '2018-08-10',
    '2018-08-11',
    '2018-08-12',
    '2018-08-13',
    '2018-08-14',
    '2018-08-15',
    '2018-08-16',
    '2018-08-17',
    '2018-08-18',
    '2018-08-19',
    '2018-08-20',
    '2018-08-21',
    '2018-08-22',
    '2018-08-23',
    '2018-08-24',
    '2018-08-25',
    '2018-08-26',
    '2018-08-27',
    '2018-08-28',
    '2018-08-29',
    '2018-08-30',
    '2018-08-31',
    '2018-09-01',
    '2018-09-02',
    '2018-09-25',
    '2018-10-01',
    '2018-10-17',
    '2018-12-24',
    '2018-12-25',
    '2018-12-26',
    '2018-12-27',
    '2018-12-28',
    '2018-12-29',
    '2018-12-30',
    '2018-12-31',
]

In [7]:
# heavy rain dates taken from: https://www.hko.gov.hk/tc/wxinfo/climat/warndb/warndb3.shtml?opt=3&rcolor=3&start_ym=201612&end_ym=201901&submit=%E6%90%9C%E5%B0%8B
heavy_rain_datetime = [
    {'start': '2017/05/24 09:15', 'end': '2017/05/24 12:30'},
    {'start': '2017/06/13 08:45', 'end': '2017/06/13 11:50'},
    {'start': '2017/06/17 02:25', 'end': '2017/06/17 04:05'},
    {'start': '2017/07/17 20:45', 'end': '2017/07/17 22:30'},
    {'start': '2017/08/03 05:30', 'end': '2017/08/03 07:05'},
    {'start': '2018/06/08 11:30', 'end': '2018/06/08 12:30'},
    {'start': '2018/08/26 23:05', 'end': '2018/08/27 00:45'},
    {'start': '2018/08/29 18:40', 'end': '2018/08/29 21:50'},
    {'start': '2018/09/16 10:55', 'end': '2018/09/16 18:50'},
]

In [8]:
typhoons = [datetime.strptime(d, "%Y-%m-%d").date() for d in typhoon_dates]
school_holidays = [datetime.strptime(d, "%Y-%m-%d").date() for d in school_holidays_dates]

In [9]:
heavy_rains = [{'start': datetime.strptime(d['start'], "%Y/%m/%d %H:%M"), 'end': datetime.strptime(d['end'], "%Y/%m/%d %H:%M")} for d in heavy_rain_datetime]

In [10]:
holiday = holidays.HK()

In [11]:
def create_time_features(df):
    """
    create time related features
    """
    df['hour'] = df['date'].dt.hour
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofyear'] = df['date'].dt.dayofyear
    df['dayofmonth'] = df['date'].dt.day
    df['weekofyear'] = df['date'].dt.isocalendar().week
    df['weekofyear'] = df['weekofyear'].astype(int)

    return df

train = create_time_features(train)
test = create_time_features(test)

In [12]:
def is_heavy_rain(row):
    date = row.date
    for record in heavy_rains:
        if record['start'] <= date <= record['end']:
            return 1
    return 0

In [13]:
def get_event_features(df):
    """
    create holiday, typhoon and school holiday features
    """
    # holiday feature
    dates = list(df['date'])
    holidays = []
    for date in dates:
        holidays.append(int(date in holiday))
    df['holiday']=holidays

    # typhoon feature
    df['typhoon'] = df.apply(lambda x: int(x.date in typhoons), axis=1)

    # school holiday feature
    df['school_holiday'] = df.apply(lambda x: int(x.date in school_holidays), axis=1)

    # heavy rain feature
    df['heavy_rain'] = df.apply(is_heavy_rain, axis=1)

    return df

train = get_event_features(train)
test = get_event_features(test)

In [14]:
df = train.append(test).sort_values("date")

In [15]:
def get_stats_features(row):
    """
    Create statstical features:
    1. same dayofweek and hour mean
    2. same dayofweek and hour std
    3. same dayofweek and last hour mean
    4. same dayofweek and next hour mean
    5. last hour value
    6. next hour value
    """    

    hour_mean = df.speed[
        (df.dayofweek==row.dayofweek)&
        (df.hour==row.hour)&
        (df.date<row.date)
    ].mean()

    hour_std = df.speed[
        (df.dayofweek==row.dayofweek)&
        (df.hour==row.hour)&
        (df.date<row.date)
    ].std()

    last_hour_mean = df.speed[
        (df.dayofweek == row.dayofweek)&
        (df.hour == (row.date - timedelta(hours=1)).hour)&
        (df.date<row.date)
    ].mean()

    next_hour_mean = df.speed[
        (df.dayofweek == row.dayofweek)&
        (df.hour == (row.date + timedelta(hours=1)).hour)&
        (df.date<row.date)
    ].mean()
    
    if row.date-timedelta(hours=1) in df.date.values:
        last_hour_value = df.speed[df.date==row.date-timedelta(hours=1)].mean()
    else:
        last_hour_value = np.nan
    
    if row.date+timedelta(hours=1) in df.date.values:
        next_hour_value = df.speed[df.date==row.date+timedelta(hours=1)].mean()
    else:
        next_hour_value = np.nan

    
    mean_diff_from_last_hour = last_hour_mean - hour_mean
    mean_diff_from_next_hour = next_hour_mean - hour_mean

    percentage_change_from_last_hour = (hour_mean - last_hour_mean)/last_hour_mean
    percentage_change_from_next_hour = (next_hour_mean - hour_mean)/hour_mean

    return hour_mean, hour_std, last_hour_mean, next_hour_mean, last_hour_value, next_hour_value, mean_diff_from_last_hour, mean_diff_from_next_hour, percentage_change_from_last_hour, percentage_change_from_next_hour
    

In [16]:
train[[
    'hour_mean', 
    'hour_std', 
    'last_hour_mean', 
    'next_hour_mean', 
    'last_hour_value', 
    'next_hour_value', 
    'mean_diff_from_last_hour', 
    'mean_diff_from_next_hour', 
    'percentage_change_from_last_hour', 
    'percentage_change_from_next_hour'
]] = train.apply(get_stats_features, axis=1, result_type="expand")
test[[
    'hour_mean', 
    'hour_std', 
    'last_hour_mean', 
    'next_hour_mean', 
    'last_hour_value', 
    'next_hour_value', 
    'mean_diff_from_last_hour', 
    'mean_diff_from_next_hour', 
    'percentage_change_from_last_hour', 
    'percentage_change_from_next_hour'
]] = test.apply(get_stats_features, axis=1, result_type="expand")

In [17]:
temperature_2017 = pd.read_csv(os.path.join(os.path.join(DATA_DIR, 'external data'), 'CLMTEMP_KLT_2017.csv'))
temperature_2018 = pd.read_csv(os.path.join(os.path.join(DATA_DIR, 'external data'), 'CLMTEMP_KLT_2018.csv'))

In [18]:
train['temperature'] = train.apply(
    lambda x: temperature_2017.temperature[(temperature_2017.month==x.month)&(temperature_2017.day==x.date.day)].mean() if x.year==2017 else temperature_2018.temperature[(temperature_2018.month==x.month)&(temperature_2018.day==x.date.day)].mean(), axis=1
)

test['temperature'] = test.apply(
    lambda x: temperature_2017.temperature[(temperature_2017.month==x.month)&(temperature_2017.day==x.date.day)].mean() if x.year==2017 else temperature_2018.temperature[(temperature_2018.month==x.month)&(temperature_2018.day==x.date.day)].mean(), axis=1
)

In [19]:
backup_train = train.copy()
backup_test = test.copy()

In [20]:
train = backup_train.copy()
test = backup_test.copy()

In [21]:
train = train[train.date>pd.to_datetime("2017-02-01")]

In [22]:
train["hour_std"].fillna(0, inplace=True)
test["hour_std"].fillna(0, inplace=True)

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
categorical_features = ["hour", "dayofweek", "quarter", "month", "year", "dayofyear", "dayofmonth", "weekofyear", "holiday", "typhoon", "school_holiday", "heavy_rain"]

In [25]:
numerical_features = list(set(train.columns) - {"date", "speed"} - set(categorical_features))
numerical_features

['hour_std',
 'mean_diff_from_last_hour',
 'percentage_change_from_last_hour',
 'percentage_change_from_next_hour',
 'last_hour_value',
 'hour_mean',
 'mean_diff_from_next_hour',
 'last_hour_mean',
 'next_hour_mean',
 'temperature',
 'next_hour_value']

In [26]:
ss = StandardScaler()
train[numerical_features] = ss.fit_transform(
    train[numerical_features]
)

In [27]:
test[numerical_features] = ss.transform(
    test[numerical_features]
)

In [28]:
train.fillna(-999, inplace=True)
test.fillna(-999, inplace=True)

In [29]:
y = train['speed']

In [30]:
X = train[list(set(train.columns) - {'date', 'speed', 'temperature'})]

In [31]:
import lightgbm as lgb

In [32]:
lgb_model = lgb.LGBMRegressor(objective="regression", boosting="gbdt")

cv_split = ShuffleSplit(n_splits=5, train_size=0.8, test_size=0.2, random_state=42)

# parameters are eliminated by gridsearch
param_grid = {
    'n_estimators': [100],
    'colsample_bytree': [0.7],
    'max_depth': [6],
    'num_leaves': [50],
    'reg_alpha': [0.3],
    'reg_lambda': [0.3],
}

gs = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid, 
    cv=cv_split, 
    n_jobs=-1, 
    scoring='neg_mean_squared_error',
    verbose=2
)
fitted_model = gs.fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    1.2s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished


In [33]:
fitted_model.best_score_

-9.509334186235957

In [34]:
fitted_model.best_params_

{'colsample_bytree': 0.7,
 'max_depth': 6,
 'n_estimators': 100,
 'num_leaves': 50,
 'reg_alpha': 0.3,
 'reg_lambda': 0.3}

In [35]:
test['speed'] = fitted_model.predict(test[list(set(train.columns) - {'date', 'speed', 'temperature'})])

In [36]:
test['id'] = test.index

In [37]:
test[['id', 'speed']].head()

Unnamed: 0_level_0,id,speed
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,47.797558
1,1,47.356355
2,2,40.079177
3,3,31.639457
4,4,41.816361


In [38]:
test[['id', 'speed']].to_csv(os.path.join(DATA_DIR, 'lightgbm.csv'), index=False)