### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
# Import packages & set display options

import pandas as pd
import numpy as np
import sweetviz as sv
import textwrap
import matplotlib.pyplot as plt
import category_encoders as ce
import xgboost as xgb
import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Widen the pandas display limits (500 rows / 500 columns) and render floats
# with three decimal places.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)


In [2]:
# Function to append true labels at specified offsets from future observations of session
# dependent on whether the future session time exists in the session or not

def prep_df(uids, df, offset, result):
    """Label offset-``offset`` rows with values taken from offset-0 rows.

    For every session in ``uids`` whose 'DURATION (MINS)' exceeds ``offset``,
    the rows with TIME_OFFSET == ``offset`` are inner-joined to the
    TIME_OFFSET == 0 rows of the same session. The join key is SESSION_TIME
    floored to whole seconds on the offset rows, and floored SESSION_TIME plus
    ``60 * offset`` on the offset-0 rows. The matched offset-0
    RAIN_PERCENTAGE / WEATHER become PRED_RAIN_PERCENTAGE / PRED_WEATHER.
    Offset rows without a match are dropped.

    NOTE(review): as written, a row at time ``s`` is paired with the offset-0
    observation at ``s - 60 * offset`` (an earlier one). The surrounding
    comments speak of *future* observations -- confirm that the direction of
    the shift is the intended one.

    Args:
        uids: frame with 'SESSION_UID' and 'DURATION (MINS)' columns.
        df: observations; must contain SESSION_UID, SESSION_TIME, TIME_OFFSET,
            SESSION_TYPE, N_FORECASTS, RAIN_PERCENTAGE and WEATHER.
        offset: forecast offset in minutes (anything ``int()`` accepts).
        result: frame the labelled rows are appended to (not modified).

    Returns:
        A new frame: ``result`` followed by the labelled rows of every
        qualifying session, or ``result`` itself when no session qualifies.
    """
    offset = int(offset)
    long_enough = uids['DURATION (MINS)'] > offset

    pieces = []
    for uid in uids.loc[long_enough, 'SESSION_UID']:
        in_scope = (df['SESSION_UID'] == uid) & df['TIME_OFFSET'].isin([0, offset])
        # Explicit copy: the columns below are assigned on this subset only.
        filt_df = df[in_scope].copy()
        filt_df['SESSION_TIME'] = np.floor(filt_df['SESSION_TIME'])
        filt_df = filt_df.drop_duplicates().sort_values(by='SESSION_TIME')
        filt_df['FWD'] = filt_df['SESSION_TIME'] + (60 * offset)

        # Offset-0 observations, re-keyed by their shifted time.
        fwd = filt_df.loc[filt_df['TIME_OFFSET'] == 0,
                          ['FWD', 'RAIN_PERCENTAGE', 'WEATHER']]
        fwd = fwd.rename(columns={'FWD': 'SESSION_TIME',
                                  'WEATHER': 'PRED_WEATHER',
                                  'RAIN_PERCENTAGE': 'PRED_RAIN_PERCENTAGE'})

        curr = filt_df[filt_df['TIME_OFFSET'] == offset]

        tot = curr.merge(fwd, on=['SESSION_TIME'], how='inner')
        pieces.append(tot.drop(columns=['FWD', 'SESSION_TYPE', 'N_FORECASTS']))

    if not pieces:
        return result
    # DataFrame.append no longer exists in pandas 2.x; concatenating once is
    # also linear instead of re-copying the accumulator for every session.
    if result.shape != (0, 0):
        pieces.insert(0, result)
    return pd.concat(pieces)


# Function to utilise session forecast values for weather & precip as ground truth labels 

def offset_prep(offset_df, current_df):
    """Pair rows of one forecast offset with the offset-0 rows of the same second.

    ``current_df`` is reduced to SESSION_UID, SESSION_TIME and whatever other
    columns remain after the drop below (RAIN_PERCENTAGE and WEATHER for the
    frames built in this notebook). ``offset_df`` keeps its temperature and
    forecast columns, and its RAIN_PERCENTAGE / WEATHER are renamed to
    PRED_RAIN_PERCENTAGE / PRED_WEATHER so they can serve as labels. The two
    are inner-joined on (SESSION_UID, SESSION_TIME).

    NOTE(review): PRED_WEATHER is taken from the offset rows' WEATHER column,
    not from FORECAST_WEATHER -- confirm this is the intended label source.

    Args:
        offset_df: rows of a single TIME_OFFSET value; may be empty.
        current_df: rows with TIME_OFFSET == 0.

    Returns:
        The joined frame with a fixed column order (features, then the two
        PRED_* label columns).
    """
    # Columns supplied by the offset rows instead (or not needed at all).
    current_df = current_df.drop(columns=[
        'SESSION_TYPE', 'TIME_OFFSET', 'N_FORECASTS',
        'TRACK_TEMPERATURE', 'TRACK_TEMP_CHANGE',
        'AIR_TEMPERATURE', 'AIR_TEMP_CHANGE',
        'FORECAST_TRACK_TEMP', 'FORECAST_AIR_TEMP',
        'FORECAST_WEATHER',
    ])

    offset_df = offset_df.drop(columns=['SESSION_TYPE', 'N_FORECASTS'])
    # rename() ignores keys that are not present, so no membership test is needed.
    offset_df = offset_df.rename(columns={
        'WEATHER': 'PRED_WEATHER',
        'RAIN_PERCENTAGE': 'PRED_RAIN_PERCENTAGE',
    })

    forecast_df = current_df.merge(offset_df, on=['SESSION_UID', 'SESSION_TIME'], how='inner')
    return forecast_df[[
        'SESSION_UID', 'SESSION_TIME', 'TIME_OFFSET', 'TRACK_TEMPERATURE',
        'TRACK_TEMP_CHANGE', 'AIR_TEMPERATURE', 'AIR_TEMP_CHANGE',
        'FORECAST_TRACK_TEMP', 'FORECAST_AIR_TEMP', "FORECAST_WEATHER",
        'RAIN_PERCENTAGE', 'WEATHER', 'PRED_RAIN_PERCENTAGE', 'PRED_WEATHER',
    ]]


# Function to split the dataset into x & y, with separate datasets for the weather and
# precipitation target variables

def split_targets(df, smote):
    """Split ``df`` into feature/target pairs for the two prediction tasks.

    Both feature frames are ``df`` without the PRED_RAIN_PERCENTAGE and
    PRED_WEATHER columns. The class counts of the weather target are printed
    in every case.

    Args:
        df: frame containing PRED_RAIN_PERCENTAGE and PRED_WEATHER.
        smote: when equal to 1, the weather features/target are oversampled
            with ``SMOTE(random_state=2)``; any other value leaves them as is.
            The precipitation pair is never resampled.

    Returns:
        ``(prec_train_x, prec_train_y, weath_train_x, weath_train_y)``. After
        oversampling, the weather target is whatever ``fit_resample`` returns
        for an array input rather than a Series.
    """
    target_cols = ['PRED_RAIN_PERCENTAGE', 'PRED_WEATHER']

    prec_train_x = df.drop(columns=target_cols)
    prec_train_y = df['PRED_RAIN_PERCENTAGE']

    weath_train_x = df.drop(columns=target_cols)
    weath_train_y = df['PRED_WEATHER']

    if smote == 1:
        print('BEFORE:', Counter(weath_train_y))
        # The sampler is only built on the branch that uses it.
        # Series.ravel() is deprecated in recent pandas; to_numpy() yields the
        # same 1-D array.
        sm = SMOTE(random_state=2)
        weath_train_x, weath_train_y = sm.fit_resample(weath_train_x, weath_train_y.to_numpy())
        print('AFTER:', Counter(weath_train_y))
    else:
        print('NO SMOTE:', Counter(weath_train_y))
    return prec_train_x, prec_train_y, weath_train_x, weath_train_y


In [3]:
# Load the dataset from its .csv file (index_col=False: no column is used as
# the index).
df = pd.read_csv(
    'weather.csv',
    index_col=False,
    low_memory=False,
)

In [4]:
# Clean column names for readability: strip the "M_" prefix. The pattern is
# anchored at the start of the name, so an "_M_" inside a name is left alone.

df.columns = df.columns.str.replace(pat=r'^M_', repl='', regex=True)
# names = list(df.columns)
# print(names, end='')


In [5]:
# Columns dropped up front, based on these assumptions:
# 1. They are only relevant as game settings (AI difficulty, DRSASSIST, etc).
# 2. There is not enough information to incorporate them into the shipped
#    product: columns referencing pit stop windows, safety cars etc lack the
#    additional information needed to make use of them (such as for strategy).
# 3. They are time/session related identifiers (such as the season/session/
#    weekend link identifiers) for which no relationship could be established
#    with the limited data available.

unused_columns = [
    'PACKET_FORMAT', 'GAME_MAJOR_VERSION', 'PACKET_VERSION',
    'PACKET_ID', 'SECONDARY_PLAYER_CAR_INDEX', 'SLI_PRO_NATIVE_SUPPORT',
    'SAFETY_CAR_STATUS', 'DRSASSIST', 'STEERING_ASSIST',
    'AI_DIFFICULTY', 'NETWORK_GAME', 'PIT_RELEASE_ASSIST',
    'BRAKING_ASSIST', 'GAME_MINOR_VERSION', 'ERSASSIST',
    'PIT_ASSIST', 'GEARBOX_ASSIST', 'DYNAMIC_RACING_LINE',
    'DYNAMIC_RACING_LINE_TYPE', 'PIT_SPEED_LIMIT', 'SPECTATOR_CAR_INDEX',
    'FRAME_IDENTIFIER', 'GAMEHOST', 'ZONE_START',
    'ZONE_FLAG', 'PIT_STOP_REJOIN_POSITION', 'NUM_MARSHAL_ZONES',
    'SEASON_LINK_IDENTIFIER', 'WEEKEND_LINK_IDENTIFIER', 'SESSION_LINK_IDENTIFIER',
    'PIT_STOP_WINDOW_LATEST_LAP', 'Unnamed: 58', 'TIMESTAMP',
    'FORMULA', 'PLAYER_CAR_INDEX', 'TOTAL_LAPS',
    'TRACK_LENGTH',
]
df1 = df.drop(columns=unused_columns)

In [6]:
# Row-wise filtering & cleaning, based on some assumptions:
# - rows without forecast samples or with SESSION_TYPE 0 are dropped;
# - only forecasts for the row's own session type are kept, which removes rows
#   that are potentially forecasting a future session's or day's weather;
# - paused packets are omitted.
has_forecasts = df1['NUM_WEATHER_FORECAST_SAMPLES'] != 0
nonzero_session_type = df1['SESSION_TYPE'] != 0
own_session_forecast = df1['SESSION_TYPE'] == df1['WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE']
not_paused = df1['GAME_PAUSED'] == 0

df1 = df1[has_forecasts & nonzero_session_type & own_session_forecast & not_paused]

# Left disabled: removal of "approximate" weather forecast settings, which,
# with some domain research, removes the dynamic nature of weather and makes
# it more 'predictable.'
#df1 = df1[df1['FORECAST_ACCURACY'] == 0] # lose 9% of vals on duplicate drop

# Columns that were only needed for the row filters above (or not at all) are
# removed, then exact duplicate rows are collapsed.
filter_only_columns = [
    'GAME_PAUSED', 'IS_SPECTATING', 'SESSION_TIME_LEFT',
    'PIT_STOP_WINDOW_IDEAL_LAP', 'SESSION_DURATION', 'FORECAST_ACCURACY',
    'WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE', 'TRACK_ID',
]
df_clean = df1.drop(columns=filter_only_columns).drop_duplicates()

In [7]:
# Collapse rows sharing the same (SESSION_UID, SESSION_TIME, TIME_OFFSET) into
# one row holding the mean of every other column. The aim is one row per
# second; rows with the same key but different feature values are averaged
# rather than kept side by side.

orig = len(df_clean)
row_key = ['SESSION_UID', 'SESSION_TIME', 'TIME_OFFSET']
df_clean = df_clean.groupby(row_key).mean()
df_clean = df_clean.reset_index()
print('original:', orig , '\ncurrent:', len(df_clean))

original: 165562 
current: 165522


In [8]:
# Reorder data: [Session/time information | Weather information | Targets]
# (any column not listed here is dropped by the selection).

column_order = [
    'SESSION_UID', 'SESSION_TIME',
    'SESSION_TYPE', 'TIME_OFFSET', 'NUM_WEATHER_FORECAST_SAMPLES',
    'TRACK_TEMPERATURE', 'TRACK_TEMPERATURE_CHANGE',
    'AIR_TEMPERATURE', 'AIR_TEMPERATURE_CHANGE',
    'WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE',
    'WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE',
    'WEATHER_FORECAST_SAMPLES_M_WEATHER',
    'RAIN_PERCENTAGE', 'WEATHER',
]
df_clean = df_clean[column_order]

# Shorter names for readability
readable_names = {
    'WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE': 'FORECAST_TRACK_TEMP',
    'WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE': 'FORECAST_AIR_TEMP',
    'WEATHER_FORECAST_SAMPLES_M_WEATHER': 'FORECAST_WEATHER',
    'NUM_WEATHER_FORECAST_SAMPLES': 'N_FORECASTS',
    'TRACK_TEMPERATURE_CHANGE': 'TRACK_TEMP_CHANGE',
    'AIR_TEMPERATURE_CHANGE': 'AIR_TEMP_CHANGE',
}
df_clean = df_clean.rename(columns=readable_names)

In [9]:
# Assign the categorical datatype to the relevant columns.
# TRACK_TEMP_CHANGE is floored first (AIR_TEMP_CHANGE is not).
df_clean['TRACK_TEMP_CHANGE'] = np.floor(df_clean['TRACK_TEMP_CHANGE'])

categorical_columns = ("WEATHER", "TRACK_TEMP_CHANGE", "AIR_TEMP_CHANGE")
df_clean = df_clean.astype({name: "category" for name in categorical_columns})

# Experimentation conducted with One Hot Encoder & feature standardization.
# This was ultimately omitted due to negligible difference seen in the prediction
# error.
# OHE = ce.OneHotEncoder(cols=['TRACK_TEMP_CHANGE',
#                              'AIR_TEMP_CHANGE'], use_cat_names=True)

# df_clean = OHE.fit_transform(df_clean)

# df_clean[['AIR_TEMPERATURE', 'TRACK_TEMPERATURE', 'FORECAST_AIR_TEMP',
#           'FORECAST_TRACK_TEMP']] = StandardScaler().fit_transform(df_clean[['AIR_TEMPERATURE', 'TRACK_TEMPERATURE', 
#                                                                              'FORECAST_AIR_TEMP','FORECAST_TRACK_TEMP']])

In [10]:
# Build one row per session with its first and last SESSION_TIME and the span
# between them in minutes, longest session first. This both helps to understand
# the sessional data and provides the list of sessions iterated over in the
# following steps.

time_by_session = df_clean.groupby(['SESSION_UID'], sort=False)['SESSION_TIME']
top = time_by_session.max().to_frame().reset_index()
bot = time_by_session.min().to_frame().reset_index()

# bot contributes SESSION_TIME_x (minimum), top SESSION_TIME_y (maximum).
times = bot.merge(top, on=['SESSION_UID'], how='left')
times = times.rename(columns={'SESSION_TIME_x': 'MIN_TIME (SECS)',
                              'SESSION_TIME_y': 'MAX_TIME (SECS)'})

span_secs = times['MAX_TIME (SECS)'] - times['MIN_TIME (SECS)']
times['DURATION (MINS)'] = span_secs/60

times = (
    times.sort_values(by='DURATION (MINS)', ascending=False)
         .reset_index(drop=True)
)

#times.head(5)

In [11]:
# Build one labelled dataset per forecast offset (5, 10, 15, 30 and 60) with
# prep_df, each starting from an empty frame. Only sessions longer than the
# offset are considered, and observations for which no label can be found
# within the same session are omitted.

data_5 = prep_df(times, df_clean, 5, pd.DataFrame())
data_10 = prep_df(times, df_clean, 10, pd.DataFrame())
data_15 = prep_df(times, df_clean, 15, pd.DataFrame())
data_30 = prep_df(times, df_clean, 30, pd.DataFrame())
data_60 = prep_df(times, df_clean, 60, pd.DataFrame())

# Row count of each dataset, in offset order.
print(*(len(frame) for frame in (data_5, data_10, data_15, data_30, data_60)))

13534 8164 2242 1089 667


In [12]:
# Quick EDA: how imbalanced are the label classes/values at each offset?

labelled_frames = (data_5, data_10, data_15, data_30, data_60)

# Weather label counts; a blank line follows the last one.
weather_counts = [Counter(frame['PRED_WEATHER']) for frame in labelled_frames]
for counts in weather_counts[:-1]:
    print(counts)
print(weather_counts[-1],'\n')

# Rain-percentage label counts.
for frame in labelled_frames:
    print(Counter(frame['PRED_RAIN_PERCENTAGE']))

Counter({0.0: 9671, 1.0: 3181, 2.0: 682})
Counter({0.0: 6046, 1.0: 1685, 2.0: 433})
Counter({0.0: 1911, 1.0: 331})
Counter({0.0: 1089})
Counter({0.0: 667}) 

Counter({1.0: 3410, 3.0: 2347, 4.0: 2212, 2.0: 2025, 12.0: 989, 21.0: 682, 5.0: 553, 11.0: 373, 13.0: 358, 10.0: 328, 8.0: 127, 6.0: 88, 2.5: 22, 14.0: 20})
Counter({1.0: 2445, 4.0: 1481, 2.0: 1122, 3.0: 947, 12.0: 717, 21.0: 433, 5.0: 334, 11.0: 252, 8.0: 179, 13.0: 164, 10.0: 77, 2.5: 7, 9.0: 6})
Counter({2.0: 938, 1.0: 795, 10.0: 261, 5.0: 107, 6.0: 62, 3.0: 58, 8.0: 15, 9.0: 6})
Counter({2.0: 754, 1.0: 242, 5.0: 93})
Counter({2.0: 432, 1.0: 235})


In [13]:
# Design choice for the underpopulated & imbalanced offsets (namely 15, 30 and
# 60): rows built by offset_prep are added as a further source of ground truth
# for weather & precipitation. The idea is to introduce more diversity amongst
# the classes at the longer horizons, to reduce the likelihood of ending up
# with a single class classifier -- the dataset is already limited and lacks
# certain weather types after the filtering.

current = df_clean.loc[df_clean['TIME_OFFSET'] == 0]

offset_15 = df_clean.loc[df_clean['TIME_OFFSET'] == 15]
offset_30 = df_clean.loc[df_clean['TIME_OFFSET'] == 30]
offset_60 = df_clean.loc[df_clean['TIME_OFFSET'] == 60]

# For offset 15 only the rows labelled with weather class 2 are added.
forecast_15 = offset_prep(offset_15, current)
forecast_15 = forecast_15.loc[forecast_15['PRED_WEATHER'] == 2]
forecast_30 = offset_prep(offset_30, current)
forecast_60 = offset_prep(offset_60, current)

data_15n = pd.concat([data_15, forecast_15])
data_30n = pd.concat([data_30, forecast_30])
data_60n = pd.concat([data_60, forecast_60])

# Quick EDA again: degree of imbalance between classes and values now that
# the extra rows have been added for offsets 15, 30 & 60.
augmented_frames = (data_5, data_10, data_15n, data_30n, data_60n)

weather_counts = [Counter(frame['PRED_WEATHER']) for frame in augmented_frames]
for counts in weather_counts[:-1]:
    print(counts)
print(weather_counts[-1],'\n')

for frame in augmented_frames:
    print(Counter(frame['PRED_RAIN_PERCENTAGE']))


Counter({0.0: 9671, 1.0: 3181, 2.0: 682})
Counter({0.0: 6046, 1.0: 1685, 2.0: 433})
Counter({0.0: 1911, 1.0: 331, 2.0: 51})
Counter({0.0: 8553, 1.0: 5887, 2.0: 51})
Counter({0.0: 8131, 1.0: 5887, 2.0: 51}) 

Counter({1.0: 3410, 3.0: 2347, 4.0: 2212, 2.0: 2025, 12.0: 989, 21.0: 682, 5.0: 553, 11.0: 373, 13.0: 358, 10.0: 328, 8.0: 127, 6.0: 88, 2.5: 22, 14.0: 20})
Counter({1.0: 2445, 4.0: 1481, 2.0: 1122, 3.0: 947, 12.0: 717, 21.0: 433, 5.0: 334, 11.0: 252, 8.0: 179, 13.0: 164, 10.0: 77, 2.5: 7, 9.0: 6})
Counter({2.0: 938, 1.0: 795, 10.0: 261, 5.0: 107, 6.0: 62, 3.0: 58, 16.0: 37, 8.0: 15, 24.0: 9, 9.0: 6, 13.0: 4, 14.0: 1})
Counter({14.0: 5487, 4.0: 4212, 8.0: 897, 3.0: 786, 2.0: 757, 11.0: 440, 7.0: 362, 12.0: 281, 13.0: 279, 5.0: 255, 1.0: 242, 6.0: 215, 10.0: 168, 9.0: 64, 16.0: 37, 21.0: 9})
Counter({14.0: 6450, 4.0: 4804, 8.0: 1704, 2.0: 472, 1.0: 235, 11.0: 160, 10.0: 105, 6.0: 38, 18.0: 37, 5.0: 36, 3.0: 10, 15.0: 9, 12.0: 4, 19.0: 4, 2.5: 1})


In [14]:
# For every offset: split the dataset into precipitation and weather
# feature/target pairs (split_targets), then hold out 30% of each pair as a
# test set. Offsets 15, 30 and 60 -- where forecast values were added as an
# extra source of ground truth -- additionally have their weather classes
# balanced with SMOTE inside split_targets.
#
# NOTE(review): the oversampling runs before the train/test split, so the
# weather test sets can contain synthetic rows -- confirm this is acceptable
# for the reported test metrics.

def _split_70_30(features, target):
    # Same fixed 70/30 split for every dataset.
    return train_test_split(features, target, test_size=0.30,random_state=0)


print('OFFSET 5')
prec_x, prec_y, weath_x, weath_y = split_targets(data_5, smote = 0)
prec5_train_x, prec5_test_x, prec5_train_y, prec5_test_y = _split_70_30(prec_x, prec_y)
weath5_train_x, weath5_test_x, weath5_train_y, weath5_test_y = _split_70_30(weath_x, weath_y)

#######################################################################
print('\nOFFSET 10')
prec_x, prec_y, weath_x, weath_y = split_targets(data_10, smote = 0)
prec10_train_x, prec10_test_x, prec10_train_y, prec10_test_y = _split_70_30(prec_x, prec_y)
weath10_train_x, weath10_test_x, weath10_train_y, weath10_test_y = _split_70_30(weath_x, weath_y)

#######################################################################
print('\nOFFSET 15')
prec_x, prec_y, weath_x, weath_y = split_targets(data_15n, smote = 1)
prec15_train_x, prec15_test_x, prec15_train_y, prec15_test_y = _split_70_30(prec_x, prec_y)
weath15_train_x, weath15_test_x, weath15_train_y, weath15_test_y = _split_70_30(weath_x, weath_y)

#######################################################################
print('\nOFFSET 30')
prec_x, prec_y, weath_x, weath_y = split_targets(data_30n, smote = 1)
prec30_train_x, prec30_test_x, prec30_train_y, prec30_test_y = _split_70_30(prec_x, prec_y)
weath30_train_x, weath30_test_x, weath30_train_y, weath30_test_y = _split_70_30(weath_x, weath_y)

#######################################################################
print('\nOFFSET 60')
prec_x, prec_y, weath_x, weath_y = split_targets(data_60n, smote = 1)
prec60_train_x, prec60_test_x, prec60_train_y, prec60_test_y = _split_70_30(prec_x, prec_y)
weath60_train_x, weath60_test_x, weath60_train_y, weath60_test_y = _split_70_30(weath_x, weath_y)


OFFSET 5
NO SMOTE: Counter({0.0: 9671, 1.0: 3181, 2.0: 682})

OFFSET 10
NO SMOTE: Counter({0.0: 6046, 1.0: 1685, 2.0: 433})

OFFSET 15
BEFORE: Counter({0.0: 1911, 1.0: 331, 2.0: 51})
AFTER: Counter({0.0: 1911, 1.0: 1911, 2.0: 1911})

OFFSET 30
BEFORE: Counter({0.0: 8553, 1.0: 5887, 2.0: 51})
AFTER: Counter({0.0: 8553, 1.0: 8553, 2.0: 8553})

OFFSET 60
BEFORE: Counter({0.0: 8131, 1.0: 5887, 2.0: 51})
AFTER: Counter({0.0: 8131, 1.0: 8131, 2.0: 8131})


In [15]:
# Linear Regression model exploration/testing.

# model_LR = LinearRegression()

# # fit the model with the training data
# model_LR.fit(prec5_train_x, prec5_train_y)

# predict_train = model_LR.predict(prec5_train_x)
# predict_test  = model_LR.predict(prec5_test_x)

# print('RMSE on train data: ', mean_squared_error(prec5_train_y, predict_train)**(0.5))
# print('RMSE on test data: ',  mean_squared_error(prec5_test_y, predict_test)**(0.5))

# print('MAE on train data: ', mean_absolute_error(prec5_train_y, predict_train))
# print('MAE on test data: ', mean_absolute_error(prec5_test_y, predict_test))

# model_LR = LinearRegression()

# # fit the model with the training data
# model_LR.fit(weath5_train_x, weath5_train_y)

# predict_train = model_LR.predict(weath5_train_x)
# predict_test  = model_LR.predict(weath5_test_x)

# print('RMSE on train data: ', mean_squared_error(weath5_train_y, predict_train)**(0.5))
# print('RMSE on test data: ',  mean_squared_error(weath5_test_y, predict_test)**(0.5))

# print('MAE on train data: ', mean_absolute_error(weath5_train_y, predict_train))
# print('MAE on test data: ', mean_absolute_error(weath5_test_y, predict_test))

In [18]:
# Creation of models for different offset values

############################ OFFSET 5 MODELLING ############################

# Precipitation model: boosted regression on the rain-percentage target. MAE on
# the train and test matrices is reported after every boosting round.
dmat_train = xgb.DMatrix(prec5_train_x, prec5_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(prec5_test_x, prec5_test_y, enable_categorical=True)

prec5_mdl = xgb.train({'max_depth': 3, 'eta': 0.4, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

prec5_resids = pd.DataFrame({ "Actuals":prec5_test_y, "Prediction":prec5_mdl.predict(dmat_test)})
print()

# Weather model: same setup with eta 0.5. dmat_train/dmat_test are rebound to
# the weather matrices from here on.
dmat_train = xgb.DMatrix(weath5_train_x, weath5_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(weath5_test_x, weath5_test_y, enable_categorical=True)

weath5_mdl = xgb.train({'max_depth': 3, 'eta': 0.5, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

# Persist both models. `with` closes (and thereby flushes) each file instead
# of leaving the handle open.
with open("prec5_mdl.sav", 'wb') as model_file:
    pickle.dump(prec5_mdl, model_file)
with open("weath5_mdl.sav", 'wb') as model_file:
    pickle.dump(weath5_mdl, model_file)

weath5_resids = pd.DataFrame({ "Actuals":weath5_test_y, "Prediction":weath5_mdl.predict(dmat_test)})

[0]	train-mae:2.71664	test-mae:2.77235
[1]	train-mae:1.83776	test-mae:1.86492
[2]	train-mae:1.33850	test-mae:1.34981
[3]	train-mae:1.05701	test-mae:1.06011
[4]	train-mae:0.88119	test-mae:0.87769
[5]	train-mae:0.78515	test-mae:0.77815
[6]	train-mae:0.71559	test-mae:0.70629
[7]	train-mae:0.67182	test-mae:0.66054
[8]	train-mae:0.64169	test-mae:0.62837
[9]	train-mae:0.60996	test-mae:0.59861

[0]	train-mae:0.27494	test-mae:0.27624
[1]	train-mae:0.13753	test-mae:0.13819
[2]	train-mae:0.06880	test-mae:0.06913
[3]	train-mae:0.03442	test-mae:0.03458
[4]	train-mae:0.01722	test-mae:0.01730
[5]	train-mae:0.00861	test-mae:0.00865
[6]	train-mae:0.00431	test-mae:0.00433
[7]	train-mae:0.00215	test-mae:0.00217
[8]	train-mae:0.00108	test-mae:0.00108
[9]	train-mae:0.00054	test-mae:0.00054


In [19]:
############################ OFFSET 10 MODELLING ############################

# Precipitation model: boosted regression on the rain-percentage target. MAE on
# the train and test matrices is reported after every boosting round.
dmat_train = xgb.DMatrix(prec10_train_x, prec10_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(prec10_test_x, prec10_test_y, enable_categorical=True)

prec10_mdl = xgb.train({'max_depth': 3, 'eta': 0.4, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

prec10_resids = pd.DataFrame({ "Actuals":prec10_test_y, "Prediction":prec10_mdl.predict(dmat_test)})
print()

# Weather model: same setup with eta 0.5. dmat_train/dmat_test are rebound to
# the weather matrices from here on.
dmat_train = xgb.DMatrix(weath10_train_x, weath10_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(weath10_test_x, weath10_test_y, enable_categorical=True)

weath10_mdl = xgb.train({'max_depth': 3, 'eta': 0.5, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

# Persist both models. `with` closes (and thereby flushes) each file instead
# of leaving the handle open.
with open("prec10_mdl.sav", 'wb') as model_file:
    pickle.dump(prec10_mdl, model_file)
with open("weath10_mdl.sav", 'wb') as model_file:
    pickle.dump(weath10_mdl, model_file)

weath10_resids = pd.DataFrame({ "Actuals":weath10_test_y, "Prediction":weath10_mdl.predict(dmat_test)})

[0]	train-mae:2.76903	test-mae:2.79713
[1]	train-mae:1.88672	test-mae:1.90574
[2]	train-mae:1.40521	test-mae:1.41930
[3]	train-mae:1.13893	test-mae:1.14904
[4]	train-mae:0.97617	test-mae:0.99171
[5]	train-mae:0.87806	test-mae:0.88908
[6]	train-mae:0.78356	test-mae:0.78373
[7]	train-mae:0.74007	test-mae:0.73628
[8]	train-mae:0.70285	test-mae:0.70306
[9]	train-mae:0.68964	test-mae:0.69017

[0]	train-mae:0.28197	test-mae:0.28195
[1]	train-mae:0.14511	test-mae:0.14459
[2]	train-mae:0.07659	test-mae:0.07583
[3]	train-mae:0.03997	test-mae:0.03941
[4]	train-mae:0.02834	test-mae:0.02838
[5]	train-mae:0.01635	test-mae:0.01677
[6]	train-mae:0.00881	test-mae:0.00911
[7]	train-mae:0.00442	test-mae:0.00460
[8]	train-mae:0.00342	test-mae:0.00361
[9]	train-mae:0.00182	test-mae:0.00193


In [20]:
############################ OFFSET 15 MODELLING ############################

# Precipitation model: boosted regression on the rain-percentage target. MAE on
# the train and test matrices is reported after every boosting round.
dmat_train = xgb.DMatrix(prec15_train_x, prec15_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(prec15_test_x, prec15_test_y, enable_categorical=True)

prec15_mdl = xgb.train({'max_depth': 3, 'eta': 0.4, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

prec15_resids = pd.DataFrame({ "Actuals":prec15_test_y, "Prediction":prec15_mdl.predict(dmat_test)})
print()

# Weather model: same setup with eta 0.5. dmat_train/dmat_test are rebound to
# the weather matrices from here on.
dmat_train = xgb.DMatrix(weath15_train_x, weath15_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(weath15_test_x, weath15_test_y, enable_categorical=True)

weath15_mdl = xgb.train({'max_depth': 3, 'eta': 0.5, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

# Persist both models. `with` closes (and thereby flushes) each file instead
# of leaving the handle open.
with open("prec15_mdl.sav", 'wb') as model_file:
    pickle.dump(prec15_mdl, model_file)
with open("weath15_mdl.sav", 'wb') as model_file:
    pickle.dump(weath15_mdl, model_file)

weath15_resids = pd.DataFrame({ "Actuals":weath15_test_y, "Prediction":weath15_mdl.predict(dmat_test)})

[0]	train-mae:1.61809	test-mae:1.73328
[1]	train-mae:0.98818	test-mae:1.05580
[2]	train-mae:0.62101	test-mae:0.66651
[3]	train-mae:0.41499	test-mae:0.45058
[4]	train-mae:0.30380	test-mae:0.33017
[5]	train-mae:0.20482	test-mae:0.22915
[6]	train-mae:0.15499	test-mae:0.17751
[7]	train-mae:0.11985	test-mae:0.13808
[8]	train-mae:0.08909	test-mae:0.10684
[9]	train-mae:0.06812	test-mae:0.08619

[0]	train-mae:0.41926	test-mae:0.41193
[1]	train-mae:0.20979	test-mae:0.20641
[2]	train-mae:0.10497	test-mae:0.10357
[3]	train-mae:0.05253	test-mae:0.05212
[4]	train-mae:0.02628	test-mae:0.02637
[5]	train-mae:0.01315	test-mae:0.01348
[6]	train-mae:0.00658	test-mae:0.00704
[7]	train-mae:0.00329	test-mae:0.00381
[8]	train-mae:0.00165	test-mae:0.00220
[9]	train-mae:0.00082	test-mae:0.00139


In [21]:
############################ OFFSET 30 MODELLING ############################

# Precipitation model: boosted regression on the rain-percentage target. MAE on
# the train and test matrices is reported after every boosting round.
dmat_train = xgb.DMatrix(prec30_train_x, prec30_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(prec30_test_x, prec30_test_y, enable_categorical=True)

prec30_mdl = xgb.train({'max_depth': 3, 'eta': 0.4, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

prec30_resids = pd.DataFrame({ "Actuals":prec30_test_y, "Prediction":prec30_mdl.predict(dmat_test)})
print()

# Weather model: same setup with eta 0.5. dmat_train/dmat_test are rebound to
# the weather matrices from here on.
dmat_train = xgb.DMatrix(weath30_train_x, weath30_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(weath30_test_x, weath30_test_y, enable_categorical=True)

weath30_mdl = xgb.train({'max_depth': 3, 'eta': 0.5, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

# Persist both models. `with` closes (and thereby flushes) each file instead
# of leaving the handle open.
with open("prec30_mdl.sav", 'wb') as model_file:
    pickle.dump(prec30_mdl, model_file)
with open("weath30_mdl.sav", 'wb') as model_file:
    pickle.dump(weath30_mdl, model_file)

weath30_resids = pd.DataFrame({ "Actuals":weath30_test_y, "Prediction":weath30_mdl.predict(dmat_test)})

[0]	train-mae:4.89518	test-mae:4.83910
[1]	train-mae:2.95365	test-mae:2.91768
[2]	train-mae:1.81672	test-mae:1.79528
[3]	train-mae:1.13046	test-mae:1.11807
[4]	train-mae:0.72406	test-mae:0.71664
[5]	train-mae:0.46872	test-mae:0.46384
[6]	train-mae:0.32267	test-mae:0.32022
[7]	train-mae:0.23867	test-mae:0.23632
[8]	train-mae:0.17014	test-mae:0.16895
[9]	train-mae:0.13470	test-mae:0.13450

[0]	train-mae:0.41670	test-mae:0.41688
[1]	train-mae:0.20838	test-mae:0.20848
[2]	train-mae:0.10421	test-mae:0.10426
[3]	train-mae:0.05212	test-mae:0.05214
[4]	train-mae:0.02606	test-mae:0.02608
[5]	train-mae:0.01303	test-mae:0.01304
[6]	train-mae:0.00652	test-mae:0.00652
[7]	train-mae:0.00326	test-mae:0.00326
[8]	train-mae:0.00163	test-mae:0.00163
[9]	train-mae:0.00081	test-mae:0.00082


In [22]:
############################ OFFSET 60 MODELLING ############################

# Precipitation model: boosted regression on the rain-percentage target. MAE on
# the train and test matrices is reported after every boosting round.
dmat_train = xgb.DMatrix(prec60_train_x, prec60_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(prec60_test_x, prec60_test_y, enable_categorical=True)

prec60_mdl = xgb.train({'max_depth': 3, 'eta': 0.4, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

prec60_resids = pd.DataFrame({ "Actuals":prec60_test_y, "Prediction":prec60_mdl.predict(dmat_test)})
print()

# Weather model: same setup with eta 0.5. dmat_train/dmat_test are rebound to
# the weather matrices from here on.
dmat_train = xgb.DMatrix(weath60_train_x, weath60_train_y, enable_categorical=True)
dmat_test = xgb.DMatrix(weath60_test_x, weath60_test_y, enable_categorical=True)

weath60_mdl = xgb.train({'max_depth': 3, 'eta': 0.5, 'objective': 'reg:squarederror', "eval_metric": "mae"},
                     dmat_train, evals=[(dmat_train, "train"), (dmat_test, "test")])

# Persist both models. `with` closes (and thereby flushes) each file instead
# of leaving the handle open.
with open("prec60_mdl.sav", 'wb') as model_file:
    pickle.dump(prec60_mdl, model_file)
with open("weath60_mdl.sav", 'wb') as model_file:
    pickle.dump(weath60_mdl, model_file)

weath60_resids = pd.DataFrame({ "Actuals":weath60_test_y, "Prediction":weath60_mdl.predict(dmat_test)})

[0]	train-mae:5.20665	test-mae:5.12806
[1]	train-mae:3.12866	test-mae:3.08196
[2]	train-mae:1.88999	test-mae:1.86295
[3]	train-mae:1.14673	test-mae:1.13122
[4]	train-mae:0.69452	test-mae:0.68486
[5]	train-mae:0.42435	test-mae:0.41643
[6]	train-mae:0.26258	test-mae:0.25686
[7]	train-mae:0.17917	test-mae:0.17498
[8]	train-mae:0.11722	test-mae:0.11423
[9]	train-mae:0.07484	test-mae:0.07249

[0]	train-mae:0.41597	test-mae:0.41858
[1]	train-mae:0.20803	test-mae:0.20933
[2]	train-mae:0.10404	test-mae:0.10469
[3]	train-mae:0.05203	test-mae:0.05236
[4]	train-mae:0.02602	test-mae:0.02618
[5]	train-mae:0.01301	test-mae:0.01310
[6]	train-mae:0.00651	test-mae:0.00655
[7]	train-mae:0.00326	test-mae:0.00328
[8]	train-mae:0.00163	test-mae:0.00164
[9]	train-mae:0.00081	test-mae:0.00082


In [23]:
### BUILD RESIDUALS TABLE / PLOTS
# Each name below is an Actuals-vs-Prediction frame built in the modelling
# cells (prec* = precipitation model, weath* = weather model).
# NOTE(review): these are bare expressions, and the recorded output holds a
# single table -- presumably only the last one (weath60_resids) is rendered.
# Wrap each in display()/print() if every table is wanted.

prec5_resids
weath5_resids

prec10_resids
weath10_resids 

prec15_resids
weath15_resids 

prec30_resids
weath30_resids 

prec60_resids
weath60_resids 

Unnamed: 0,Actuals,Prediction
0,2.000,1.999
1,2.000,1.999
2,1.000,1.000
3,1.000,1.000
4,0.000,0.000
...,...,...
7313,2.000,1.999
7314,0.000,0.000
7315,0.000,0.000
7316,0.000,0.000


In [26]:
########################## MODEL SUBMISSION TESTING ##########################
# The proposed models are built on the assumption that the time offset data is
# built into the dataset. That is to say, for a prediction to be made, the
# session MUST be scheduled to last for a minimum of five minutes, so that the
# game populates values for M_TIME_OFFSET beyond 0, depending on the selected
# session duration.

# The predictions are made with session duration in mind. The prediction
# pipeline will utilise any offsets between 0 and 60 to populate the forecast
# fields for different offsets. If certain offsets are unavailable due to the
# chosen length of the session, the offset will be populated with the largest
# offset values available (if offset 60 is unavailable, the model for offset 60
# will be fed the forecast values from offset 30, and so on).

# Please insert the desired .csv containing the data for the prediction.
# Currently filled with the dummy dataset "weather-Copy1.csv".

df2 = pd.read_csv('weather-Copy1.csv', index_col=False, low_memory=False)


def _load_model(path):
    """Unpickle one saved model and close the file afterwards."""
    # NOTE: unpickling can execute arbitrary code -- only load .sav files that
    # were written by this notebook or another trusted source.
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


prec5_mdl = _load_model("prec5_mdl.sav")
weath5_mdl = _load_model("weath5_mdl.sav")
prec10_mdl = _load_model("prec10_mdl.sav")
weath10_mdl = _load_model("weath10_mdl.sav")
prec15_mdl = _load_model("prec15_mdl.sav")
weath15_mdl = _load_model("weath15_mdl.sav")
prec30_mdl = _load_model("prec30_mdl.sav")
weath30_mdl = _load_model("weath30_mdl.sav")
prec60_mdl = _load_model("prec60_mdl.sav")
weath60_mdl = _load_model("weath60_mdl.sav")

In [28]:
# df3 is another name for df2 (no copy is made), so the in-place column rename
# just below is visible through df2 as well.
df3 = df2

# Strip the leading "M_" prefix from the column names, for readability and for
# compatibility with the previous model pipeline.
df3.columns = df3.columns.str.replace(pat=r'^M_', repl='', regex=True)

# Columns which are not relevant for the model.
irrelevant_columns = [
    'PACKET_FORMAT', 'GAME_MAJOR_VERSION', 'PACKET_VERSION',
    'PACKET_ID', 'SECONDARY_PLAYER_CAR_INDEX', 'SLI_PRO_NATIVE_SUPPORT',
    'SAFETY_CAR_STATUS', 'DRSASSIST', 'STEERING_ASSIST',
    'AI_DIFFICULTY', 'NETWORK_GAME', 'PIT_RELEASE_ASSIST',
    'BRAKING_ASSIST', 'GAME_MINOR_VERSION', 'ERSASSIST',
    'PIT_ASSIST', 'GEARBOX_ASSIST', 'DYNAMIC_RACING_LINE',
    'DYNAMIC_RACING_LINE_TYPE', 'PIT_SPEED_LIMIT', 'SPECTATOR_CAR_INDEX',
    'FRAME_IDENTIFIER', 'GAMEHOST', 'ZONE_START',
    'ZONE_FLAG', 'PIT_STOP_REJOIN_POSITION', 'NUM_MARSHAL_ZONES',
    'SEASON_LINK_IDENTIFIER', 'WEEKEND_LINK_IDENTIFIER', 'SESSION_LINK_IDENTIFIER',
    'PIT_STOP_WINDOW_LATEST_LAP', 'Unnamed: 58', 'TIMESTAMP',
    'FORMULA', 'PLAYER_CAR_INDEX', 'TOTAL_LAPS',
    'TRACK_LENGTH', 'PIT_STOP_WINDOW_IDEAL_LAP', 'GAME_PAUSED',
    'FORECAST_ACCURACY', 'TRACK_ID', 'SESSION_TYPE',
    'WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE', 'SESSION_DURATION', 'IS_SPECTATING',
    'SESSION_TIME_LEFT',
]
df3 = df3.drop(columns=irrelevant_columns)

# Shorter column names, again for readability and for compatibility with the
# previous model pipeline.
readable_names = {
    'WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE': 'FORECAST_TRACK_TEMP',
    'WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE': 'FORECAST_AIR_TEMP',
    'WEATHER_FORECAST_SAMPLES_M_WEATHER': 'FORECAST_WEATHER',
    'NUM_WEATHER_FORECAST_SAMPLES': 'N_FORECASTS',
    'TRACK_TEMPERATURE_CHANGE': 'TRACK_TEMP_CHANGE',
    'AIR_TEMPERATURE_CHANGE': 'AIR_TEMP_CHANGE',
}
df3 = df3.rename(columns=readable_names)

# N_FORECASTS is dropped without filtering on it first:
#df3 = df3[df3['N_FORECASTS'] > 0]
df3 = df3.drop(columns='N_FORECASTS')

# Reorder dataset [done for QA purposes]
feature_order = [
    'SESSION_UID', 'SESSION_TIME', 'TIME_OFFSET', 'TRACK_TEMPERATURE',
    'TRACK_TEMP_CHANGE', 'AIR_TEMPERATURE', 'AIR_TEMP_CHANGE',
    'FORECAST_TRACK_TEMP', 'FORECAST_AIR_TEMP', 'FORECAST_WEATHER',
    'RAIN_PERCENTAGE', 'WEATHER',
]
df3 = df3[feature_order]

In [29]:
# Prediction pipeline for one session.
#
# Produces, for each forecast horizon (5, 10, 15, 30, 60), a rounded weather-type
# prediction (weath<N>_pred) and a rain-percentage prediction array
# (prec<N>_pred) from the rows of the session's first recorded second.

# Session IDs present in the dataset. The first one is used below; change the
# index if another session is desired.
target_session = df3['SESSION_UID'].unique()
if len(target_session) == 0:
    raise ValueError('df3 contains no sessions - nothing to predict on')

target_df = df3[df3['SESSION_UID'] == target_session[0]]

# Keep only the rows whose offset is one of the supported values (0 to 60).
target_df = target_df[target_df['TIME_OFFSET'].isin([0, 5, 10, 15, 30, 60])]
if target_df.empty:
    raise ValueError('Target session has no rows with a supported TIME_OFFSET')

# Locate target/initial second in the target session.
target_df = target_df.sort_values(by=['SESSION_TIME'])
target_sec = target_df[target_df['SESSION_TIME'] == target_df['SESSION_TIME'].iloc[0]]

# Number of distinct offsets available at that second - drives the fallback
# logic below.
n_horizon = target_sec['TIME_OFFSET'].nunique()
if n_horizon == 0:
    raise ValueError('No rows found for the initial second of the target session')

# One row per offset, ordered by offset. De-duplicating on TIME_OFFSET (rather
# than taking the first n_horizon rows) guarantees that positional row k is the
# k-th smallest offset even if an offset is repeated within the same second.
target = (target_sec.drop_duplicates(subset=['TIME_OFFSET'])
                    .sort_values(by=['TIME_OFFSET']))

if n_horizon == 1:
    print('Session Length is incompatible (<5 minutes)')

# (horizon, weather model, rain model), in increasing horizon order.
horizon_models = [
    (5, weath5_mdl, prec5_mdl),
    (10, weath10_mdl, prec10_mdl),
    (15, weath15_mdl, prec15_mdl),
    (30, weath30_mdl, prec30_mdl),
    (60, weath60_mdl, prec60_mdl),
]

# The i-th horizon is predicted from the i-th offset row when it exists,
# otherwise from the row with the largest offset available (the last one).
weather_preds = {}
rain_preds = {}
for position, (horizon, weather_mdl, rain_mdl) in enumerate(horizon_models, start=1):
    source_row = min(position, n_horizon - 1)
    features = xgb.DMatrix(target.iloc[[source_row]])
    # Weather type is reported as the rounded first prediction value.
    weather_preds[horizon] = round(weather_mdl.predict(features)[0])
    # Rain percentage is kept as the raw prediction array (indexed downstream).
    rain_preds[horizon] = rain_mdl.predict(features)

# Expose the results under the names used by the following cells.
weath5_pred = weather_preds[5]
weath10_pred = weather_preds[10]
weath15_pred = weather_preds[15]
weath30_pred = weather_preds[30]
weath60_pred = weather_preds[60]
prec5_pred = rain_preds[5]
prec10_pred = rain_preds[10]
prec15_pred = rain_preds[15]
prec30_pred = rain_preds[30]
prec60_pred = rain_preds[60]


Session Length is incompatible (<5 minutes)


In [30]:
# Assemble the final forecast: one entry per horizon, holding the predicted
# weather type and the first element of the rain-percentage prediction.
output = {
    horizon: {'type': weather_type, 'rain_percentage': rain_prediction[0]}
    for horizon, weather_type, rain_prediction in (
        (5, weath5_pred, prec5_pred),
        (10, weath10_pred, prec10_pred),
        (15, weath15_pred, prec15_pred),
        (30, weath30_pred, prec30_pred),
        (60, weath60_pred, prec60_pred),
    )
}

output

{5: {'type': 0, 'rain_percentage': 2.4603262},
 10: {'type': 1, 'rain_percentage': 5.6672416},
 15: {'type': 2, 'rain_percentage': 15.719559},
 30: {'type': 0, 'rain_percentage': 3.7601113},
 60: {'type': 0, 'rain_percentage': 5.0989795}}