
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
### HACKMAKERS FORMULA AI HACKATHON 2022

### JUPYTER NOTEBOOK SUBMISSION FOR TEAM 57B:
### ALEC ZHANG, DAVID ZHOU, DILAN DE SILVA,
### THOMAS CRAWLEY, ZOHAIB QAZI

### NOTEBOOK IS CREATED FOR CONDUCTING PREDICTIONS
### ALSO FOUND IN FINAL 3 CHUNKS OF Untitled.ipynb

# Import dependencies & set display options

import pandas as pd
import numpy as np
import sweetviz as sv
import textwrap
import matplotlib.pyplot as plt
import category_encoders as ce
import xgboost as xgb
import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Pandas display settings for this notebook: show up to 500 rows and 500
# columns, and render floats with three decimal places.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.float_format', lambda x: '%.3f' % x),
):
    pd.set_option(option_name, option_value)

In [2]:
########################## MODEL SUBMISSION TESTING ##########################
# The proposed models are built on the assumption that the time offset data is
# built into the dataset. That is to say, for a prediction to be made, the session
# duration MUST be scheduled to last for a minimum of five minutes. Thus, the
# game will populate values for the time-offset column (TIME_OFFSET once the
# "M_" prefix is stripped) beyond 0, depending on the selected session duration.

# The predictions are made with session duration in mind. The prediction pipeline
# will utilise any offsets between 0 and 60 to populate the forecast fields for
# different offsets. If certain offsets are unavailable due to the chosen length
# of the session, the offset will be populated with the largest offset values
# available (if offset 60 is unavailable, the model for offset 60 will populate
# with forecast values from offset 30, and so on).

# Please insert desired .csv containing data regarding the prediction. Currently
# filled with dummy dataset "weather-Copy1.csv"

df2 = pd.read_csv('~/weather-Copy1.csv', index_col=False, low_memory=False)


def _load_pickled_model(path):
    """Return the object unpickled from ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the model has been read, instead of staying open until garbage collection.

    SECURITY: ``pickle.load`` can execute arbitrary code contained in the file.
    Only load model files that come from a trusted source.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Load all pickled models for different offset horizons and labels
# (prec* = rain percentage models, weath* = weather type models).
prec5_mdl = _load_pickled_model("models/prec5_mdl.sav")
weath5_mdl = _load_pickled_model("models/weath5_mdl.sav")
prec10_mdl = _load_pickled_model("models/prec10_mdl.sav")
weath10_mdl = _load_pickled_model("models/weath10_mdl.sav")
prec15_mdl = _load_pickled_model("models/prec15_mdl.sav")
weath15_mdl = _load_pickled_model("models/weath15_mdl.sav")
prec30_mdl = _load_pickled_model("models/prec30_mdl.sav")
weath30_mdl = _load_pickled_model("models/weath30_mdl.sav")
prec60_mdl = _load_pickled_model("models/prec60_mdl.sav")
weath60_mdl = _load_pickled_model("models/weath60_mdl.sav")

In [3]:
# Build the prediction frame `df3` from the raw frame `df2` without modifying
# `df2`: every step below returns a new DataFrame, so this cell can be re-run
# safely and the raw data keeps its original column names.

# Columns which are not relevant for the model (names are given as they appear
# after the leading "M_" prefix has been stripped).
unused_columns = ['PACKET_FORMAT', 'GAME_MAJOR_VERSION', 'PACKET_VERSION',
                  'PACKET_ID', 'SECONDARY_PLAYER_CAR_INDEX', 'SLI_PRO_NATIVE_SUPPORT',
                  'SAFETY_CAR_STATUS', 'DRSASSIST', 'STEERING_ASSIST', 'AI_DIFFICULTY',
                  'NETWORK_GAME', 'PIT_RELEASE_ASSIST', 'BRAKING_ASSIST',
                  'GAME_MINOR_VERSION', 'ERSASSIST', 'PIT_ASSIST', 'GEARBOX_ASSIST',
                  'DYNAMIC_RACING_LINE', 'DYNAMIC_RACING_LINE_TYPE', 'PIT_SPEED_LIMIT',
                  'SPECTATOR_CAR_INDEX', 'FRAME_IDENTIFIER', 'GAMEHOST', 'ZONE_START',
                  'ZONE_FLAG', 'PIT_STOP_REJOIN_POSITION', 'NUM_MARSHAL_ZONES',
                  'SEASON_LINK_IDENTIFIER', 'WEEKEND_LINK_IDENTIFIER',
                  'SESSION_LINK_IDENTIFIER', 'PIT_STOP_WINDOW_LATEST_LAP',
                  'Unnamed: 58', 'TIMESTAMP', 'FORMULA', 'PLAYER_CAR_INDEX',
                  'TOTAL_LAPS', 'TRACK_LENGTH', 'PIT_STOP_WINDOW_IDEAL_LAP',
                  'GAME_PAUSED', 'FORECAST_ACCURACY', 'TRACK_ID', 'SESSION_TYPE',
                  'WEATHER_FORECAST_SAMPLES_M_SESSION_TYPE', 'SESSION_DURATION',
                  'IS_SPECTATING', 'SESSION_TIME_LEFT']

# Shorter names for ease of readability & for compatibility with the previous
# model pipeline.
readable_names = {'WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE': 'FORECAST_TRACK_TEMP',
                  'WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE': 'FORECAST_AIR_TEMP',
                  'WEATHER_FORECAST_SAMPLES_M_WEATHER': 'FORECAST_WEATHER',
                  'NUM_WEATHER_FORECAST_SAMPLES': 'N_FORECASTS',
                  'TRACK_TEMPERATURE_CHANGE': 'TRACK_TEMP_CHANGE',
                  'AIR_TEMPERATURE_CHANGE': 'AIR_TEMP_CHANGE'}

# Final column order [done for QA purposes]. Selecting these columns raises a
# KeyError if any of them is missing from the supplied CSV.
model_columns = ['SESSION_UID', 'SESSION_TIME', 'TIME_OFFSET', 'TRACK_TEMPERATURE',
                 'TRACK_TEMP_CHANGE', 'AIR_TEMPERATURE', 'AIR_TEMP_CHANGE',
                 'FORECAST_TRACK_TEMP', 'FORECAST_AIR_TEMP', 'FORECAST_WEATHER',
                 'RAIN_PERCENTAGE', 'WEATHER']

df3 = (
    df2
    # Strip the leading "M_" prefix from column names (same effect as the
    # regex r'^M_' -> '').
    .rename(columns=lambda name: name[2:] if name.startswith('M_') else name)
    # errors='ignore': a supplied CSV may not contain every column listed here
    # (e.g. the export artefact 'Unnamed: 58'); none of them is needed below.
    .drop(columns=unused_columns, errors='ignore')
    .rename(columns=readable_names)
    # A row filter on N_FORECASTS > 0 used to sit here and is disabled; the
    # column itself is not a model input.
    .drop(columns='N_FORECASTS', errors='ignore')
)

df3 = df3[model_columns]

In [4]:
# Takes session ID (assuming only one session ID is present in the dataset) - but
# can take alternatives below
target_session = df3['SESSION_UID'].unique()
if len(target_session) == 0:
    raise ValueError('The prediction dataset contains no sessions (no SESSION_UID values)')

# takes first ranked session ID (can be modified as required) if other sessions are
# desired
target_df = df3[df3['SESSION_UID'] == target_session[0]]

# Locate offsets between 0 and 60 in the provided data, according to the target session
target_df = target_df[target_df['TIME_OFFSET'].isin([0, 5, 10, 15, 30, 60])]
if target_df.empty:
    raise ValueError('The target session has no rows with TIME_OFFSET in 0, 5, 10, 15, 30 or 60')

# Locate target/initial second in the target session
target_df = target_df.sort_values(by=['SESSION_TIME'])
target_sec = target_df[target_df['SESSION_TIME'] == target_df['SESSION_TIME'].iloc[0]]

# Identify number of offsets available to the session - required for logic related to
# prediction pipeline
n_horizon = target_sec['TIME_OFFSET'].nunique()

# Keep exactly one row per available offset, ordered by offset. De-duplicating
# BEFORE selecting rows matters: if the first second holds several rows with
# the same offset, taking the first n_horizon rows could return repeated
# offsets and omit others, so the positions used below would point at the
# wrong offset.
target = (
    target_sec
    .drop_duplicates(subset='TIME_OFFSET')
    .sort_values(by=['TIME_OFFSET'])
)

# Messages describing the session length implied by the number of available
# offsets, and which predictions rely on a fallback row.
session_messages = {
    6: ['Session Length is >= 60 minutes in duration'],
    5: ['Session Length is 30-60 minutes in duration',
        'Confidence for the 60 minute prediction is reduced'],
    4: ['Session Length is 15-30 minutes in duration',
        'Confidence for the 30 & 60 minutes prediction is reduced'],
    3: ['Session Length is 10-15 minutes in duration',
        'Confidence for the 15, 30 and 60 minute predictions is reduced'],
    2: ['Session Length is 5-10 minutes in duration',
        'Confidence for the predictions over 5 minutes is reduced'],
    1: ['Session Length is <5 minutes in duration',
        'Confidence for all predictions is reduced'],
}
for message in session_messages[n_horizon]:
    print(message)

# Forecast horizons (minutes) with their weather-type and rain-percentage
# models, in the same order as the rows of `target` after the offset-0 row.
horizon_models = [
    (5, weath5_mdl, prec5_mdl),
    (10, weath10_mdl, prec10_mdl),
    (15, weath15_mdl, prec15_mdl),
    (30, weath30_mdl, prec30_mdl),
    (60, weath60_mdl, prec60_mdl),
]

# Prediction pipeline. Each horizon is predicted from the row at its own
# position in `target` (5 -> row 1, 10 -> row 2, ...). When the session is too
# short to provide that row, the position is clamped to the last row, i.e. the
# largest offset available. Rows are addressed by position, so this assumes
# the available offsets are the shortest ones with no gaps (0, 5, 10, ...).
weather_preds = {}
rain_preds = {}
for position, (minutes, weather_mdl, rain_mdl) in enumerate(horizon_models, start=1):
    features = xgb.DMatrix(target.iloc[[min(position, n_horizon - 1)]])
    # Weather type: round the first prediction value to the nearest integer.
    weather_preds[minutes] = round(weather_mdl.predict(features)[0])
    # Rain percentage: keep the raw prediction array (indexed with [0] when
    # the output dictionary is built).
    rain_preds[minutes] = rain_mdl.predict(features)

# Expose the results under the per-horizon names used to build the output.
weath5_pred, prec5_pred = weather_preds[5], rain_preds[5]
weath10_pred, prec10_pred = weather_preds[10], rain_preds[10]
weath15_pred, prec15_pred = weather_preds[15], rain_preds[15]
weath30_pred, prec30_pred = weather_preds[30], rain_preds[30]
weath60_pred, prec60_pred = weather_preds[60], rain_preds[60]


Session Length is <5 minutes in duration
Confidence for all predictions is reduced


In [5]:
# Build output dictionary for prediction as outlined in documentation: one
# entry per forecast horizon (in minutes), holding the weather type value and
# the first element of the corresponding rain-percentage prediction.
horizon_results = (
    (5, weath5_pred, prec5_pred),
    (10, weath10_pred, prec10_pred),
    (15, weath15_pred, prec15_pred),
    (30, weath30_pred, prec30_pred),
    (60, weath60_pred, prec60_pred),
)

output = {
    minutes: {'type': weather_type, 'rain_percentage': rain_prediction[0]}
    for minutes, weather_type, rain_prediction in horizon_results
}

output

{5: {'type': 0, 'rain_percentage': 2.4603262},
 10: {'type': 1, 'rain_percentage': 5.6672416},
 15: {'type': 2, 'rain_percentage': 15.719559},
 30: {'type': 0, 'rain_percentage': 3.7601113},
 60: {'type': 0, 'rain_percentage': 5.0989795}}