In [1]:
from numerapi import NumerAPI
from pathlib import Path
import os

In [2]:
print('Downloading dataset files...')
napi = NumerAPI()
current_round = napi.get_current_round()
Path("./v4").mkdir(parents=False, exist_ok=True)
if not os.path.exists("v4/train.parquet"):
    napi.download_dataset("v4/train.parquet")
if not os.path.exists("v4/validation.parquet"):
    napi.download_dataset("v4/validation.parquet")
if not os.path.exists(f"v4/live_{current_round}.parquet"):
    napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
if not os.path.exists("v4/validation_example_preds.parquet"):
    napi.download_dataset("v4/validation_example_preds.parquet")
if not os.path.exists("v4/features.json"):
    napi.download_dataset("v4/features.json")

Downloading dataset files...


In [3]:
# read the feature metadata and get a feature set (or all the features)
import json
print('Reading minimal training data')
with open("v4/features.json", "r") as f:
    feature_metadata = json.load(f)

Reading minimal training data


In [4]:
# features = list(feature_metadata["feature_stats"].keys()) # get all the features
# features = feature_metadata["feature_sets"]["small"] # get the small feature set
features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
features[:10]

['feature_abating_unadaptable_weakfish',
 'feature_ablest_mauritanian_elding',
 'feature_acclimatisable_unfeigned_maghreb',
 'feature_accommodable_crinite_cleft',
 'feature_acetose_periotic_coronation',
 'feature_additive_untrustworthy_hierologist',
 'feature_adsorbed_blizzardy_burlesque',
 'feature_affettuoso_taxidermic_greg',
 'feature_afoul_valvate_faery',
 'feature_agaze_lancinate_zohar']

In [5]:
# load training data
import pandas as pd
target = "target"
columns = features + [target]
training_data = pd.read_parquet('v4/train.parquet', columns=columns)

In [6]:
training_data.keys()

Index(['feature_abating_unadaptable_weakfish',
       'feature_ablest_mauritanian_elding',
       'feature_acclimatisable_unfeigned_maghreb',
       'feature_accommodable_crinite_cleft',
       'feature_acetose_periotic_coronation',
       'feature_additive_untrustworthy_hierologist',
       'feature_adsorbed_blizzardy_burlesque',
       'feature_affettuoso_taxidermic_greg', 'feature_afoul_valvate_faery',
       'feature_agaze_lancinate_zohar',
       ...
       'feature_weightiest_protozoic_brawler', 'feature_wetter_unbaffled_loma',
       'feature_wheezier_unjaundiced_game',
       'feature_wistful_tussive_cycloserine', 'feature_witchy_orange_muley',
       'feature_wombed_liberatory_malva', 'feature_wrathful_prolix_colotomy',
       'feature_wrought_muckier_temporality', 'feature_yelled_hysteretic_eath',
       'target'],
      dtype='object', length=473)

In [7]:
training_data = training_data.head(100000)
training_data.head()

Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,feature_agaze_lancinate_zohar,...,feature_weightiest_protozoic_brawler,feature_wetter_unbaffled_loma,feature_wheezier_unjaundiced_game,feature_wistful_tussive_cycloserine,feature_witchy_orange_muley,feature_wombed_liberatory_malva,feature_wrathful_prolix_colotomy,feature_wrought_muckier_temporality,feature_yelled_hysteretic_eath,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0.0,1.0,0.0,1.0,0.0,0.25,1.0,0.0,0.75,0.0,...,1.0,0.0,0.5,1.0,0.0,0.0,1.0,1.0,0.5,0.25
n003bee128c2fcfc,1.0,0.5,0.5,0.5,0.75,0.25,0.75,0.5,0.5,0.75,...,0.0,0.75,0.0,0.75,0.75,0.75,0.0,0.5,0.75,0.75
n0048ac83aff7194,1.0,1.0,0.5,0.0,0.0,1.0,0.25,0.75,0.5,0.5,...,0.5,0.75,0.25,0.0,0.0,0.25,0.0,0.75,0.25,0.5
n00691bec80d3e02,0.25,1.0,0.25,0.25,0.0,0.5,0.25,0.5,1.0,0.25,...,0.5,1.0,0.0,0.0,0.5,0.25,0.5,0.75,0.5,0.75
n00b8720a2fdc4f2,0.0,0.5,0.0,0.0,0.0,0.75,0.25,0.0,0.5,0.0,...,0.0,0.5,0.25,0.75,0.25,0.25,0.0,0.0,0.25,0.75


In [21]:
from pycaret.classification import *
reg101 = setup(data = training_data, target = 'target', normalize = True, session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(100000, 473)"
5,Missing Values,False
6,Numeric Features,472
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


2022-06-17 15:21:00,281 INFO logs: create_model_container: 0
2022-06-17 15:21:00,282 INFO logs: master_model_container: 0
2022-06-17 15:21:00,282 INFO logs: display_container: 1
2022-06-17 15:21:00,285 INFO logs: Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=[], target='target',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_strat...
                                                  target='target')),
     

In [25]:
best = compare_models(errors="raise")

2022-06-17 15:35:11,236 INFO logs: Initializing compare_models()
2022-06-17 15:35:11,237 INFO logs: compare_models(include=None, fold=None, round=4, cross_validation=True, sort=Accuracy, n_select=1, budget_time=None, turbo=True, errors=raise, fit_kwargs=None, groups=None, experiment_custom_tags=None, probability_threshold=None, verbose=True, display=None, exclude=None)
2022-06-17 15:35:11,238 INFO logs: Checking exceptions
2022-06-17 15:35:11,240 INFO logs: Preparing display monitor
2022-06-17 15:35:11,240 INFO logs: Preparing display monitor


IntProgress(value=0, description='Processing: ', max=84)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)


2022-06-17 15:35:11,258 INFO logs: Initializing Logistic Regression
2022-06-17 15:35:11,258 INFO logs: Total runtime is 6.953875223795573e-06 minutes
2022-06-17 15:35:11,263 INFO logs: Initializing create_model()
2022-06-17 15:35:11,264 INFO logs: create_model(estimator=lr, fold=StratifiedKFold(n_splits=10, random_state=None, shuffle=False), round=4, cross_validation=True, predict=True, fit_kwargs={}, groups=None, refit=False, verbose=False, system=False, metrics=None, experiment_custom_tags=None, add_to_model_list=True, probability_threshold=None, display=<pycaret.internal.Display.Display object at 0x7f1a6d149100>, return_train_score=False, kwargs={})
2022-06-17 15:35:11,264 INFO logs: Checking exceptions
2022-06-17 15:35:11,264 INFO logs: Importing libraries
2022-06-17 15:35:11,265 INFO logs: Copying training dataset
2022-06-17 15:35:11,281 INFO logs: Defining folds
2022-06-17 15:35:11,282 INFO logs: Declaring metric variables
2022-06-17 15:35:11,286 INFO logs: Importing untrained mo

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.

In [19]:
model_name = "lightgbm"

In [20]:
model = create_model(model_name)

2022-06-17 15:16:12,788 INFO logs: Initializing create_model()
2022-06-17 15:16:12,790 INFO logs: create_model(estimator=lightgbm, fold=None, round=4, cross_validation=True, predict=True, fit_kwargs=None, groups=None, refit=True, verbose=True, system=True, metrics=None, experiment_custom_tags=None, add_to_model_list=True, probability_threshold=None, display=None, return_train_score=False, kwargs={})
2022-06-17 15:16:12,790 INFO logs: Checking exceptions
2022-06-17 15:16:12,791 INFO logs: Preparing display monitor


IntProgress(value=0, description='Processing: ', max=4)

Unnamed: 0,Fold,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


2022-06-17 15:16:12,803 INFO logs: Importing libraries
2022-06-17 15:16:12,804 INFO logs: Copying training dataset
2022-06-17 15:16:12,820 INFO logs: Defining folds
2022-06-17 15:16:12,821 INFO logs: Declaring metric variables
2022-06-17 15:16:12,825 INFO logs: Importing untrained model
2022-06-17 15:16:12,830 INFO logs: Light Gradient Boosting Machine Imported succesfully
2022-06-17 15:16:12,837 INFO logs: Starting cross validation
2022-06-17 15:16:12,838 INFO logs: Cross validating with StratifiedKFold(n_splits=10, random_state=None, shuffle=False), n_jobs=-1


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.