In [1]:
from numerapi import NumerAPI
from pathlib import Path
import os

In [2]:
print('Downloading dataset files...')
napi = NumerAPI()
current_round = napi.get_current_round()
Path("./v4").mkdir(parents=False, exist_ok=True)
if not os.path.exists("v4/train.parquet"):
    napi.download_dataset("v4/train.parquet")
if not os.path.exists("v4/validation.parquet"):
    napi.download_dataset("v4/validation.parquet")
if not os.path.exists(f"v4/live_{current_round}.parquet"):
    napi.download_dataset("v4/live.parquet", f"v4/live_{current_round}.parquet")
if not os.path.exists("v4/validation_example_preds.parquet"):
    napi.download_dataset("v4/validation_example_preds.parquet")
if not os.path.exists("v4/features.json"):
    napi.download_dataset("v4/features.json")

Downloading dataset files...


In [3]:
# read the feature metadata and get a feature set (or all the features)
import json
print('Reading minimal training data')
with open("v4/features.json", "r") as f:
    feature_metadata = json.load(f)

Reading minimal training data


In [4]:
# features = list(feature_metadata["feature_stats"].keys()) # get all the features
# features = feature_metadata["feature_sets"]["small"] # get the small feature set
features = feature_metadata["feature_sets"]["medium"] # get the medium feature set
features[:10]

['feature_abating_unadaptable_weakfish',
 'feature_ablest_mauritanian_elding',
 'feature_acclimatisable_unfeigned_maghreb',
 'feature_accommodable_crinite_cleft',
 'feature_acetose_periotic_coronation',
 'feature_additive_untrustworthy_hierologist',
 'feature_adsorbed_blizzardy_burlesque',
 'feature_affettuoso_taxidermic_greg',
 'feature_afoul_valvate_faery',
 'feature_agaze_lancinate_zohar']

In [5]:
# load training data
import pandas as pd
target = "target"
columns = features + [target]
training_data = pd.read_parquet('v4/train.parquet', columns=columns)

In [6]:
len(training_data)

2420521

In [7]:
training_data = training_data.dropna()

In [None]:
len(training_data)

In [8]:
training_data = training_data.head(100000)
training_data.head()

Unnamed: 0_level_0,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_acetose_periotic_coronation,feature_additive_untrustworthy_hierologist,feature_adsorbed_blizzardy_burlesque,feature_affettuoso_taxidermic_greg,feature_afoul_valvate_faery,feature_agaze_lancinate_zohar,...,feature_weightiest_protozoic_brawler,feature_wetter_unbaffled_loma,feature_wheezier_unjaundiced_game,feature_wistful_tussive_cycloserine,feature_witchy_orange_muley,feature_wombed_liberatory_malva,feature_wrathful_prolix_colotomy,feature_wrought_muckier_temporality,feature_yelled_hysteretic_eath,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0.0,1.0,0.0,1.0,0.0,0.25,1.0,0.0,0.75,0.0,...,1.0,0.0,0.5,1.0,0.0,0.0,1.0,1.0,0.5,0.25
n003bee128c2fcfc,1.0,0.5,0.5,0.5,0.75,0.25,0.75,0.5,0.5,0.75,...,0.0,0.75,0.0,0.75,0.75,0.75,0.0,0.5,0.75,0.75
n0048ac83aff7194,1.0,1.0,0.5,0.0,0.0,1.0,0.25,0.75,0.5,0.5,...,0.5,0.75,0.25,0.0,0.0,0.25,0.0,0.75,0.25,0.5
n00691bec80d3e02,0.25,1.0,0.25,0.25,0.0,0.5,0.25,0.5,1.0,0.25,...,0.5,1.0,0.0,0.0,0.5,0.25,0.5,0.75,0.5,0.75
n00b8720a2fdc4f2,0.0,0.5,0.0,0.0,0.0,0.75,0.25,0.0,0.5,0.0,...,0.0,0.5,0.25,0.75,0.25,0.25,0.0,0.0,0.25,0.75


In [9]:
# load test data
import pandas as pd
target = "target"
columns = features + [target]
test_data = pd.read_parquet('v4/validation.parquet', columns=columns)
len(test_data)

2245885

In [10]:
test_data = test_data.dropna()
len(test_data)

2219626

In [None]:
from pycaret.classification import *
reg101 = setup(data = training_data, test_data=test_data,target = 'target', normalize = True, session_id=123)

2022-06-17 20:32:23,073 INFO logs: PyCaret Supervised Module
2022-06-17 20:32:23,090 INFO logs: ML Usecase: classification
2022-06-17 20:32:23,091 INFO logs: version 2.3.10
2022-06-17 20:32:23,091 INFO logs: Initializing setup()
2022-06-17 20:32:23,091 INFO logs: setup(target=target, ml_usecase=classification, available_plots={'parameter': 'Hyperparameters', 'auc': 'AUC', 'confusion_matrix': 'Confusion Matrix', 'threshold': 'Threshold', 'pr': 'Precision Recall', 'error': 'Prediction Error', 'class_report': 'Class Report', 'rfe': 'Feature Selection', 'learning': 'Learning Curve', 'manifold': 'Manifold Learning', 'calibration': 'Calibration Curve', 'vc': 'Validation Curve', 'dimension': 'Dimensions', 'feature': 'Feature Importance', 'feature_all': 'Feature Importance (All)', 'boundary': 'Decision Boundary', 'lift': 'Lift Chart', 'gain': 'Gain Chart', 'tree': 'Decision Tree', 'ks': 'KS Statistic Plot'}, train_size=0.7, test_data=                  feature_abating_unadaptable_weakfish  \
id

IntProgress(value=0, description='Processing: ', max=3)

2022-06-17 20:32:24,112 INFO logs: Importing libraries
2022-06-17 20:32:24,113 INFO logs: Copying data for preprocessing
2022-06-17 20:32:24,165 INFO logs: Declaring preprocessing parameters
2022-06-17 20:32:24,215 INFO logs: Creating preprocessing pipeline
2022-06-17 20:32:27,292 INFO logs: Preprocessing pipeline created successfully
2022-06-17 20:32:27,293 ERROR logs: (Process Exit): setup has been interupted with user command 'quit'. setup must rerun.
2022-06-17 20:32:27,294 INFO logs: Creating global containers
2022-06-17 20:32:27,316 INFO logs: Internal pipeline: Pipeline(memory=None, steps=[('empty_step', 'passthrough')], verbose=False)


Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
feature_abating_unadaptable_weakfish,Numeric
feature_ablest_mauritanian_elding,Numeric
feature_acclimatisable_unfeigned_maghreb,Numeric
feature_accommodable_crinite_cleft,Numeric
feature_acetose_periotic_coronation,Numeric
...,...
feature_wombed_liberatory_malva,Numeric
feature_wrathful_prolix_colotomy,Numeric
feature_wrought_muckier_temporality,Numeric
feature_yelled_hysteretic_eath,Numeric


 


In [None]:
best = compare_models(errors="raise")

In [None]:
model_name = "lightgbm"

In [None]:
model = create_model(model_name)