In [None]:
# Imports: NumerAPI (the official Python API client for Numerai) and supporting libraries
from numerapi import NumerAPI
import json
import pandas as pd
import lightgbm as lgb
import cloudpickle

# Configuration: dataset version to download and the named feature set to train on
DATA_VERSION = "v5.0"
FEATURE_SET_SIZE = "medium"

# Initialize API client (constructed without arguments; used below for listing
# and downloading datasets)
napi = NumerAPI()

# List every downloadable dataset path and derive the distinct version prefixes
# (the part before the first "/"); sorted so the order is reproducible between runs
all_datasets = napi.list_datasets()
dataset_versions = sorted({d.split("/")[0] for d in all_datasets})

# Collect the files available for download for the configured version
current_version_files = [f for f in all_datasets if f.startswith(DATA_VERSION)]

# Download and load feature metadata; the context manager guarantees the file
# handle is closed instead of leaking it via a bare json.load(open(...))
features_path = f"{DATA_VERSION}/features.json"
napi.download_dataset(features_path)
with open(features_path, encoding="utf-8") as features_file:
    feature_metadata = json.load(features_file)

# Pick the list of feature column names belonging to the configured feature set
feature_sets = feature_metadata["feature_sets"]
feature_set = feature_sets[FEATURE_SET_SIZE]

# Fetch the training split and read only the era, target and selected feature columns
train_path = f"{DATA_VERSION}/train.parquet"
napi.download_dataset(train_path)
train_columns = ["era", "target"] + feature_set
train = pd.read_parquet(train_path, columns=train_columns)

# Keep every 4th era (in order of first appearance) to reduce memory usage and
# speed up model training
kept_train_eras = train["era"].unique()[::4]
train = train[train["era"].isin(kept_train_eras)]

# LightGBM regressor hyperparameters
lgbm_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 2**5 - 1,
    "colsample_bytree": 0.1,
}
model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the selected feature columns against the "target" column
model.fit(train[feature_set], train["target"])

# Fetch the validation split, reading the selected features plus the "data_type" column
validation_path = f"{DATA_VERSION}/validation.parquet"
napi.download_dataset(validation_path)
validation_columns = ["era", "data_type", "target"] + feature_set
validation = pd.read_parquet(validation_path, columns=validation_columns)

# Keep only rows labelled "validation", then drop the marker column that is no longer needed
is_validation_row = validation["data_type"] == "validation"
validation = validation[is_validation_row]
del validation["data_type"]

# Keep every 4th validation era (in order of first appearance)
kept_validation_eras = validation["era"].unique()[::4]
validation = validation[validation["era"].isin(kept_validation_eras)]

# Embargo to avoid data leakage: exclude eras numbered last_train_era .. last_train_era + 3,
# formatted as zero-padded 4-character strings to match the "era" column values.
# NOTE(review): the list starts at offset 0 (the last training era itself), so at most
# three later eras are removed here -- confirm this is the intended embargo width.
last_train_era = int(train["era"].unique()[-1])
eras_to_embargo = [str(last_train_era + offset).zfill(4) for offset in range(4)]
is_embargoed = validation["era"].isin(eras_to_embargo)
validation = validation[~is_embargoed]

# Score the remaining validation rows with the trained model
validation["prediction"] = model.predict(validation[feature_set])

# Install and import scoring tools
!pip install -q --no-deps numerai-tools
from numerai_tools.scoring import numerai_corr, correlation_contribution

# Download the meta model predictions and attach them to the validation frame.
# The path is a plain string (no placeholders, so no f-string) defined once and
# reused for both the download and the read. The file is pinned to the v4.3
# dataset at round 842. The column is assigned by index alignment, so validation
# rows absent from the file are expected to hold NaN in "meta_model".
META_MODEL_PATH = "v4.3/meta_model.parquet"
napi.download_dataset(META_MODEL_PATH, round_num=842)
validation["meta_model"] = pd.read_parquet(META_MODEL_PATH)["numerai_meta_model"]

# Per-era CORR: numerai_corr is called once per era on the prediction column and
# the target, with NaNs dropped from each of the two inputs separately
per_era_corr = validation.groupby("era").apply(
    lambda x: numerai_corr(x[["prediction"]].dropna(), x["target"].dropna())
)

# Per-era MMC: rows containing any NaN are dropped first, then
# correlation_contribution is called once per era with the prediction column,
# the meta model column and the target
per_era_mmc = validation.dropna().groupby("era").apply(
    lambda x: correlation_contribution(x[["prediction"]], x["meta_model"], x["target"])
)

# Per-era bar charts: both metrics share the same appearance settings
bar_style = dict(kind="bar", figsize=(8, 4), xticks=[], legend=False, snap=False)
per_era_corr.plot(title="Validation CORR", **bar_style)
per_era_mmc.plot(title="Validation MMC", **bar_style)

# Cumulative line charts: running sum of each per-era metric
line_style = dict(kind="line", figsize=(8, 4), legend=False)
per_era_corr.cumsum().plot(title="Cumulative Validation CORR", **line_style)
per_era_mmc.cumsum().plot(title="Cumulative Validation MMC", **line_style)

def _max_drawdown(per_era_scores):
    """Return the largest gap between the running peak of the cumulative score and the cumulative score."""
    cumulative = per_era_scores.cumsum()
    running_peak = cumulative.expanding(min_periods=1).max()
    return (running_peak - cumulative).max()


# Summary statistics for CORR: mean, population std (ddof=0), mean/std ratio, max drawdown
corr_mean = per_era_corr.mean()
corr_std = per_era_corr.std(ddof=0)
corr_sharpe = corr_mean / corr_std
corr_max_drawdown = _max_drawdown(per_era_corr)

# Same statistics for MMC
mmc_mean = per_era_mmc.mean()
mmc_std = per_era_mmc.std(ddof=0)
mmc_sharpe = mmc_mean / mmc_std
mmc_max_drawdown = _max_drawdown(per_era_mmc)

# Assemble the performance summary: one row per statistic, one column per metric
summary_columns = {
    "mean": [corr_mean, mmc_mean],
    "std": [corr_std, mmc_std],
    "sharpe": [corr_sharpe, mmc_sharpe],
    "max_drawdown": [corr_max_drawdown, mmc_max_drawdown],
}
summary_df = pd.DataFrame(summary_columns, index=["CORR", "MMC"]).T

# Fetch the live split, read only the selected feature columns, and score them
live_path = f"{DATA_VERSION}/live.parquet"
napi.download_dataset(live_path)
live_features = pd.read_parquet(live_path, columns=feature_set)
live_predictions = model.predict(live_features[feature_set])

# Define prediction pipeline function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Score live rows with the trained model.

    Reads the module-level ``model`` and ``feature_set``.

    Args:
        live_features: Frame containing at least the ``feature_set`` columns.

    Returns:
        A single-column frame named "prediction" that shares the index of
        ``live_features``.
    """
    raw_scores = model.predict(live_features[feature_set])
    return pd.Series(raw_scores, index=live_features.index).to_frame("prediction")

# Serialize the prediction function with cloudpickle and write the bytes to disk
p = cloudpickle.dumps(predict)
with open("hello_numerai.pkl", "wb") as f:
    f.write(p)

# Offer the pickle as a download when running in Google Colab.
# Only the missing-module case is treated as "not in Colab"; a bare `except:`
# would also hide KeyboardInterrupt/SystemExit and unrelated bugs.
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: nothing to download
    pass
else:
    try:
        files.download("hello_numerai.pkl")
    except Exception as exc:
        # Best-effort convenience step: report the failure instead of silently
        # dropping it, but do not fail the run over it
        print(f"Could not trigger Colab download of hello_numerai.pkl: {exc}")