# 1. Dependencies

In [None]:
# Install Dependencies
!pip install pandas sklearn numerapi

In [None]:
# Import Dependencies
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import numerapi, time, warnings, itertools
from tensorflow.keras.callbacks import ModelCheckpoint, LambdaCallback, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import RMSprop, Adam
from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import to_categorical

# Turn off pandas' chained-assignment check
pd.options.mode.chained_assignment = None

# Suppress every warning category for the rest of the session
warnings.simplefilter('ignore')

## Background Functions

In [None]:
def get_group_stats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add row-wise summary statistics for each feature group

    For every group name, the columns whose name contains that name are
    aggregated per row and stored as ``feature_<group>_mean``,
    ``feature_<group>_std`` and ``feature_<group>_skew``.

    :param df: A Pandas DataFrame holding the feature columns; it is modified in place
    :return: The same DataFrame with the statistic columns added
    """
    for group in ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"]:
        mean_col = f"feature_{group}_mean"
        std_col = f"feature_{group}_std"
        skew_col = f"feature_{group}_skew"
        derived = {mean_col, std_col, skew_col}

        # The derived columns also contain the group name, so they have to be
        # excluded explicitly; otherwise re-running this function on the same
        # frame would feed the previous statistics back into the new ones.
        cols = [col for col in df.columns if group in col and col not in derived]

        df[mean_col] = df[cols].mean(axis=1)
        df[std_col] = df[cols].std(axis=1)
        df[skew_col] = df[cols].skew(axis=1)
    return df


def sharpe_ratio(corrs: pd.Series) -> np.float32:
    """
    Compute the Numerai Sharpe ratio from per-era correlations

    :param corrs: A Pandas Series with one Spearman correlation per era
    :return: The mean of the correlations divided by their standard deviation
    """
    average_corr = corrs.mean()
    corr_spread = corrs.std()
    return average_corr / corr_spread


def evaluate(df: pd.DataFrame, pred_col: str = "prediction") -> tuple:
    """
    Evaluate and display relevant metrics for Numerai

    :param df: A Pandas DataFrame containing the columns "era", "target" and a column for predictions
    :param pred_col: The column where the predictions are stored
    :return: A tuple (spearman, payout, sharpe, mae); every value is rounded to 4 decimals
    """
    # Imported locally because the notebook's import cell does not provide it
    from scipy.stats import spearmanr

    def _score(sub_df: pd.DataFrame) -> np.float32:
        """Calculates Spearman correlation"""
        return spearmanr(sub_df["target"], sub_df[pred_col])[0]

    # Calculate metrics
    # Selecting the two columns explicitly keeps the grouping column out of
    # the frames handed to _score.
    corrs = df.groupby("era")[["target", pred_col]].apply(_score)
    print(corrs)
    # Per-era payout: correlation scaled by 1 / 0.2 and capped to [-1, 1]
    payout_raw = (corrs / 0.2).clip(-1, 1)
    spearman = round(corrs.mean(), 4)

    payout = round(payout_raw.mean(), 4)
    numerai_sharpe = round(sharpe_ratio(corrs), 4)
    # Mean absolute error computed with pandas; round() works whether the
    # mean comes back as a numpy scalar or a plain float.
    mae = round(float((df["target"] - df[pred_col]).abs().mean()), 4)

    # Display metrics
    print(f"Spearman Correlation: {spearman}")
    print(f"Average Payout: {payout}")
    print(f"Sharpe Ratio: {numerai_sharpe}")
    print(f"Mean Absolute Error (MAE): {mae}")
    return spearman, payout, numerai_sharpe, mae

# 2. Numerai Tournament API setup

In [None]:
# The API key pair and the model_id are issued at https://numer.ai/submit
public_id, secret_key, model_id = "INSERT PUBLIC ID", "INSERT SECRET KEY", "INSERT MODEL ID"

# Client authenticated with the key pair above
napi = numerapi.NumerAPI(secret_key=secret_key, public_id=public_id)

# 3. Download Data Sets

### Datasets 
*   `trainingData` is used to train the model
*   `tournamentData` holds the `validation`, `test`, and `live` rows; its `validation` rows are used to evaluate the model

### Column descriptions
*   id: a randomized id that corresponds to a stock 
*   era: a period of time
*   data_type: either `train`, `validation`, `test`, or `live` 
*   feature_*: abstract financial features of the stock 
*   target: abstract measure of stock performance


In [None]:
# Download Training Data From Numerai
# Look up the round before starting the timer so that the reported duration
# covers the dataset download only.
currentRound = numerapi.NumerAPI(verbosity='info').get_current_round()
print(f"[{time.asctime()}] Downloading the latest training data set. Current round is: {currentRound}...\n")
# perf_counter is monotonic, so the elapsed time is not affected by clock adjustments
start = time.perf_counter()
trainingData = pd.read_csv("https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_training_data.csv.xz", header=0)
end = time.perf_counter()
print(f"[{time.asctime()}] Training dataset has been loaded. It took {end - start:0.2f} seconds")

In [None]:
# Download Tournament Data From Numerai
# Look up the round before starting the timer so that the reported duration
# covers the dataset download only.
currentRound = numerapi.NumerAPI(verbosity='info').get_current_round()
print(f"[{time.asctime()}] Downloading the latest tournament data set. Current round is: {currentRound}...\n")
# perf_counter is monotonic, so the elapsed time is not affected by clock adjustments
start = time.perf_counter()
tournamentData = pd.read_csv("https://numerai-public-datasets.s3-us-west-2.amazonaws.com/latest_numerai_tournament_data.csv.xz", header=0)
end = time.perf_counter()
print(f"[{time.asctime()}] Tournament dataset has been loaded. It took {end - start:0.2f} seconds")

# 4. Explore The Dataset

In [None]:
# Print Training Data
# Preview only: the result of head() is shown as the cell output; nothing is stored.
trainingData.head()

In [None]:
# Report the distinct eras and data types found in the training data
trainingEras = trainingData["era"].unique()
trainingTypes = trainingData["data_type"].unique()
print(f'{len(trainingEras)} UNIQUE ERAS: {trainingEras}\n')
print(f'{len(trainingTypes)} UNIQUE DATA TYPE: {trainingTypes}')

In [None]:
# Print Tournament Data
# Preview only: the result of head() is shown as the cell output; nothing is stored.
tournamentData.head()

In [None]:
# Report the distinct data types and eras found in the tournament data
tournamentTypes = tournamentData["data_type"].unique()
tournamentEras = tournamentData["era"].unique()
print(f'{len(tournamentTypes)} UNIQUE DATA TYPE: {tournamentTypes}\n')
print(f'{len(tournamentEras)} UNIQUE ERAS: {tournamentEras}')

In [None]:
# Split the tournament dataset by the value of its `data_type` column
tournamentKind = tournamentData["data_type"]

# Rows labelled 'validation'
validationData = tournamentData.loc[tournamentKind == 'validation']

# Rows labelled 'test'
testData = tournamentData.loc[tournamentKind == 'test']

# Rows labelled 'live'
liveData = tournamentData.loc[tournamentKind == 'live']

In [None]:
# Distinct target values per data type
trainTargets = trainingData.loc[trainingData["data_type"] == 'train', "target"]
tournamentTargets = tournamentData["target"]
tournamentKind = tournamentData["data_type"]
print(f"UNIQUE TRAINING TARGETS: {trainTargets.unique()}")
print(f"UNIQUE VALIDATION TARGETS: {tournamentTargets[tournamentKind == 'validation'].unique()}")
print(f"UNIQUE TEST TARGETS: {tournamentTargets[tournamentKind == 'test'].unique()}")
print(f"UNIQUE LIVE TARGETS: {tournamentTargets[tournamentKind == 'live'].unique()}")

In [None]:
# Plot heatmap of feature correlation
plt.figure(figsize=(30,30))
# Correlate numeric columns only: the frame also carries non-numeric columns
# (id, era, data_type), and corr() raises on those with pandas >= 2.0.
sns.heatmap(trainingData.select_dtypes(include="number").corr())

In [None]:
# Era number = the era label with its first three characters removed, as an integer
trainingData["erano"] = trainingData["era"].str[3:].astype(int)
plt.figure(figsize=[14, 6])
# Number of rows per era
eraSizes = trainingData.groupby("erano")["target"].size()
eraSizes.plot(title="Era sizes", figsize=(14, 8))

In [None]:
# Feature columns are those whose name contains "feature"
feats = [name for name in trainingData.columns if "feature" in name]
plt.figure(figsize=(15, 5))
# Histogram of the per-feature standard deviations in the training data
trainStd = pd.DataFrame(trainingData[feats].std())
sns.histplot(trainStd, bins=100)
plt.legend(["Train"], fontsize=20)
plt.title("Standard deviations over training features in the data", weight='bold', fontsize=20)

In [None]:
# Feature columns are those whose name contains "feature"
feats = [name for name in trainingData.columns if "feature" in name]
plt.figure(figsize=(15, 5))
# Histogram of the per-feature standard deviations in the validation data
valStd = pd.DataFrame(validationData[feats].std())
sns.histplot(valStd, bins=100)
plt.legend(["Val"], fontsize=20)
plt.title("Standard deviations over validation features in the data", weight='bold', fontsize=20)

In [None]:
# Feature columns are those whose name contains "feature"
feats = [name for name in trainingData.columns if "feature" in name]
plt.figure(figsize=(15, 5))
# Histogram of the per-feature standard deviations in the test data
testStd = pd.DataFrame(testData[feats].std())
sns.histplot(testStd, bins=100)
plt.legend(["Test"], fontsize=20)
plt.title("Standard deviations over test features in the data", weight='bold', fontsize=20)

In [None]:
feats = [f for f in trainingData.columns if "feature" in f]
plt.figure(figsize=(15, 5))
# sns.distplot is deprecated; histplot with kde=True and stat="density" is
# its replacement and matches the histplot calls used for the single-dataset
# plots. A Series (not a DataFrame) is passed so that each call is drawn as
# one plain distribution.
for frame in (trainingData, validationData, testData):
    sns.histplot(frame[feats].std(), bins=100, kde=True, stat="density")
plt.legend(["Train", "Val", "Test"], fontsize=20)
plt.title("Standard deviations over all features in the data", weight='bold', fontsize=20)

# 5. Feature Engineering

In [None]:
# Feature Correlation With Target Based On Era

# Extract Unique Eras From Training Data
eras = list(trainingData.era.unique())
# Only feature columns are ranked; this also keeps non-numeric columns out of
# the correlation, which would otherwise raise on pandas >= 2.0.
featureCols = [col for col in trainingData.columns if col.startswith("feature")]
eraList = []
for era in eras:
    eraData = trainingData[trainingData.era==era]

    # Calculate Correlations With Target
    # Each feature is correlated with the target directly; a full
    # feature-by-feature matrix is not needed for this ranking.
    corrWithTarget = eraData[featureCols].corrwith(eraData["target"]).abs().sort_values(ascending=False)

    # Select Features With Highest Correlation To The Target Variable
    # (the 19 strongest features of the era)
    features = corrWithTarget[:19]

    # Column names of the selected features, usable for indexing the DataFrame.
    # After the loop this holds the selection of the last era.
    featureList = features.index.tolist()
    # Correlation magnitudes of the selected features, one list per era
    eraList.append(features.tolist())

    # # Write To A File
    # with open("Correlations.txt",'a') as f:
    #     f.write(f"Top 10 Features in {era} according to correlation with target:\n")
    #     f.write(f'{features[:10]}\n\n')

In [None]:
eraCorrList, topEras = [], []

for (era, corrs) in zip(eras, eraList):
    # An era without correlation values has no average and cannot be ranked
    if not corrs:
        continue
    # Find Correlation Average Based On Era
    corrAVG = sum(corrs) / len(corrs)
    eraCorrList.append([era, corrAVG])

# Sort Era Correlation List By Correlation Average
eraCorrList.sort(key=lambda entry: entry[1], reverse=True)

# Select The Top Correlated Eras (at most 20)
topEras = [era for era, _ in eraCorrList[:20]]

In [None]:
# Create New Training Data Set
headers = [h for h in trainingData.columns]
dataFrameList = [pd.DataFrame(columns=headers)]
for eras in topEras:
    df = trainingData[trainingData.era==eras]]
    dataFrameList.append(df)
# eras = [era for era in topEras]
# print(f"[{time.asctime()}] Creating new training data set based on top correlated features.")
# trainingDataEng = pd.concat([trainingDataEng,trainingData[trainingData.era==eras]],axis=1)
# print(f"[{time.asctime()}] Finished creating new training data set based on top correlated features.")

In [None]:
# Show the reduced training set as the cell output
trainingDataEng

In [None]:
interactions = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# NOTE(review): `featureList` has to hold column names of trainingData here —
# confirm against the cell that builds it.
interactionValues = interactions.fit_transform(trainingData[featureList])

# Name every output column after the input features it multiplies, so the
# combined frame has string column names only. `powers_` has one row per
# output column and one exponent per input feature.
interactionNames = [
    " ".join(name for name, power in zip(featureList, powers) if power)
    for powers in interactions.powers_
]
# Reuse the training index so that the column-wise concat below lines up row by row
X_train_interact = pd.DataFrame(interactionValues, columns=interactionNames, index=trainingData.index)
# The degree-1 outputs repeat the input columns that trainingData already has;
# keep the pairwise products only.
X_train_interact = X_train_interact.drop(columns=featureList)

train=pd.concat([trainingData,X_train_interact],axis=1)
train

In [None]:
# Select Features From Training Data
# The feature values are used as they are. to_categorical one-hot encodes
# integer class labels; applied to this matrix it would cast the values to
# int and add a dimension, which the regressor cannot be fitted on.
trainingDataX = trainingDataEng[trainingDataEng.columns[trainingDataEng.columns.str.startswith('feature')]]

# Select Targets From Training Data
# Taken from the same frame as the features so both have the same rows
trainingDataY = trainingDataEng[trainingDataEng.columns[trainingDataEng.columns.str.startswith('target')]]

# Convert to numpy arrays
trainingDataX = np.array(trainingDataX)
trainingDataY = np.array(trainingDataY)

In [None]:
# Split Up Data
# A fixed seed makes the split repeatable between runs, like the seeded model.
# NOTE(review): a random row split can place rows of one era on both sides;
# a grouped split on `era` (GroupKFold is already imported) would avoid that —
# confirm whether it matters here.
xTrain, xTest, yTrain, yTest = train_test_split(trainingDataX, trainingDataY, test_size = 0.25, random_state = 42)

# 6. Train Model

In [None]:
# Random forest regressor with 1000 trees and a fixed seed
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
    n_jobs=-2,
    verbose=1,
)

# Fit on the training split; the targets are passed as a 1-D array
rf.fit(xTrain, yTrain.ravel())

# 7. Validation

In [None]:
# Validation inputs: every column whose name starts with 'feature', as a
# plain numeric array. No to_categorical here: it one-hot encodes integer
# class labels and would distort the feature matrix.
validationDataX = validationData[validationData.columns[validationData.columns.str.startswith('feature')]].to_numpy()

# Validation targets: every column whose name starts with 'target'
validationDataY = validationData[validationData.columns[validationData.columns.str.startswith('target')]]
validationDataX

In [None]:
# Predict with the fitted forest on the validation inputs
predictions = rf.predict(validationDataX)

In [None]:
# Show the validation targets as a flat array
validationDataY.values.flatten()

In [None]:
# Show the predictions as the cell output
predictions

# 8. Generate Predictions

In [None]:
# Performance metrics
# Flatten the targets so they line up element-wise with the predictions
validationTargets = validationDataY.values.flatten()
errors = abs(predictions - validationTargets)
print('Metrics for Random Forest Trained on Expanded Data')
print('Average absolute error:', round(np.mean(errors), 4))
# A percentage error (MAPE) is not reported: it divides by the target, so any
# target equal to 0 would make it undefined.
# Rank correlation between predictions and targets inside each era
scored = pd.DataFrame({
    "era": validationData["era"].values,
    "target": validationTargets,
    "prediction": predictions,
})
eraCorrs = scored.groupby("era")[["target", "prediction"]].apply(
    lambda eraRows: eraRows["prediction"].corr(eraRows["target"], method="spearman")
)
print('Mean per-era Spearman correlation:', round(eraCorrs.mean(), 4))

In [None]:
# predictions must have an `id` column and a `prediction_kazutsugi` column
# Every tournament row is scored here so that each id gets exactly one
# prediction; the validation predictions cover the validation rows only.
tournamentFeatures = tournamentData[tournamentData.columns[tournamentData.columns.str.startswith('feature')]]
predictions_df = tournamentData["id"].to_frame()
predictions_df["prediction_kazutsugi"] = rf.predict(tournamentFeatures.to_numpy())
predictions_df.head()

In [None]:
# Write the predictions to a CSV file, then upload that file for the configured model
submissionPath = "predictions.csv"
predictions_df.to_csv(submissionPath, index=False)
submission_id = napi.upload_predictions(submissionPath, model_id=model_id)

# 9. Works Cited
- https://tit-btcqash.medium.com/a-comprehensive-guide-to-competing-at-numerai-70b356edbe07
- https://towardsdatascience.com/random-forest-in-python-24d0893d51c0
- https://realpython.com/python-timer/
- https://www.kaggle.com/carlolepelaars/how-to-get-started-with-numerai
- https://medium.com/machine-learning-in-practice/cheat-sheet-of-machine-learning-and-python-and-math-cheat-sheets-a4afe4e791b6
- https://towardsdatascience.com/a-guide-to-the-hardest-data-science-tournament-on-the-planet-748f46e83690
- https://towardsdatascience.com/improving-random-forest-in-python-part-1-893916666cd
- https://www.geeksforgeeks.org/python-flatten-a-2d-numpy-array-into-1d-array/
- https://docs.numer.ai/tournament/learn
- https://forum.numer.ai/t/advice-from-the-kaggle-which-ive-found-very-useful/300
- https://forum.numer.ai/t/model-diagnostics-feature-exposure/899
- https://towardsdatascience.com/data-correlation-can-make-or-break-your-machine-learning-project-82ee11039cc9
- https://www.geeksforgeeks.org/python-sort-list-of-list-by-specified-index/