In [None]:
import os, sys
sys.path.append('../src/')
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import data.dataset as dtst
import features.preprocessing as prep
import visualization.visualize as vis

# Seed NumPy's global random generator once, up front, so that the random
# draws made further down (via np.random.*) start from a fixed state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# Folder holding one interim CSV per asset.
stocks_interim_folderpath = '../data/interim/stocks'

# Number of distinct files to draw for the training split.
n_files_train = 4000

# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# listing is sorted first: otherwise the same seed could select different files
# on another machine or after the folder changes on disk.
available_files = sorted(os.listdir(stocks_interim_folderpath))
if n_files_train > len(available_files):
    raise ValueError(
        f"n_files_train={n_files_train} exceeds the {len(available_files)} files "
        f"found in {stocks_interim_folderpath!r}"
    )

# replace=False: np.random.choice samples WITH replacement by default, which
# would put duplicate file names in the selection and leave fewer than
# n_files_train distinct training assets.
# NOTE: this draw consumes the global NumPy random state, so re-running the cell
# without re-seeding yields a different selection.
selected_files = np.random.choice(available_files, size=n_files_train, replace=False)

# Small fixed selection, handy for quick debugging runs:
# selected_files = np.array(['CAAP.csv', 'UBG.csv', 'TM.csv', 'THFF.csv', 'NREF.csv', 'LEE.csv',
#                            'WRLSU.csv', 'AU.csv', 'TTM.csv', 'RBCAA.csv'])

# selected_files

In [None]:
# Load the files picked for the training split. The result is used below as a
# mapping (it exposes .keys() and is indexed by key); the exact return type of
# dtst.load_rawdata is not visible here.
stocks_data_train = dtst.load_rawdata(
    stocks_interim_folderpath,
    selected_files=selected_files,
)
n_assets_train = len(stocks_data_train)
print("Number of assets:", n_assets_train)
# Last expression of the cell: displays the loaded keys.
stocks_data_train.keys()

In [None]:
# Preview the entry stored under the first key of the training mapping.
first_train_key = list(stocks_data_train.keys())[0]
stocks_data_train[first_train_key]

In [None]:
# Plot the training assets with the project's timeline helper.
# subplotsize is forwarded as-is; presumably (width, height) per subplot — TODO confirm
# against vis.plot_assets_timeline.
timeline_subplotsize = (14, 2)
vis.plot_assets_timeline(stocks_data_train, subplotsize=timeline_subplotsize)

In [None]:
# Every file in the interim folder that was NOT drawn for training is kept
# for the test split.
all_interim_files = np.array(os.listdir(stocks_interim_folderpath))
not_in_train = np.isin(all_interim_files, selected_files, invert=True)
un_selected_files = all_interim_files[not_in_train]
# un_selected_files = un_selected_files[0:50]

In [None]:
# Load the remaining (non-training) files as the test split, using the same
# loader as for the training split.
stocks_data_test = dtst.load_rawdata(
    stocks_interim_folderpath,
    selected_files=un_selected_files,
)
n_assets_test = len(stocks_data_test)
print("Number of assets:", n_assets_test)
# Last expression of the cell: displays the loaded keys.
stocks_data_test.keys()

# Labelling the stock trend

In [None]:
# Settings shared by the labelling cells below:
#   price_ref_col -- column name handed to dtst.trend_labeling as the price reference
#   n_samples_min -- an asset is kept only if it has MORE rows than this
price_ref_col, n_samples_min = 'Close', 3000

In [None]:
# Labelling the train dataset:
# Assets with more than n_samples_min rows get a 'y_target' column (output of
# dtst.trend_labeling) and a 'ticker' column holding their key; all other
# assets are removed from the mapping.
# The keys are snapshotted first because entries are deleted while looping.
stocks_data_train_keys = list(stocks_data_train.keys())
for ticker in stocks_data_train_keys:
    asset_df = stocks_data_train[ticker]
    if not asset_df.shape[0] > n_samples_min:
        # Too short: drop the asset and move on.
        del stocks_data_train[ticker]
        continue
    # NOTE(review): plot_results=True presumably renders one figure per asset,
    # which can mean thousands of figures for this split — confirm this is intended.
    asset_df['y_target'] = dtst.trend_labeling(
        asset_df, price_ref_col, ticker, figsize=(14, 8), plot_results=True
    )
    asset_df['ticker'] = ticker
    stocks_data_train[ticker] = asset_df

In [None]:
# Labelling the test dataset:
# Same procedure as for the train split, but with plotting disabled: assets with
# more than n_samples_min rows get 'y_target' and 'ticker' columns, the rest
# are removed from the mapping.
# The keys are snapshotted first because entries are deleted while looping.
stocks_data_test_keys = list(stocks_data_test.keys())
for ticker in stocks_data_test_keys:
    asset_df = stocks_data_test[ticker]
    if not asset_df.shape[0] > n_samples_min:
        # Too short: drop the asset and move on.
        del stocks_data_test[ticker]
        continue
    asset_df['y_target'] = dtst.trend_labeling(
        asset_df, price_ref_col, ticker, figsize=(14, 8), plot_results=False
    )
    asset_df['ticker'] = ticker
    stocks_data_test[ticker] = asset_df

In [None]:
# Saving the processed data:
# Each labelled asset is written to <processed>/<split>/<key>.csv.
stocks_processed_folderpath = '../data/processed'
for split_name, split_data in (('train', stocks_data_train), ('test', stocks_data_test)):
    split_folderpath = os.path.join(stocks_processed_folderpath, split_name)
    # to_csv does not create missing parent folders, so make sure the split
    # folder exists (a fresh checkout would otherwise fail here).
    os.makedirs(split_folderpath, exist_ok=True)
    for key, data in split_data.items():
        # NOTE(review): index=False drops the frame's index from the CSV; if the
        # dates live in the index they will not be saved — confirm.
        data.to_csv(os.path.join(split_folderpath, f'{key}.csv'), index=False)
# NOTE(review): files written by a previous run are not removed, so a ticker
# that changed split between runs may exist in both folders — confirm whether
# the folders should be cleaned first.

In [None]:
# Report how many assets remain in each split after labelling/filtering.
for split_label, split_data in (
    ("Train data samples:", stocks_data_train),
    ("Test data samples:", stocks_data_test),
):
    print(split_label, len(split_data))