In [1]:
# importing required packages
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error as mape
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})
from tqdm import tqdm
from sklearn.metrics import mean_absolute_percentage_error as mape
import numpy as np
import warnings
import statistics
import pickle

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [2]:
# import train and test data
test = pd.read_csv('model_data/model_test.csv')
train = pd.read_csv('model_data/model_train.csv')
prophet_predictions = pd.read_csv('prophet_predictions/predictions.csv')

In [3]:
# stocks that did not work for modelling
banned_stocks = [9755, 4686, 6058, 6502, 6815]
# test data with out these stocks
test_wo_errors = test.query('SecuritiesCode not in @banned_stocks')

In [4]:
# list of industries models already exist for
industries_with_models = ["ENERGY RESOURCES ", "PHARMACEUTICAL ", "BANKS ", "FINANCIALS （EX BANKS） ", "COMMERCIAL & WHOLESALE TRADE ", "CONSTRUCTION & MATERIALS ", "ELECTRIC POWER & GAS ", "AUTOMOBILES & TRANSPORTATION EQUIPMENT ", "IT & SERVICES, OTHERS ", "RETAIL TRADE ", "REAL ESTATE ", "FOODS ", "ELECTRIC APPLIANCES & PRECISION INSTRUMENTS ", "MACHINERY ", "STEEL & NONFERROUS METALS ", "RAW MATERIALS & CHEMICALS ", "TRANSPORTATION & LOGISTICS "]

In [5]:
def final(test, amount, number):
    '''
    Input: Testdata, amount of money to invest, number of stocks to buy/short
    Output: Performance of an equally weighted ETF, Performance of the Stocks recommended by the model
    '''
    
    # determine number of days to predict
    days = test.Date.nunique()
    
    # calling make_predictions function
    predictions_df = make_predictions(test, days)

    # calling eval function
    eval(test, predictions_df, amount, number)
    

In [6]:
def create_train_test(df, split):


   '''
   Input: DataFrame of stock data, split of test data
   Output: Train data, test data
   '''

    # creating two empty dataframes for train and test data
    train = pd.DataFrame()
    test = pd.DataFrame()

        # looping through SecurityCodes
    for i in df.SecuritiesCode.unique():
        # making two querys for the code with test being the length of split
        train_query = df.query('SecuritiesCode == @i')[:-split]
        test_query = df.query('SecuritiesCode == @i')[-split:]

        # adding the querys to the dataframes
        train = pd.concat([train, train_query])
        test = pd.concat([test, test_query])

        # returning training and test data frames
    return train, test

IndentationError: unexpected indent (2696066977.py, line 10)

In [7]:
def get_performance(df, model):

    '''Input: data of one stock
       Output: performance of the stock in the given timeframe
       '''
    # getting performance for predicted values
    if model == True:
        # making new list for different performances
        performances = []

        # getting starting and end price, calculating performance of the lower predictions and adding it to list
        start_lower = df.ad_Close_lower.to_list()[0]
        end_lower = df.ad_Close_lower.to_list()[-1]
        performance_lower = (end_lower - start_lower) / start_lower
        performances.append(performance_lower)

        # getting starting and end price, calculating performance of the midlle predictions and adding it to list
        start_mid = df.ad_Close.to_list()[0]
        end_mid = df.ad_Close.to_list()[-1]
        performance_mid = (end_mid - start_mid) / start_mid
        performances.append(performance_mid)

        # getting starting and end price, calculating performance of the upper predictions and adding it to list
        start_upper = df.ad_Close_upper.to_list()[0]
        end_upper = df.ad_Close_upper.to_list()[-1]
        performance_upper = (end_upper - start_upper) / start_upper
        performances.append(performance_upper)

        # calculating mean performance
        performance_mean = np.mean(performances)

        # returning mean performance and performance of middle predictions
        return performance_lower, performance_mid
    
    # calculating return for actual values
    elif model == False:

        # getting start and end price, calculating performance and returning it
        start = df.ad_Close.to_list()[0]
        end = df.ad_Close.to_list()[-1]
        performance = (end - start) / start

        return performance

In [8]:
def etf_performance(test, predictions, amount):

    '''Input: test data, predictions of the stocks, amount of money to invest
    Output: Percentage and absolute return of an equally weighted ETF of all stocks that were predicted'''

    # making list for performances
    performances = []

    # looping through stocks
    for i in predictions.SecuritiesCode.unique():
        
        # making query of the test data for one stock, sorting it by Date, calling performance function and adding it to list
        query = test.query('SecuritiesCode == @i').reset_index()
        query = query.sort_values('Date')
        performance = get_performance(query, False)
        performances.append(performance)

    # calculating mean return stocks and absolute return given invested amount
    pct_return = np.mean(performances)
    abs_return = amount * pct_return

    # returning percentage and absolute return of ETF containing all stocks in test data
    return pct_return, abs_return

In [9]:
def model_performance(test, predictions, amount, number):

    '''Input: test data, predictions of stocks, amount of money to invest, number of stocks to buy/ short
    Ouput: percentage and absolute return of stocks recommended to buy/short by the model in the given timeframe'''

    # creating dataframe for predicted performances
    df_performances = pd.DataFrame()
 
    # looping through stocks in test data
    for i in test.SecuritiesCode.unique():
        
        # making query of predictions for one stock, getting performances and adding stock and performances to dataframe
        query = predictions.query('SecuritiesCode == @i')
        performance_mid, performance_mean = get_performance(query, True)
        df_code = pd.DataFrame({'SecuritiesCode': [i], 'performance_mid': [performance_mid],  'performance_mean': [performance_mean]})
        df_performances = pd.concat([df_performances, df_code])

    # saving dataframe
    df_performances.to_csv('prophet_predictions/performance_predictions.csv')

    # getting n best and worst stocks according to predictions by sorting dataframe by performance
    best = df_performances.sort_values('performance_mean', ascending=False).SecuritiesCode.to_list()[:number]
    worst = df_performances.sort_values('performance_mean', ascending=False).SecuritiesCode.to_list()[-number :]

    # making list for actual performances of recommended stocks
    chosen = []

    # looping through best stocks according to predictions
    for i in best:

        # making query of the stock from test data, getting actual performance and adding it to list
        query = test.query('SecuritiesCode == @i').reset_index()
        query = query.sort_values('Date')
        performance = get_performance(query, False)
        chosen.append(performance)
    
    # looping through worst stocks according to predictions
    for i in worst:

        # making query of the stock from test data, getting actual performance, multiplying it with -1 (return is inverse to performance when shorting) and adding it to list
        query = test.query('SecuritiesCode == @i').reset_index()
        query = query.sort_values('Date')
        performance = get_performance(query,False)
        short = performance * -1
        chosen.append(short)

    # calculating return of basket of recommended stocks and absolute return given invested amount
    pct_return = np.mean(chosen)
    abs_return = amount * pct_return

    # returning percentage and absolute return
    return pct_return, abs_return

In [10]:
def eval(test, predictions, amount, number):

    '''Input: test data of stocks, predictions of stocks, amount of money to invest, number of stocks to buy/short
    Output: Percentage and absolute return of an equally weighted ETF of all predicted stocks and of the stocks recommended by the model and the difference between ETF and model'''
    
    # getting returns of model and ETF, calculating starting and end value of invested money
    pct_model, abs_model = model_performance(test, predictions, amount, number)
    model_value = amount + abs_model
    pct_etf, abs_etf = etf_performance(test, predictions, amount)
    etf_value = amount + abs_etf

    # calculating difference of model and ETF
    pct_difference = (pct_model - pct_etf) / pct_etf * 100
    abs_difference = abs_model - abs_etf
    
    # printing results
    print('Model Performance: ', pct_model*100)
    print('entry_value: ', amount, 'end_value: ', model_value)

    print('ETF Performance: ', pct_etf*100)
    print('entry_value: ', amount, 'end_value: ', etf_value)

    print('The percentage difference is ', pct_difference , '%')
    print('The absolute difference is ', abs_difference, ' JPY')

In [11]:
def list_mean(a, b):

    '''Input: two lists
    Output: mean of the two lists'''

    result = [(g + h) / 2 for g, h in zip(a, b)]
    return result

In [12]:
def prophet(train, c_range, interval, scale):

    '''training data on one stock, different hyperparamteters
    Output: mape of the model, model'''
    
    # getting columns and from training data and renaming them
    train = train[['Date', 'ad_Close']]
    train.columns = ['ds', 'y']

    # creating model and fitting it to train data
    m = Prophet(growth='linear', changepoint_range= c_range, interval_width=interval, changepoint_prior_scale=scale,  uncertainty_samples=1000, yearly_seasonality=False, weekly_seasonality=False, daily_seasonality=False).add_seasonality(name = 'yearly', period=245, fourier_order=12).add_seasonality(name='monthly', period=24, fourier_order=4).add_seasonality(name='weekly', period=5, fourier_order=5)
    model = m.fit(train, verbose=0)
    
    # creating dataframe of predictions for only train data
    future = model.make_future_dataframe(periods=0)
    forecast = model.predict(future)
    
    # getting the middle prediction
    mid = forecast.yhat.to_list()
   
    # making sure predictions and actual values have same length
    if len(mid) == len(train.y):
        mape_mid = mape(train.y, mid)
        
        # returning mape of model and model
        return mape_mid, model

In [13]:
def grid_search(train, c_range_grid, interval_grid, scale_grid):

    '''Input: training data on different stocks, grids on different hyperparameters of FB prophet
    Ouput: model for each stock, dataframe of each stock and score of the model'''

    # setting grids for hyperparameters
    a = c_range_grid
    b = interval_grid
    c = scale_grid

    # creating data frame and list for results
    paths_df = pd.DataFrame()
    results = []

    # looping through stocks
    for i in tqdm(train.SecuritiesCode.unique()):

        # creating train data for stock
        code_train = train.query('SecuritiesCode == @i')
        
        # making lists for scores and models of stock
        scores = []
        models = []

        # looping through parameter values
        for s in a:
            for m in b:
                for x in c:

                    # setting parameter values
                    c_range = s
                    interval = m
                    scale = x
                    
                    # getting score and model by calling prophet function
                    score, model = prophet(code_train, s, m, x)

                    # adding score and model to lists
                    scores.append(score)
                    models.append(model)

        # getting best score and adding it to list, its index and best model through the index
        best_score = np.min(scores)
        results.append(best_score)
        indx = scores.index(best_score)
        best_model = models[indx]

        # saving model
        model_name = 'models/prophet_' + str(i) + '.pkl'
        pickle.dump(best_model, open(model_name, 'wb'))
           
        # adding results to dataframe   
        df_code = pd.DataFrame()
        df_code['SecuritiesCode'] = [i]
        df_code['score'] = [best_score]
        paths_df = pd.concat([paths_df, df_code])
    
    result = np.mean(results)
    paths_df = paths_df.reset_index()

    # returning dataframe
    return paths_df

In [14]:
def get_prediction(code, days):

    '''Input: SecuritiesCode of a stock, number of days to be predicted
    Output: Predictions of the given timeframe'''

    # loading model for stock
    model = pickle.load(open(f'models/prophet_{code}.pkl', 'rb'))

    # making dataframe with predictions for certain timeframe
    future = model.make_future_dataframe(periods=days)
    forecast = model.predict(future)

    # getting different predictions from dataframe
    lower = forecast.yhat_lower.tail(days).to_list()
    upper = forecast.yhat_upper.tail(days).to_list()
    mid = forecast.yhat.tail(days).to_list()
    mid_lower = list_mean(lower, mid)
    mid_upper = list_mean(upper, mid)

    # returning predictions
    return lower, mid, upper
    

In [15]:
def make_predictions(test, days):

    '''Input: test data, number of days to be predicted
    Output: Dataframe of predictions of all stocks in the test data'''

    # creating dataframe for predictions and date index
    predictions_df = pd.DataFrame()
    date_index = test.Date.unique()

    # looping through stocks
    for i in test.SecuritiesCode.unique():

        # getting predictions by calling get_prediction function
        lower, mid, upper = get_prediction(i, days)

        # making predictions dataframe for stock and adding it to large dataframe
        df_code = pd.DataFrame({'Date': date_index, 'SecuritiesCode': i, 'ad_Close_lower': lower, 'ad_Close': mid, 'ad_Close_upper': upper})
        predictions_df = pd.concat([predictions_df, df_code])

        # saving dataframe
        predictions_df.to_csv('prophet_predictions/predictions.csv')

    # returning dataframe
    return predictions_df


In [22]:
def model_mape(test, predictions):
    
    # creating dataframe for performances
    mape_performance = pd.DataFrame()

    # looping through stocks
    for i in test.SecuritiesCode.unique():

        # getting test data and predictions for stock
        test_query = test.query('SecuritiesCode == @i')
        pred_query = predictions.query('SecuritiesCode == @i')

        # getting price columns
        test_price = test_query.ad_Close.to_list()
        pred_price_mid = pred_query.ad_Close.to_list()

        # calculating mape scores over different time periods
        mape_24 = mape(test_price, pred_price_mid)
        mape_10 = mape(test_price[:10], pred_price_mid[:10])
        mape_3 = mape(test_price[:3], pred_price_mid[:3])
        mape_1 = mape(test_price[:1], pred_price_mid[:1])
        mape_end = mape(test_price[-1:], pred_price_mid[-1:])

        # creating performance dataframe for stock and adding it to large dataframe
        df_code = pd.DataFrame({'SecuritiesCode': [i], 'mape_24': [mape_24], 'mape_10': [mape_10], 'mape_3': [mape_3], 'mape_1': [mape_1], 'mape_end': mape_end})
        mape_performance = pd.concat([mape_performance, df_code])

    # returning dataframe
    return mape_performance

In [17]:
def make_models(train, industry, c_range_grid, interval_grid, scale_grid):

    '''Input: train data of several stocks, industry of the stocks to be modelled, grids on different hyperparameters of FB prophet
    Ouput: models for each stock in the industry, dataframe of each stock and the score of the model'''

    # getting train data for stocks of an industry
    train_industry = train.query('Sector == @industry')

    # getting result dataframe by calling grid_search function and saving it
    paths_df = grid_search(train_industry, c_range_grid, interval_grid, scale_grid)
    paths_df.to_csv(f'paths/{industry}.csv')

In [18]:
eval(test_wo_errors, prophet_predictions, 10000, 100)

Model Performance:  0.6372685702894354
entry_value:  10000 end_value:  10063.726857028943
ETF Performance:  -3.9961831885378696
entry_value:  10000 end_value:  9600.381681146213
The percentage difference is  -115.94693086436311 %
The absolute difference is  463.3451758827305  JPY
