In [1]:
import math
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.regularizers import L2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import MeanAbsoluteError, MeanAbsolutePercentageError, RootMeanSquaredError, MeanSquaredError

import util
import tensorflow.keras.backend as kb

In [23]:
def generate_dummy_dates(num_days, num_stocks, start = '12/06/2021'):
    '''
    Build a placeholder 'Date' column with every day repeated once per stock.

    Input:
        num_days: (int) number of consecutive calendar days to generate
        num_stocks: (int) how many times each day is repeated
        start: (str) first day, in any format pd.date_range accepts
    Output:
        dates: (dataframe) (num_days*num_stocks, 1) with a single 'Date'
            column; rows are grouped by day, i.e. the first num_stocks rows
            all carry the first day, the next num_stocks rows the second day,
            and so on
    '''
    date_index = pd.date_range(start, periods=num_days)
    # Repeat each day in one vectorised step; growing a Python list with
    # `dates = dates + [...]` re-copies the whole list on every iteration.
    return pd.DataFrame({'Date': date_index.repeat(num_stocks)})

In [24]:
# Placeholder calendar for the submission: 118 daily dates, each repeated 2000 times.
dummy_dates = generate_dummy_dates(num_days=118, num_stocks=2000, start='12/06/2021')

In [2]:
def run_MLP(xfile, yfile, sec_code_file, window=5):
    '''
    Train an MLP on one sector's data and predict the targets of its test rows.

    Input:
        xfile: (str) path to dataset input feature
        yfile: (str) path to dataset target
        sec_code_file: (str) path to the file where stock security numbers are stored
        window: (int) length of time window
    Output:
        yPr: (np array) test days*number of stocks
        sec_code_list: (np array of str) security numbers of the stocks
    '''
    offset = window-1
    xTr, yTr, sec_code_list = util.dataloader(xfile, yfile, sec_code_file)
    num_stock = len(sec_code_list[0])

    # The moving averages are computed on the raw (un-windowed) features, so
    # they are shifted by `offset` rows below to line up with the windowed
    # samples -- presumably FormTimeWindow drops the first window-1 rows;
    # confirm against util.
    ema5Tr, ema10Tr, ema20Tr = util.MovingAverage(xTr)
    xTr, yTr, xTe, yTe = util.FormTimeWindow(xTr, yTr, window)

    # Take the train/test boundary from what FormTimeWindow actually returned
    # instead of repeating its split size as a literal here.
    n_train = xTr.shape[0]
    train_rows = slice(offset, offset + n_train)
    test_rows = slice(offset + n_train, None)
    xTr = np.concatenate((xTr, ema5Tr[train_rows], ema10Tr[train_rows], ema20Tr[train_rows]), axis=1)
    xTe = np.concatenate((xTe, ema5Tr[test_rows], ema10Tr[test_rows], ema20Tr[test_rows]), axis=1)

    print(xTr.shape, yTr.shape, xTe.shape, yTe.shape, sec_code_list.shape)

    model = util.mlp_model(input_dims=xTr.shape[1],
                    output_dims=yTr.shape[1],
                    n_layers=10,
                    first_layer_nodes=min(num_stock*4*window,10000),
                    last_layer_nodes=num_stock*2,
                    activation_func='relu',
                    loss_func='binary_crossentropy')

    # The first 960 training rows are fitted on; the remaining rows are used
    # as the validation set.
    n_fit = 960
    model.fit(xTr[:n_fit], yTr[:n_fit], batch_size = 50, epochs = 2, verbose = 1,
              validation_data = (xTr[n_fit:], yTr[n_fit:]), shuffle=True)
    yPr = model.predict(xTe)

    # Reset Keras' global state before the caller builds the next model.
    kb.clear_session()
    return yPr, sec_code_list[0]

In [25]:
def submission_formatting(test_dates,rank):
    '''
    Attach the ranked security codes and their rank positions to the dates.

    Input:
        test_dates: (dataframe) (number of stocks*number of testing days,1);
            rows are filled positionally, one testing day after another
        rank: (np array of str) (number of testing days, number of stocks);
            rank[d][k] is the security code placed at position k on day d
    Output:
        submission: (dataframe) (number of stocks*number of testing days,3)
            submission['Date']
            submission['SecuritiesCode']
            submission['Rank']
        The result is a new dataframe; test_dates itself is left unchanged.
    Raises:
        ValueError: if rank does not hold exactly one entry per row of
            test_dates
    '''
    SecuritiesCode = []
    Rank = []
    for rank_of_day in rank:
        codes_of_day = np.asarray(rank_of_day).tolist()
        SecuritiesCode.extend(codes_of_day)
        # Positions restart at 0 every day and follow the width of the
        # ranking itself rather than a fixed stock count.
        Rank.extend(range(len(codes_of_day)))

    if len(SecuritiesCode) != len(test_dates):
        raise ValueError(
            'rank holds %d entries but test_dates has %d rows'
            % (len(SecuritiesCode), len(test_dates))
        )

    # Work on a copy so the caller's dataframe does not silently gain columns.
    submission = test_dates.copy()
    submission['SecuritiesCode'] = SecuritiesCode
    submission['Rank'] = Rank

    return submission

In [3]:
if __name__ == "__main__":

    # Input locations, relative to the working directory.
    path = ""
    path_sec = "sector mapping/"
    # Sector identifiers '1' through '17'.
    sectors = [str(number) for number in range(1, 18)]

    # Per-sector results are gathered here and joined column-wise after the loop.
    output_parts, rest_parts = [], []
    for sec in sectors:
        xfile = f"{path}train_x/sector_x_{sec}.csv"
        yfile = f"{path}train_y/sector_y_{sec}.csv"
        sec_code_file = f"{path_sec}{sec}.csv"
        yPr, sec_code_list = run_MLP(xfile, yfile, sec_code_file, window=5)

        output, rest = util.FindPair(yPr, sec_code_list)
        output_parts.append(output)
        # FindPair may return None as its second value; such sectors add nothing here.
        if rest is not None:
            rest_parts.append(rest)

    relt = np.concatenate(output_parts, axis=1)
    if not rest_parts:
        rest_list = None
    elif len(rest_parts) == 1:
        rest_list = rest_parts[0]
    else:
        rest_list = np.concatenate(rest_parts, axis=1)

    rank = util.WholeRank(relt, rest_list)
    rank_df = pd.DataFrame(rank)
    rank_df.to_csv('test_result_all_sec_Jun3.csv', index=False, header=False)

(1202, 94) (1202, 94)
(1080, 752) (1080, 94) (118, 752) (118, 94) (1, 94)
[1880, 1692, 1504, 1316, 1128, 940, 752, 564, 376, 188]
Epoch 1/2
Epoch 2/2
(118, 94) 94
['1301' '1332' '1333' '1375' '1376' '1377' '1379' '1381' '2001' '2002'
 '2003' '2004' '2009' '2053' '2060' '2108' '2109' '2114' '2117' '2201'
 '2204' '2206' '2207' '2208' '2209' '2211' '2212' '2217' '2220' '2221'
 '2222' '2226' '2229' '2264' '2266' '2267' '2268' '2269' '2270' '2281'
 '2282' '2288' '2292' '2294' '2296' '2501' '2502' '2503' '2531' '2533'
 '2540' '2573' '2579' '2587' '2588' '2590' '2593' '2594' '2602' '2607'
 '2612' '2613' '2801' '2802' '2804' '2805' '2806' '2809' '2810' '2811'
 '2814' '2815' '2819' '2830' '2831' '2871' '2875' '2882' '2884' '2897'
 '2899' '2904' '2908' '2910' '2914' '2915' '2918' '2922' '2923' '2925'
 '2929' '2931' '2932' '4526']
(1202, 14) (1202, 14)
(1080, 112) (1080, 14) (118, 112) (118, 14) (1, 14)
[280, 252, 224, 196, 168, 140, 112, 84, 56, 28]
Epoch 1/2
Epoch 2/2
(118, 14) 14
['1515' '1518

In [6]:
# Sanity check: number of rows in `rank` (one per testing day).
print(len(rank))

118


In [26]:
# Combine the placeholder dates with the ranking to form the submission table.
submission = submission_formatting(test_dates=dummy_dates, rank=rank)

In [27]:
# Write the submission table to disk; the dataframe index is not written.
submission.to_csv('submission_0604.csv', index = False)