In [1]:
import pandas as pd
import numpy as np
import os

In [None]:
def load_train_and_sup_file(read_path, save_path):
    '''
    Split a train/supplemental price file into one CSV file per security.

    The columns AdjustmentFactor, ExpectedDividend, SupervisionFlag, RowId,
    Open, High, Low and Volume are dropped; the remaining rows are sorted by
    (SecuritiesCode, Date) and written to `<save_path>/<SecuritiesCode>.csv`.

    Input:
        read_path (str) where to read train/supplemental files
        save_path (str) directory where to save the loaded price info for
            every stock (created if it does not exist)
    Output:
        security_code_dic (dictionary) security code -> number of trading days
            (i.e. number of rows written for that security); empty when the
            input file has no rows
    Raises:
        KeyError if any of the dropped columns, 'SecuritiesCode' or 'Date'
            is missing from the input file
    '''
    os.makedirs(save_path, exist_ok=True)

    # NOTE(review): Open/High/Low/Volume are discarded here, while fill_date
    # in this notebook reads Volume and Return_Open/High/Low columns -- confirm
    # which of the two reflects the intended feature set.
    dropped_columns = [
        'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
        'RowId', 'Open', 'High', 'Low', 'Volume',
    ]
    df = (
        pd.read_csv(read_path)
        .drop(columns=dropped_columns)
        .sort_values(['SecuritiesCode', 'Date'])
    )

    security_code_dic = {}
    # groupby keeps the (already date-sorted) row order inside every group, so
    # each per-stock file comes out in chronological order.
    for security_code, stock_df in df.groupby('SecuritiesCode', sort=True):
        security_code_dic[security_code] = len(stock_df)
        stock_df.to_csv(
            os.path.join(save_path, str(security_code) + '.csv'), index=False
        )

    return security_code_dic

In [None]:
def feature_processing(path, security_code_dic):
    '''
    Add previous-close and close-to-close return columns to every stock file.

    For each security code, `<path>/<code>.csv` is read, two columns are added
    and the file is overwritten in place:
        pClose       -- 'Close' shifted down by one row
        Return_Close -- (Close - pClose) / pClose
    Every NaN in the frame (including the first row of the new columns, which
    has no previous close) is then replaced by 0 before saving.

    Input:
        path: (str) folder where the raw data (for each stock) are stored
        security_code_dic: iterable of security codes (e.g. the dictionary
            security code -> number of trading days); only the keys are used
    Output:
        None; the per-stock CSV files are rewritten.
    '''
    for security_code in security_code_dic:
        file_name = os.path.join(path, str(security_code) + '.csv')
        df = pd.read_csv(file_name)
        df['pClose'] = df['Close'].shift(1)
        df['Return_Close'] = (df['Close'] - df['pClose']) / df['pClose']
        df = df.fillna(0)
        df.to_csv(file_name, index=False)

In [None]:
def fill_date(file_name, template_df):
    '''
    Align one stock file to the dates of a template frame and overwrite it.

    The output has exactly the rows of `template_df`. For every template date
    the columns Return_Close, Return_Open, Return_High, Return_Low, Target and
    Volume are taken from the stock file's row with the same 'Date'; dates the
    stock file does not contain get 0 in those columns. Every other column
    keeps the template's values. Rows of the stock file whose date is not in
    the template are ignored, and if the stock file repeats a date the first
    occurrence is used.

    Input:
        file_name: (str) path to the file that need to be processed
        template_df: (dataframe) longest stock info dataframe (not modified)
    Output:
        None; the file at `file_name` is rewritten.
    Raises:
        KeyError if 'Date' or one of the value columns is missing from the
            stock file, or 'Date' is missing from the template
    '''
    value_columns = [
        'Return_Close', 'Return_Open', 'Return_High',
        'Return_Low', 'Target', 'Volume',
    ]
    temp_df = template_df.copy(deep=True)
    df = pd.read_csv(file_name)

    # Look values up by date rather than walking both frames in lockstep, so
    # the result does not depend on row order or on the two frames sharing
    # the same column positions.
    by_date = df.drop_duplicates(subset='Date', keep='first').set_index('Date')
    matched = temp_df['Date'].isin(by_date.index).to_numpy()

    for column in value_columns:
        values = by_date[column].reindex(temp_df['Date']).to_numpy()
        # Only dates absent from the stock file are zeroed; a NaN stored for
        # a date that is present is carried over unchanged.
        temp_df[column] = np.where(matched, values, 0)

    temp_df.to_csv(file_name, index=False)

In [None]:
def fill_date_batch(path, security_code_list, longest_stock):
    '''
    Align every stock file in a folder to the dates of one template stock.

    Input:
        path: (str) folder holding one `<code>.csv` file per security
        security_code_list: iterable of security codes to process
        longest_stock: security code whose file `<path>/<longest_stock>.csv`
            is used as the date template
    Output:
        None; each listed stock file is rewritten by fill_date.
    '''
    # The template is loaded once up front, so rewriting the template stock's
    # own file during the loop does not affect the remaining stocks.
    template_df = pd.read_csv(os.path.join(path, str(longest_stock) + '.csv'))
    for security_code in security_code_list:
        file_name = os.path.join(path, str(security_code) + '.csv')
        fill_date(file_name, template_df)
        

In [None]:
def concate_train_and_sup_files(train_path, sup_path, security_code_dic):
    '''
    Load the train and supplemental CSV of every security.

    Input:
        train_path: (str) prefix prepended directly to '<code>.csv' to locate
            the train file (so a folder must end with a path separator)
        sup_path: (str) same, for the supplemental file
        security_code_dic: iterable of security codes; only the keys are used
    Output:
        None. TODO: both frames are currently read and then discarded --
        nothing is concatenated or written yet.
    '''
    for code in security_code_dic:
        csv_name = str(code) + '.csv'
        train_frame = pd.read_csv(train_path + csv_name)
        sup_frame = pd.read_csv(sup_path + csv_name)
        

In [None]:
def preprocessing()