In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime

%config IPCompleter.greedy=True 
%matplotlib inline  

In [31]:
class display(object):
    """Show several evaluated expressions next to each other.

    Every positional argument is a string holding a Python expression.
    The string is evaluated with ``eval`` whenever a representation is
    requested, and the expression text itself is used as the caption of
    the rendered result.
    """

    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Only the expression strings are stored; evaluation is deferred
        # until one of the repr methods runs.
        self.args = args

    def _repr_html_(self):
        """Return one floated HTML panel per expression, newline-joined."""
        panels = (
            self.template.format(a, eval(a)._repr_html_())
            for a in self.args
        )
        return '\n'.join(panels)

    def __repr__(self):
        """Return the plain-text form: each expression above its repr."""
        sections = (a + '\n' + repr(eval(a)) for a in self.args)
        return '\n\n'.join(sections)

# Build Train Feature File

In [32]:
def CleanFeatureData(df):
    """Prepare a raw feature frame, in place, for time-indexed use.

    Steps applied to ``df`` itself:
      1. parse the ``Date/Time`` text column (day/month/year
         hour:minute:second) into a new ``datetime`` column,
      2. remove the ``Ticker`` and ``Date/Time`` columns,
      3. make ``datetime`` the index,
      4. drop every row that contains a missing value.

    The frame is modified directly and nothing is returned.
    """
    source_columns = ['Ticker', 'Date/Time']
    timestamps = pd.to_datetime(
        df["Date/Time"],
        format='%d/%m/%Y %H:%M:%S',
        dayfirst=True,
    )
    df["datetime"] = timestamps
    df.drop(columns=source_columns, inplace=True)
    df.set_index('datetime', inplace=True)
    df.dropna(inplace=True)

In [33]:
def InitLabelDF(indexItem,colLabelName,defaultVal):
    """Create a one-column label frame pre-filled with a default value.

    Parameters
    ----------
    indexItem : index used for the rows of the new frame.
    colLabelName : name of the single column.
    defaultVal : value written to every row of that column.

    Returns
    -------
    pandas.DataFrame indexed by ``indexItem`` with one column,
    ``colLabelName``, holding ``defaultVal``.
    """
    filled = pd.Series(defaultVal, index=indexItem, name=colLabelName)
    return filled.to_frame()

In [34]:
def BuildLabelData(sourceFile,labelName,datetime_format):
    """Load a signal CSV and reduce it to a single, renamed label column.

    The CSV is expected to contain ``datetime``, ``open``, ``high``,
    ``low``, ``close`` and ``signal`` columns. The ``datetime`` text is
    parsed with ``datetime_format`` and becomes the index, the four price
    columns are removed, and ``signal`` is renamed to ``labelName``.

    Parameters
    ----------
    sourceFile : path (or buffer) handed to ``pd.read_csv``.
    labelName : new name for the ``signal`` column.
    datetime_format : ``strftime``-style format of the ``datetime`` column.

    Returns
    -------
    pandas.DataFrame indexed by the parsed ``datetime``.
    """
    raw = pd.read_csv(sourceFile)
    stamps = pd.to_datetime(raw["datetime"], format=datetime_format, dayfirst=True)
    return (
        raw.assign(datetime=stamps)
        .set_index('datetime')
        .drop(columns=['open', 'high', 'low', 'close'])
        .rename(columns={'signal': labelName})
    )

# Define Label and default value

In [35]:
# Name of the combined label column produced by the labelling step.
xlabel='TrendLabel'

# Names given to the raw signal columns loaded from the Down / Up mark files.
label_1='DownLabel'
label_3='UpLabel'

# Value every row of the combined label starts with (there is no separate
# label_2 name; 2 is simply the default).
default_label_val=2

In [36]:
def MarkTrend(item_row):
    """Pick the trend label for one row from its down/up signal columns.

    Reads three entries of ``item_row`` using the module-level column names
    ``xlabel``, ``label_1`` (down signal) and ``label_3`` (up signal).

    Returns
    -------
    * the down signal when it is set (non-zero) and the up signal is not,
    * the up signal when it is set (non-zero) and the down signal is not,
    * otherwise the row's current ``xlabel`` value (both set or neither set).

    A missing signal (NaN, e.g. produced by a left merge for a timestamp
    that is absent from a mark file) is treated as "not set". Without this,
    ``NaN != 0`` evaluates to True and NaN could be returned as the label,
    while a valid signal paired with a missing one would be ignored.
    """
    current = item_row[xlabel]
    down = item_row[label_1]
    up = item_row[label_3]

    # notna must be checked first: any comparison against NaN is misleading.
    down_set = bool(pd.notna(down) and down != 0)
    up_set = bool(pd.notna(up) and up != 0)

    if down_set and not up_set:
        return down
    if up_set and not down_set:
        return up
    return current



In [37]:
def IterateMarkLabel(temp_df_feature,upMarkFile,upTimeFormat,downMarkFile,downTimeFormat):
    """Attach the combined trend label to one slice of the feature frame.

    Parameters
    ----------
    temp_df_feature : feature frame whose index is named ``datetime``.
    upMarkFile, upTimeFormat : CSV path and datetime format of the up signal.
    downMarkFile, downTimeFormat : CSV path and datetime format of the down
        signal.

    Returns
    -------
    ``temp_df_feature`` left-merged on ``datetime`` with a single extra
    column named by ``xlabel``.

    The label frame starts with ``default_label_val`` on every feature
    timestamp, is left-merged with both signal frames, and each row is then
    resolved by ``MarkTrend``; the two raw signal columns are dropped before
    the final merge.
    """
    base_labels = InitLabelDF(temp_df_feature.index, xlabel, default_label_val)
    up_signals = BuildLabelData(upMarkFile, label_3, upTimeFormat)
    down_signals = BuildLabelData(downMarkFile, label_1, downTimeFormat)

    # Left merges keep exactly the feature timestamps as the driving key set.
    merged = (
        base_labels
        .merge(up_signals, how='left', on='datetime')
        .merge(down_signals, how='left', on='datetime')
    )

    merged[xlabel] = merged.apply(MarkTrend, axis=1)
    trend_only = merged.drop(columns=[label_1, label_3])

    return temp_df_feature.merge(trend_only, how='left', on='datetime')
    

In [38]:
# Common root of every dataset path used below (Windows-style separators).
dataRoot='datasets\\Tf15M\\NewTrain2007-2019\\'

# for trend label: folder holding the mark files and the feature exports
marklabelPath=dataRoot+'ABData\\'

# Input feature files (training period and the file used for prediction).
source_feature=marklabelPath+'FeatureTrain_2007-2019_S50-15M_Trend.csv'
source_feature_prediction=marklabelPath+'FeatureTest _2020_S50-15M_Trend.csv'

# Output files written by this notebook.
train_file=dataRoot+'ML-Trend_S50_X-Train.csv'
test_file=dataRoot+'ML-Trend_S50_X-Unlabeled.csv'

ab_file=dataRoot+'AB-Trend-S50-15M_X.csv'




In [39]:
# Load the training-period feature export and clean it in place
# (datetime index, helper columns removed, rows with NA dropped).
df_indy=pd.read_csv(source_feature)
CleanFeatureData(df_indy)
# DataFrame.info() writes its summary itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None".
df_indy.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 68292 entries, 2007-01-03 09:45:00 to 2019-12-30 17:00:00
Data columns (total 21 columns):
open                      68292 non-null float64
high                      68292 non-null float64
low                       68292 non-null float64
close                     68292 non-null float64
indy_ma-550               68292 non-null float64
indy_ma-1100              68292 non-null float64
indy_hh-550               68292 non-null float64
indy_ll-550               68292 non-null float64
indy_mid-550              68292 non-null float64
indy_hh2-1100             68292 non-null float64
indy_ll2-1100             68292 non-null float64
indy_mid2-1100            68292 non-null float64
indy_macd110-440          68292 non-null float64
indy_signal110-440-110    68292 non-null float64
indy_hist_macd110-440     68292 non-null float64
indy_rsi25-ma20           68292 non-null float64
indy_6ATRTrail_DC-110     68292 non-null float64
cate_3trend-550_ma110  

In [40]:

# Datetime layouts found in the mark files.
fmtWithSeconds='%d/%m/%Y %H:%M:%S'
fmtNoSeconds='%d/%m/%Y %H:%M'

# (slice start, slice end, period tag in the file name, up format, down format)
# NOTE(review): the '2012-2013' files are paired with a slice that starts at
# 11-2011 -- confirm those files really cover Nov-Dec 2011.
markPeriods=[
    ('01-2007','12-2008','2006-2008',fmtWithSeconds,fmtWithSeconds),
    ('01-2009','10-2011','2009-2011',fmtWithSeconds,fmtWithSeconds),
    ('11-2011','12-2013','2012-2013',fmtWithSeconds,fmtWithSeconds),
    ('01-2014','12-2015','2014-2015',fmtWithSeconds,fmtWithSeconds),
    ('01-2016','06-2017','2016-1H2017',fmtNoSeconds,fmtWithSeconds),
    ('07-2017','12-2018','2H2017-2018',fmtWithSeconds,fmtWithSeconds),
    ('01-2019','12-2019','2019',fmtWithSeconds,fmtWithSeconds),
]

# Each entry: [feature slice, up CSV path, up format, down CSV path, down format]
markFileList=[
    [
        df_indy.loc[startMonth:endMonth],
        f'{marklabelPath}TestS50_{periodTag}-15M_Up.csv',
        upFormat,
        f'{marklabelPath}TestS50_{periodTag}-15M_Down.csv',
        downFormat,
    ]
    for startMonth,endMonth,periodTag,upFormat,downFormat in markPeriods
]
print("no.markFile",len(markFileList))

no.markFile 7


In [41]:
# Label every feature slice with its own pair of up/down mark files and
# collect the labelled frames in order.
listDFTrain=[]
for mark in markFileList:
    df_subFeature,upPath,timeformatUp,downPath,timeformatDown=mark
    xdf_train=IterateMarkLabel(
        df_subFeature,
        upPath,
        timeformatUp,
        downPath,
        timeformatDown,
    )
    listDFTrain.append(xdf_train)



In [42]:
# Stack the per-period labelled frames row-wise into one training frame.
trainDF=pd.concat(listDFTrain,axis=0)
print(trainDF.shape)
# `display` is the helper class defined in this notebook: it takes expression
# strings and renders their evaluated results side by side.
display('trainDF.head(3)','trainDF.tail(3)')


(68292, 22)


Unnamed: 0_level_0,open,high,low,close,indy_ma-550,indy_ma-1100,indy_hh-550,indy_ll-550,indy_mid-550,indy_hh2-1100,...,indy_macd110-440,indy_signal110-440-110,indy_hist_macd110-440,indy_rsi25-ma20,indy_6ATRTrail_DC-110,cate_3trend-550_ma110,cate_2trend-1100_ma220,cate_rannkHL1100-ma66,cate_CombineTrend,TrendLabel
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-01-03 09:45:00,460.0,460.5,458.0,458.0,503.7,508.75,527.0,415.0,471.0,528.2,...,-22.5,-27.2,4.7,39.11,473.92,1,0,3,2,2
2007-01-03 10:00:00,458.0,464.5,457.6,463.9,503.62,508.72,527.0,415.0,471.0,528.2,...,-22.5,-27.19,4.68,39.05,473.64,1,0,3,2,2
2007-01-03 10:15:00,463.9,464.0,459.2,460.0,503.52,508.68,527.0,415.0,471.0,528.2,...,-22.53,-27.17,4.64,38.91,473.48,1,0,3,2,2

Unnamed: 0_level_0,open,high,low,close,indy_ma-550,indy_ma-1100,indy_hh-550,indy_ll-550,indy_mid-550,indy_hh2-1100,...,indy_macd110-440,indy_signal110-440-110,indy_hist_macd110-440,indy_rsi25-ma20,indy_6ATRTrail_DC-110,cate_3trend-550_ma110,cate_2trend-1100_ma220,cate_rannkHL1100-ma66,cate_CombineTrend,TrendLabel
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-12-30 16:30:00,1067.6,1067.8,1066.8,1067.6,1066.04,1074.37,1091.5,1040.5,1066.0,1105.5,...,8.76,4.4,4.36,45.61,1066.36,1,0,2,4,2
2019-12-30 16:45:00,1067.5,1067.6,1066.5,1067.0,1066.03,1074.36,1091.5,1040.5,1066.0,1105.5,...,8.76,4.51,4.25,45.44,1066.4,1,0,2,4,2
2019-12-30 17:00:00,1066.9,1066.9,1066.9,1066.9,1066.01,1074.34,1091.5,1040.5,1066.0,1105.5,...,8.76,4.62,4.14,45.28,1066.5,1,0,2,4,2


# Export File to TrainML and Amibroker

In [43]:
# Write the labelled training frame (datetime index included) to train_file.
trainDF.to_csv(path_or_buf=train_file)

In [44]:
# Export only the price columns plus the trend label to ab_file.
aux=xlabel
df_amibroker=trainDF[['open','high','low','close',aux]]
df_amibroker.to_csv(ab_file)
# Preview kept as the LAST expression of the cell: a bare expression in the
# middle of a cell is evaluated and thrown away, so it was never displayed.
df_amibroker.head(5)

# Analyze File TrainLabel

In [52]:
# Class balance of the combined label: absolute counts, then whole percents.
trendLabels=trainDF[xlabel]
labelCounts=trendLabels.value_counts()
labelPercent=(trendLabels.value_counts(normalize=True)*100).round(0)

print(labelCounts)
print("###############################")
print(labelPercent)

2    31331
3    23243
1    13718
Name: TrendLabel, dtype: int64
###############################
2    46.0
3    34.0
1    20.0
Name: TrendLabel, dtype: float64


# Modify Labels (Optional)

Optionally, export the labels to Amibroker (AB) to visualize them and edit them as desired.

# Build Test (Unlabeled) Feature File

In [53]:
# Load the feature export used for prediction and clean it in place
# (datetime index, helper columns removed, rows with NA dropped).
df_test=pd.read_csv(source_feature_prediction)
CleanFeatureData(df_test)
# DataFrame.info() writes its summary itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None".
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1214 entries, 2020-01-02 09:45:00 to 2020-03-19 16:45:00
Data columns (total 21 columns):
open                      1214 non-null float64
high                      1214 non-null float64
low                       1214 non-null float64
close                     1214 non-null float64
indy_ma-550               1214 non-null float64
indy_ma-1100              1214 non-null float64
indy_hh-550               1214 non-null float64
indy_ll-550               1214 non-null float64
indy_mid-550              1214 non-null float64
indy_hh2-1100             1214 non-null float64
indy_ll2-1100             1214 non-null float64
indy_mid2-1100            1214 non-null float64
indy_macd110-440          1214 non-null float64
indy_signal110-440-110    1214 non-null float64
indy_hist_macd110-440     1214 non-null float64
indy_rsi25-ma20           1214 non-null float64
indy_6ATRTrail_DC-110     1214 non-null float64
cate_3trend-550_ma110     1214 non-null i

In [54]:
# Report the number of missing values per column. The counts are printed
# explicitly: written as `print(...), expr` they only formed a discarded
# tuple, so nothing but the caption ever appeared in the output.
print("Show Null/NA values\n")
print(df_test.isna().sum())

df_test.dropna(inplace=True)

Show Null/NA values



In [55]:
# Re-read the exported training file and keep only its feature columns so
# they can be compared with the test frame.
df_train=pd.read_csv(train_file,index_col='datetime',parse_dates=['datetime'],dayfirst=True)
# The column is named by keyword: passing the axis positionally
# (drop('TrendLabel', 1)) is deprecated and rejected by pandas 2.x.
df_train.drop(columns='TrendLabel',inplace=True)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 68292 entries, 2007-01-03 09:45:00 to 2019-12-30 17:00:00
Data columns (total 21 columns):
open                      68292 non-null float64
high                      68292 non-null float64
low                       68292 non-null float64
close                     68292 non-null float64
indy_ma-550               68292 non-null float64
indy_ma-1100              68292 non-null float64
indy_hh-550               68292 non-null float64
indy_ll-550               68292 non-null float64
indy_mid-550              68292 non-null float64
indy_hh2-1100             68292 non-null float64
indy_ll2-1100             68292 non-null float64
indy_mid2-1100            68292 non-null float64
indy_macd110-440          68292 non-null float64
indy_signal110-440-110    68292 non-null float64
indy_hist_macd110-440     68292 non-null float64
indy_rsi25-ma20           68292 non-null float64
indy_6ATRTrail_DC-110     68292 non-null float64
cate_3trend-550_ma110  

In [56]:
# Write the unlabeled test file only when its columns match the training
# features exactly (same names, same order).
test_columns=df_test.columns.tolist()
train_columns=df_train.columns.tolist()

if test_columns==train_columns :
    df_test.to_csv(test_file)
else:
    # Report the mismatch instead of skipping the export without a trace.
    print("Columns (or their order) differ between df_test and df_train;",test_file,"was NOT written.")
    print("only in df_test :",[c for c in test_columns if c not in train_columns])
    print("only in df_train:",[c for c in train_columns if c not in test_columns])