In [None]:
pip install yfinance

In [None]:
url = 'https://launchpad.net/~mario-mariomedina/+archive/ubuntu/talib/+files'
!wget $url/libta-lib0_0.4.0-oneiric1_amd64.deb -qO libta.deb
!wget $url/ta-lib0-dev_0.4.0-oneiric1_amd64.deb -qO ta.deb
!dpkg -i libta.deb ta.deb
!pip install ta-lib

In [None]:
pip install anuecrawler

In [None]:
# -*- coding: utf-8 -*-

'''
#################### CONNECT TO THE FOLDER ####################
'''
def connect_to_the_folder(method):
    """Change the working directory to the project folder.

    Args:
        method: 'google' mounts Google Drive through google.colab and
            switches to the project folder on the mounted drive; 'local'
            switches to the project folder on the local C: drive. Any
            other value leaves the working directory untouched.
    """
    import os
    if method == 'google':
        from google.colab import drive
        drive.mount('/content/drive/', force_remount=True)
        os.chdir('/content/drive/My Drive/Data_Technology/DT_Learning/III_Big_Data/BDSE21_Team1/Final_Project/')
    elif method == 'local':
        # Every backslash is escaped explicitly. The previous literal relied on
        # invalid escape sequences (\S, \B, \F), which newer Python versions
        # warn about. The resulting path is byte-for-byte the same.
        os.chdir('c:\\Shared\\BDSE_Team1\\Final_Project\\')

'''
#################### GET STOCK DATA FROM YFINANCE ####################
'''
def get_stock_data_from_yfinance_by_code(code):
    """Download the daily price history of one stock code and save it as CSV.

    The ticker '<code>.TW' is tried first and, when it returns no rows,
    '<code>.TWO'. History is requested from 2010-01-01 up to today's date.
    The open/high/low/close/volume columns are written, with lower-case
    names and an index named 'date', to 'Dataset_A/Stock_Data/<code>.csv'.

    When neither ticker returns any row, nothing is written and a message
    is printed instead of falling through to the column selection.

    Args:
        code: Stock code; converted with str() before use.
    """
    from datetime import date
    import yfinance as yf

    today = date.today().strftime("%Y-%m-%d")

    for suffix in ('.TW', '.TWO'):
        df = yf.Ticker(str(code) + suffix).history(start='2010-01-01', end=today)
        if len(df) > 0:
            break
    else:
        # Both lookups came back empty: there is nothing meaningful to store.
        print(f'{str(code)} --- no data from yfinance, skipped ---')
        return

    df = df[['Open', 'High', 'Low', 'Close', 'Volume']]
    df = df.rename(columns={"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"})
    df.index.names = ['date']

    # Save to csv
    filename = f'Dataset_A/Stock_Data/{str(code)}.csv'
    df.to_csv(filename)
    print(filename, '--- complete ---')

def get_stock_index_data_from_yfinance():
    """Download the daily history of the '^TWII' and '^TFNI' tickers.

    For each of the two codes the history from 2010-01-01 up to today's
    date is requested, reduced to the open/high/low/close/volume columns
    (renamed to lower case, index named 'date') and written to
    'Dataset_A/Stock_Data/<code>.csv'.
    """
    from datetime import date
    import yfinance as yf

    # Original yfinance column name -> name used in the saved CSV.
    column_map = {"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"}
    end_date = str(date.today().strftime("%Y-%m-%d"))

    for code in ('TWII', 'TFNI'):
        history = yf.Ticker('^' + code).history(start='2010-01-01', end=end_date)
        df = history[list(column_map)].rename(columns=column_map)
        df.index.names = ['date']

        # Save to csv
        filename = f'Dataset_A/Stock_Data/{str(code)}.csv'
        df.to_csv(filename)
        print(filename, '--- complete ---')

def get_stock_data_from_yfinance():
    """Download price history for every code in the code list, then the indexes.

    Codes are read from the 'code_list' column of
    'Dataset_A/Code_List/Code_List.csv' and downloaded on a thread pool.
    A download that raises is reported with its code and does not stop
    the remaining downloads. Afterwards the index data is downloaded.
    """
    import pandas as pd
    from concurrent.futures import ThreadPoolExecutor, as_completed

    filename = 'Dataset_A/Code_List/Code_List.csv'
    code_list = pd.read_csv(filename, index_col=0, dtype=str)['code_list']

    with ThreadPoolExecutor(max_workers=100) as executor:
        futures = {executor.submit(get_stock_data_from_yfinance_by_code, code): code for code in code_list}
        # Inspect every future: an exception raised inside a worker is
        # otherwise stored on the future and never surfaces.
        for future in as_completed(futures):
            error = future.exception()
            if error is not None:
                print(futures[future], '--- failed ---', repr(error))

    get_stock_index_data_from_yfinance()
    print('-------------------- GOT STOCK DATA FROM YFINANCE --------------------')

'''
#################### TALIB ####################
'''
def get_stock_data_talib_by_code(code):
    """Append the output of every TA-Lib function to one code's price data.

    Reads 'Dataset_A/Stock_Data/<code>.csv', calls each function listed by
    talib.get_functions() through the talib.abstract interface, merges
    every output onto the frame by index and writes the result to
    'Dataset_A/Stock_Data_Talib/<code>_talib.csv'. A function that raises
    is skipped.

    Args:
        code: Stock or index code; converted with str() for the file names.
    """
    import pandas as pd
    import talib
    from talib import abstract
    filename = f'Dataset_A/Stock_Data/{str(code)}.csv'
    # NOTE(review): columns are loaded as strings here, while the update path
    # loads the same file with inferred dtypes - confirm this is intended.
    df = pd.read_csv(filename, index_col=0, dtype=str)

    # Talib
    df['periods'] = 5 # add the periods to generate the column
    for name in talib.get_functions():
        try:
            # Look the function up by attribute instead of eval() on a string.
            output = getattr(abstract, name)(df)
            # Single-series outputs are named after the function; frame
            # outputs keep their own column names.
            output.name = name.lower() if isinstance(output, pd.Series) else None
            df = pd.merge(df, pd.DataFrame(output), left_on=df.index, right_on=output.index)
            # Merging on index arrays yields a 'key_0' column holding the dates.
            df = df.set_index('key_0')
        except Exception:
            # Best effort: an indicator that cannot be computed from these
            # columns is left out. KeyboardInterrupt is no longer swallowed.
            continue
    df.index.names = ['date']

    # Save to csv
    filename = f'Dataset_A/Stock_Data_Talib/{str(code)}_talib.csv'
    df.to_csv(filename)
    print(filename, '--- complete ---')

def get_stock_data_talib():
    """Compute the TA-Lib columns for every listed code and both indexes.

    Codes are read from the 'code_list' column of
    'Dataset_A/Code_List/Code_List.csv' and processed on a thread pool.
    A code whose processing raises is reported and does not stop the
    others. 'TWII' and 'TFNI' are processed afterwards, sequentially.
    """
    import pandas as pd
    from concurrent.futures import ThreadPoolExecutor, as_completed

    filename = 'Dataset_A/Code_List/Code_List.csv'
    code_list = pd.read_csv(filename, index_col=0, dtype=str)['code_list']

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(get_stock_data_talib_by_code, code): code for code in code_list}
        # Inspect every future: an exception raised inside a worker is
        # otherwise stored on the future and never surfaces.
        for future in as_completed(futures):
            error = future.exception()
            if error is not None:
                print(futures[future], '--- failed ---', repr(error))

    for code in ['TWII', 'TFNI']:
        get_stock_data_talib_by_code(code)

    print('-------------------- GOT STOCK DATA TALIB --------------------')

'''
#################### TECHNICAL ANALYSIS INDICATORS ####################
'''
def get_stock_data_talib_tech_by_code(df_twii, df_tfni, code):
    """Add lagged-close and relative-return columns to one code's TA-Lib file.

    Reads 'Dataset_A/Stock_Data_Talib/<code>_talib.csv' and adds:
      * '<k>days_before_close' for k = 1..20: the close k rows earlier;
      * '<k>days_before_increase' for k = 1..20: the percentage change
        between the closes k and k-1 rows earlier;
      * for d in 1, 5, 10, 20, 60: '<d>days_self' (fractional change of
        the close over d rows), '<d>days_twii' and '<d>days_tfni' (that
        change minus the same change of df_twii / df_tfni, aligned by
        index).
    The result goes to 'Dataset_A/Stock_Data_Talib_Tech/<code>_talib_tech.csv'.

    Args:
        df_twii: Frame with a 'close' column, used as first benchmark.
        df_tfni: Frame with a 'close' column, used as second benchmark.
        code: Code used to build the input and output file names.
    """
    import pandas as pd

    def change_over(frame, span):
        # Fractional change of 'close' against the value `span` rows earlier.
        base = frame['close'].shift(periods=span)
        return (frame['close'][span:] - base) / base

    source_path = f'Dataset_A/Stock_Data_Talib/{str(code)}_talib.csv'
    df = pd.read_csv(source_path, index_col=0)

    lags = range(1, 21)
    for lag in lags:
        df[f'{lag}days_before_close'] = df['close'].shift(periods=lag)

    for lag in lags:
        newer = df['close'].shift(periods=lag - 1)
        older = df['close'].shift(periods=lag)
        ratio = (newer - older) / older
        df[f'{lag}days_before_increase'] = ratio * 100

    for days in (1, 5, 10, 20, 60):
        own_change = change_over(df, days)
        twii_change = change_over(df_twii, days)
        tfni_change = change_over(df_tfni, days)
        df[f'{days}days_self'] = own_change
        df[f'{days}days_twii'] = own_change - twii_change
        df[f'{days}days_tfni'] = own_change - tfni_change

    # Save to csv
    target_path = f'Dataset_A/Stock_Data_Talib_Tech/{str(code)}_talib_tech.csv'
    df.to_csv(target_path)
    print(target_path, '--- complete ---')

def get_stock_data_talib_tech():
    """Build the lagged-close / relative-return file for every listed code.

    Codes come from the 'code_list' column of
    'Dataset_A/Code_List/Code_List.csv'. The TWII and TFNI price files are
    loaded once and passed to the per-code step. Codes are processed
    sequentially; a code that fails is reported and the loop continues.
    """
    import pandas as pd
    filename = 'Dataset_A/Code_List/Code_List.csv'
    code_list = pd.read_csv(filename, index_col=0, dtype=str)['code_list']

    df_twii = pd.read_csv('Dataset_A/Stock_Data/TWII.csv', index_col=0)
    df_tfni = pd.read_csv('Dataset_A/Stock_Data/TFNI.csv', index_col=0)

    for code in code_list:
        try:
            get_stock_data_talib_tech_by_code(df_twii, df_tfni, code)
        except Exception as error:
            # Best effort per code, but say which code failed and why
            # instead of discarding the error silently.
            print(code, '--- failed ---', repr(error))

    print('-------------------- GOT STOCK DATA TALIB TECH --------------------')

'''
#################### NLP ####################
'''
def get_stock_data_talib_tech_nlp_by_code(df_nlp, code):
    """Attach the NLP score columns of one code to its technical file.

    Reads 'Dataset_A/Stock_Data_Talib_Tech/<code>_talib_tech.csv', joins
    the 'nlp_bert' and 'nlp_lstm' columns of the df_nlp rows whose 'code'
    equals `code` (column-wise, aligned by index) and writes
    'Dataset_A/Stock_Data_Talib_Tech_NLP/<code>_talib_tech_nlp.csv'.

    Value handling, as implemented:
      * 'nlp_bert': missing becomes 0, an original 0 becomes -1, other
        values are kept; the column is cast to int32.
      * 'nlp_lstm': missing becomes 0.
      * rows without a 'close' value are dropped.

    Args:
        df_nlp: Frame with 'code', 'nlp_bert' and 'nlp_lstm' columns.
        code: Code to select from df_nlp and to build the file names.
    """
    import pandas as pd
    source_path = f'Dataset_A/Stock_Data_Talib_Tech/{str(code)}_talib_tech.csv'
    df = pd.read_csv(source_path, index_col=0)

    # NOTE(review): this is a plain equality test; if df_nlp['code'] and
    # `code` have different types (int vs str) nothing matches - confirm.
    scores = df_nlp.loc[df_nlp['code'] == code, ['nlp_bert', 'nlp_lstm']]
    df = pd.concat([df, scores], axis=1)

    # Mark missing as -1 first so the cast to int works, then swap -1 and 0.
    bert = df['nlp_bert'].fillna(-1).astype('int32')
    df['nlp_bert'] = bert.replace([-1, 0],[0, -1])
    df['nlp_lstm'] = df['nlp_lstm'].fillna(0)

    df = df.dropna(subset=['close'])
    df.index.names = ['date']

    # Save to csv
    target_path = f'Dataset_A/Stock_Data_Talib_Tech_NLP/{str(code)}_talib_tech_nlp.csv'
    df.to_csv(target_path)
    print(target_path, '--- complete ---')

def get_stock_data_talib_tech_nlp():
    """Attach the NLP score columns to the technical file of every listed code.

    Codes come from the 'code_list' column of
    'Dataset_A/Code_List/Code_List.csv'; the scores are read once from
    'Dataset_A/News_Anue_Arrange_NLP/News_Anue_Arrange_NLP_LSTM.csv'.
    Codes are processed sequentially; a code that fails is reported and
    the loop continues.
    """
    import pandas as pd
    filename = 'Dataset_A/Code_List/Code_List.csv'
    code_list = pd.read_csv(filename, index_col=0, dtype=str)['code_list']

    filename = 'Dataset_A/News_Anue_Arrange_NLP/News_Anue_Arrange_NLP_LSTM.csv'
    df_nlp = pd.read_csv(filename, index_col=0)

    for code in code_list:
        try:
            get_stock_data_talib_tech_nlp_by_code(df_nlp, code)
        except Exception as error:
            # Best effort per code, but say which code failed and why
            # instead of discarding the error silently.
            print(code, '--- failed ---', repr(error))

    print('-------------------- GOT STOCK DATA TALIB TECH NLP --------------------')

'''
#################### TRAINING DATA STOCK ####################
'''
def get_training_data_stock_by_code(code):
    """Turn one code's feature file into a labelled training file.

    Reads 'Dataset_A/Stock_Data_Talib_Tech_NLP/<code>_talib_tech_nlp.csv',
    drops the 'periods', 'acos', 'asin', 'cosh', 'exp' and 'sinh' columns
    and, for each horizon h = 1..5, adds
      * 'y_label_close_<h>days_after': the close h rows later;
      * 'y_label_increase_<h>days_after': the fractional change from the
        current close to that later close.
    Every row containing a missing value is removed before the frame is
    written to 'Dataset_A/Training_Data_Stock/<code>_training.csv'.

    Args:
        code: Code used to build the input and output file names.
    """
    import pandas as pd
    source_path = f'Dataset_A/Stock_Data_Talib_Tech_NLP/{str(code)}_talib_tech_nlp.csv'
    dropped = ['periods', 'acos', 'asin', 'cosh', 'exp', 'sinh']
    df = pd.read_csv(source_path, index_col=0).drop(columns=dropped)

    for horizon in range(1, 6):
        # A negative shift pulls the value from `horizon` rows ahead.
        later_close = df['close'].shift(periods=-horizon)
        df[f'y_label_close_{horizon}days_after'] = later_close
        df[f'y_label_increase_{horizon}days_after'] = (later_close - df['close']) / df['close']

    df = df.dropna()

    # Save to csv
    target_path = f'Dataset_A/Training_Data_Stock/{str(code)}_training.csv'
    df.to_csv(target_path)
    print(target_path, '--- complete ---')

def get_training_data_stock():
    """Build the labelled training file for every listed code.

    Codes come from the 'code_list' column of
    'Dataset_A/Code_List/Code_List.csv' and are processed sequentially;
    a code that fails is reported and the loop continues.
    """
    import pandas as pd
    filename = 'Dataset_A/Code_List/Code_List.csv'
    code_list = pd.read_csv(filename, index_col=0, dtype=str)['code_list']

    for code in code_list:
        try:
            get_training_data_stock_by_code(code)
        except Exception as error:
            # Best effort per code, but say which code failed and why
            # instead of discarding the error silently.
            print(code, '--- failed ---', repr(error))

    print('-------------------- GOT TRAINING DATA STOCK --------------------')

'''
#################### GET SAVE DIRECTION ####################
'''
def get_save_direction():
    """Pick the order of the two dataset folders from today's day of month.

    Returns:
        A tuple (dir_1, dir_2): ("Dataset_A", "Dataset_B") when the day of
        the month is even, ("Dataset_B", "Dataset_A") when it is odd.
    """
    import datetime
    folders = ("Dataset_A", "Dataset_B")
    # 0 on even days, 1 on odd days: selects which folder comes first.
    parity = datetime.datetime.today().day % 2
    return folders[parity], folders[1 - parity]

'''
#################### UPDATE THE DATA SET ####################
'''
def update_stock_data_by_code(dir_1, dir_2, code):
    """Extend one code's price file with the rows published since its last date.

    Reads '<dir_1>/Stock_Data/<code>.csv', downloads history starting at
    the last index value of that file (ticker '<code>.TW', then
    '<code>.TWO' when the first returns no rows), appends the new rows and
    writes '<dir_2>/Stock_Data/<code>.csv'. When the download returns no
    rows the old data is written to dir_2 unchanged.

    Args:
        dir_1: Folder the existing file is read from.
        dir_2: Folder the updated file is written to.
        code: Stock code; converted with str() before use.
    """
    import pandas as pd
    from datetime import date
    import yfinance as yf

    filename = f'{dir_1}/Stock_Data/{str(code)}.csv'
    df_old = pd.read_csv(filename, index_col=0)

    start_date = df_old.index[-1]
    today = date.today().strftime("%Y-%m-%d")

    df = yf.Ticker(str(code)+'.TW').history(start=start_date, end=today)
    if len(df) == 0:
        df = yf.Ticker(str(code)+'.TWO').history(start=start_date, end=today)

    if len(df) > 0:
        df = df[['Open', 'High', 'Low', 'Close', 'Volume']]
        df = df.rename(columns={"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"})
        df.index.names = ['date']
        df.index = df.index.strftime("%Y-%m-%d")
        # The request starts at the last stored date, so the first row is
        # presumably the one already stored - drop it to avoid a duplicate.
        df = df[1:]
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        df = pd.concat([df_old, df])
    else:
        # Nothing new was returned: carry the existing data forward as-is.
        df = df_old

    # Save to csv
    filename = f'{dir_2}/Stock_Data/{str(code)}.csv'
    df.to_csv(filename)
    print(filename, '--- complete ---')

def update_stock_index_data(dir_1, dir_2):
    """Extend the 'TWII' and 'TFNI' price files with rows since their last date.

    For each code, reads '<dir_1>/Stock_Data/<code>.csv', downloads the
    '^<code>' history starting at the file's last index value, appends
    the new rows and writes '<dir_2>/Stock_Data/<code>.csv'. When the
    download returns no rows the old data is written unchanged.

    Args:
        dir_1: Folder the existing files are read from.
        dir_2: Folder the updated files are written to.
    """
    import pandas as pd
    from datetime import date
    import yfinance as yf

    today = date.today().strftime("%Y-%m-%d")
    for code in ['TWII', 'TFNI']:
        filename = f'{dir_1}/Stock_Data/{str(code)}.csv'
        df_old = pd.read_csv(filename, index_col=0)
        start_date = df_old.index[-1]

        df = yf.Ticker('^' + code).history(start=start_date, end=today)
        if len(df) > 0:
            df = df[['Open', 'High', 'Low', 'Close', 'Volume']]
            df = df.rename(columns={"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"})
            df.index.names = ['date']
            df.index = df.index.strftime("%Y-%m-%d")
            # The request starts at the last stored date, so the first row is
            # presumably the one already stored - drop it to avoid a duplicate.
            df = df[1:]
            # DataFrame.append was removed in pandas 2.0; concat is equivalent.
            df = pd.concat([df_old, df])
        else:
            # Nothing new was returned: carry the existing data forward as-is.
            df = df_old

        # Save to csv
        filename = f'{dir_2}/Stock_Data/{str(code)}.csv'
        df.to_csv(filename)
        print(filename, '--- complete ---')

def update_stock_data_talib_by_code(dir_1, dir_2, code):
    """Compute the TA-Lib columns for the rows added since the last update.

    Reads the existing '<dir_1>/Stock_Data_Talib/<code>_talib.csv' and the
    already updated '<dir_2>/Stock_Data/<code>.csv'. The TA-Lib functions
    are evaluated only on the price rows after the 70th-from-last date of
    the old file; of the result, the rows after the old file's last date
    are appended to the old data and written to
    '<dir_2>/Stock_Data_Talib/<code>_talib.csv'.

    Args:
        dir_1: Folder the existing TA-Lib file is read from.
        dir_2: Folder holding the updated price file and receiving the
            updated TA-Lib file.
        code: Stock or index code used to build the file names.

    Raises:
        IndexError: If the existing TA-Lib file has fewer than 70 rows.
    """
    import pandas as pd
    import talib
    from talib import abstract

    filename = f'{dir_1}/Stock_Data_Talib/{str(code)}_talib.csv'
    df_old = pd.read_csv(filename, index_col=0)
    date_old = df_old.index[-70]
    date_new = df_old.index[-1]

    filename = f'{dir_2}/Stock_Data/{str(code)}.csv'
    df = pd.read_csv(filename, index_col=0)
    # Copy so the column assignments below work on an independent frame.
    df = df[df.index > date_old].copy()

    # Talib
    df['periods'] = 5 # add the periods to generate the column
    for name in talib.get_functions():
        try:
            # Look the function up by attribute instead of eval() on a string.
            output = getattr(abstract, name)(df)
            output.name = name.lower() if isinstance(output, pd.Series) else None
            df = pd.merge(df, pd.DataFrame(output), left_on=df.index, right_on=output.index)
            # Merging on index arrays yields a 'key_0' column holding the dates.
            df = df.set_index('key_0')
        except Exception:
            # Best effort: an indicator that cannot be computed is left out.
            # KeyboardInterrupt is no longer swallowed.
            continue
    df.index.names = ['date']

    # Keep only the rows the old file does not have yet.
    df = df[df.index > date_new]
    # DataFrame.append was removed in pandas 2.0; concat is equivalent.
    df = pd.concat([df_old, df])

    # Save to csv
    filename = f'{dir_2}/Stock_Data_Talib/{str(code)}_talib.csv'
    df.to_csv(filename)
    print(filename, '--- complete ---')

def update_stock_data_talib_tech_by_code(dir_1, dir_2, df_twii, df_tfni, code):
    """Compute lagged-close / relative-return columns for newly added rows.

    Reads the existing '<dir_1>/Stock_Data_Talib_Tech/<code>_talib_tech.csv'
    and the updated '<dir_2>/Stock_Data_Talib/<code>_talib.csv'. The
    columns are computed on the rows after the 70th-from-last date of the
    old file; of the result, the rows after the old file's last date are
    appended to the old data and written to
    '<dir_2>/Stock_Data_Talib_Tech/<code>_talib_tech.csv'.

    Added columns:
      * '<k>days_before_close', '<k>days_before_increase' for k = 1..20;
      * '<d>days_self', '<d>days_twii', '<d>days_tfni' for
        d in 1, 5, 10, 20, 60.

    Args:
        dir_1: Folder the existing file is read from.
        dir_2: Folder holding the updated TA-Lib file and receiving the
            output.
        df_twii: Frame with a 'close' column, used as first benchmark.
        df_tfni: Frame with a 'close' column, used as second benchmark.
        code: Code used to build the file names.

    Raises:
        IndexError: If the existing file has fewer than 70 rows.
    """
    import pandas as pd
    filename = f'{dir_1}/Stock_Data_Talib_Tech/{str(code)}_talib_tech.csv'
    df_old = pd.read_csv(filename, index_col=0)
    date_old = df_old.index[-70]
    date_new = df_old.index[-1]

    filename = f'{dir_2}/Stock_Data_Talib/{str(code)}_talib.csv'
    df = pd.read_csv(filename, index_col=0)
    # Copy so the column assignments below work on an independent frame.
    df = df[df.index > date_old].copy()

    for day in range(20):
        df[f'{day+1}days_before_close'] = df['close'].shift(periods=day+1)

    for day in range(20):
        previous = df['close'].shift(periods=day+1)
        increase = (df['close'].shift(periods=day) - previous) / previous
        df[f'{day+1}days_before_increase'] = increase * 100

    for days in [1, 5, 10, 20, 60]:
        df_increase = (df['close'][days:] - df['close'].shift(periods=days)) / df['close'].shift(periods=days)
        df_twii_increase = (df_twii['close'][days:] - df_twii['close'].shift(periods=days)) / df_twii['close'].shift(periods=days)
        df_tfni_increase = (df_tfni['close'][days:] - df_tfni['close'].shift(periods=days)) / df_tfni['close'].shift(periods=days)
        df[f'{days}days_self'] = df_increase
        # Subtraction aligns the benchmark changes to this code's index.
        df[f'{days}days_twii'] = df_increase - df_twii_increase
        df[f'{days}days_tfni'] = df_increase - df_tfni_increase

    # Keep only the rows the old file does not have yet.
    df = df[df.index > date_new]
    # DataFrame.append was removed in pandas 2.0; concat is equivalent.
    df = pd.concat([df_old, df])

    # Save to csv
    filename = f'{dir_2}/Stock_Data_Talib_Tech/{str(code)}_talib_tech.csv'
    df.to_csv(filename)
    print(filename, '--- complete ---')

def update_stock_data_talib_tech_nlp_by_code(dir_1, dir_2, code):
    """Carry the stored NLP columns over to the updated technical file.

    Reads the NLP columns of
    '<dir_1>/Stock_Data_Talib_Tech_NLP/<code>_talib_tech_nlp.csv', joins
    them (aligned by index) to '<dir_2>/Stock_Data_Talib_Tech/<code>_talib_tech.csv'
    and writes '<dir_2>/Stock_Data_Talib_Tech_NLP/<code>_talib_tech_nlp.csv'.
    Rows without a stored NLP value get 0.

    The columns carried over are whichever of 'nlp', 'nlp_bert' and
    'nlp_lstm' exist in the old file. The builder in this file writes
    'nlp_bert' and 'nlp_lstm'; the single 'nlp' column is still accepted.

    Args:
        dir_1: Folder the existing NLP file is read from.
        dir_2: Folder holding the updated technical file and receiving
            the output.
        code: Code used to build the file names.

    Raises:
        KeyError: If the old file contains none of the NLP columns.
    """
    import pandas as pd
    filename = f'{dir_1}/Stock_Data_Talib_Tech_NLP/{str(code)}_talib_tech_nlp.csv'
    df_old = pd.read_csv(filename, index_col=0)

    nlp_columns = [name for name in ('nlp', 'nlp_bert', 'nlp_lstm') if name in df_old.columns]
    if not nlp_columns:
        raise KeyError(f"no NLP column ('nlp', 'nlp_bert', 'nlp_lstm') in {filename}")
    df_old = df_old[nlp_columns]

    filename = f'{dir_2}/Stock_Data_Talib_Tech/{str(code)}_talib_tech.csv'
    df = pd.read_csv(filename, index_col=0)

    df = pd.concat([df, df_old], axis=1)
    # Rows that are new in the technical file have no stored score yet.
    df[nlp_columns] = df[nlp_columns].fillna(0)
    if 'nlp_bert' in nlp_columns:
        # The builder stores this column as int32; keep the same format.
        df['nlp_bert'] = df['nlp_bert'].astype('int32')

    # Save to csv
    filename = f'{dir_2}/Stock_Data_Talib_Tech_NLP/{str(code)}_talib_tech_nlp.csv'
    df.to_csv(filename)
    print(filename, '--- complete ---')

def _run_for_each_code(task, code_list, *leading_args, max_workers=100):
    """Run task(*leading_args, code) for every code on a thread pool.

    An exception raised by a task is printed together with its code; the
    remaining tasks still run. Returns the list of codes that failed.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    failed_codes = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(task, *leading_args, code): code for code in code_list}
        # Inspect every future: an exception raised inside a worker is
        # otherwise stored on the future and never surfaces.
        for future in as_completed(futures):
            error = future.exception()
            if error is not None:
                failed_codes.append(futures[future])
                print(futures[future], '--- failed ---', repr(error))
    return failed_codes


def update_the_dataset():
    """Run the incremental update of every dataset stage.

    The source and destination folders come from get_save_direction().
    The stages run in order - price data, TA-Lib columns, lagged-close /
    relative-return columns, NLP columns - each over all codes of
    'Dataset_A/Code_List/Code_List.csv' on a thread pool. A code that
    fails in a stage is reported and the stage continues.
    """
    dir_1, dir_2 = get_save_direction()
    print(f'-------------------- START UPDATING FROM {dir_1} TO {dir_2} --------------------')

    import pandas as pd
    import time
    filename = 'Dataset_A/Code_List/Code_List.csv'
    code_list = pd.read_csv(filename, index_col=0, dtype=str)['code_list']

    # Stock Data
    _run_for_each_code(update_stock_data_by_code, code_list, dir_1, dir_2)
    update_stock_index_data(dir_1, dir_2)
    # NOTE(review): the pool has already finished when this line is reached;
    # the pause presumably lets the storage settle - confirm it is needed.
    time.sleep(10)
    print('-------------------- STOCK DATA UP TO DATE --------------------')

    # Talib
    _run_for_each_code(update_stock_data_talib_by_code, code_list, dir_1, dir_2)
    for code in ['TWII', 'TFNI']:
        update_stock_data_talib_by_code(dir_1, dir_2, code)
    time.sleep(10)
    print('-------------------- STOCK DATA TALIB UP TO DATE --------------------')

    # Tech
    df_twii = pd.read_csv(f'{dir_2}/Stock_Data/TWII.csv', index_col=0)
    df_tfni = pd.read_csv(f'{dir_2}/Stock_Data/TFNI.csv', index_col=0)
    _run_for_each_code(update_stock_data_talib_tech_by_code, code_list, dir_1, dir_2, df_twii, df_tfni)
    time.sleep(10)
    print('-------------------- STOCK DATA TALIB TECH UP TO DATE --------------------')

    # NLP
    _run_for_each_code(update_stock_data_talib_tech_nlp_by_code, code_list, dir_1, dir_2)
    print('-------------------- STOCK DATA TALIB TECH NLP UP TO DATE --------------------')

    print('-------------------- ALL DATASET UP TO DATE --------------------')

if __name__ == '__main__':
    from datetime import datetime

    connect_to_the_folder('google') # connect to google or local

    # Earlier pipeline stages, currently switched off:
    # get_stock_data_from_yfinance()
    # get_stock_data_talib()

    # Run the three enabled stages in order and print the elapsed time.
    started_at = datetime.now()
    for stage in (get_stock_data_talib_tech, get_stock_data_talib_tech_nlp, get_training_data_stock):
        stage()
    finished_at = datetime.now()
    print(finished_at - started_at)

    # Incremental update, currently switched off:
    # update_the_dataset()