# In [None]:
# -*- coding: utf-8 -*-

'''
#################### CONNECT TO THE FOLDER ####################
'''
def connect_to_the_folder(method):
    """Change the working directory to the project folder.

    Args:
        method: ``'google'`` mounts Google Drive through ``google.colab`` and
            switches to the project folder on the drive; ``'local'`` switches
            to the project folder on the local ``c:`` drive.

    Raises:
        ValueError: If ``method`` is neither ``'google'`` nor ``'local'``.
    """
    import os
    if method == 'google':
        # Kept local to this branch so the 'local' path never needs the
        # google.colab package to be importable.
        from google.colab import drive
        drive.mount('/content/drive/', force_remount=True)
        os.chdir('/content/drive/My Drive/Data_Technology/DT_Learning/III_Big_Data/BDSE21_Team1/Final_Project/')
    elif method == 'local':
        # Backslashes are escaped explicitly; the previous literal depended on
        # invalid escape sequences (\S, \B, \F) being passed through verbatim.
        os.chdir('c:\\Shared\\BDSE_Team1\\Final_Project\\')
    else:
        # Fail loudly instead of silently staying in the wrong directory.
        raise ValueError(
            f"Unknown method {method!r}; expected 'google' or 'local'."
        )

'''
#################### GET FIRST N POPULAR CODE ####################
'''
def get_first_n_popular_code(n):
    """Return the ``n`` most frequently mentioned codes in the yearly news files.

    Reads ``Dataset_A/News_Anue_Arrange/News_Anue_Arrange_<year>.csv``
    (relative to the current working directory) for every year from 2016
    through the current calendar year and counts how often each value of the
    ``code`` column occurs.

    Args:
        n: Number of leading rows to keep after sorting by frequency.

    Returns:
        A DataFrame with the columns ``code`` and ``size`` (occurrence count),
        sorted by ``size`` in descending order with a fresh 0-based index,
        truncated to the first ``n`` rows.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when one of the
            yearly files does not exist.
    """
    import pandas as pd
    import datetime

    end_year = datetime.datetime.now().year

    codes = []
    # A plain range yields the same years (2016..current year) as the former
    # date_range(..., freq='Y') call without relying on a deprecated alias.
    for year in range(2016, end_year + 1):
        filename = f'Dataset_A/News_Anue_Arrange/News_Anue_Arrange_{year}.csv'
        df_read = pd.read_csv(filename, index_col=0, dtype=str)
        # str() keeps the previous behaviour of counting missing values as 'nan'.
        codes.extend(str(code) for code in df_read['code'])

    # Naming the column up front also keeps the empty case well-defined.
    df = pd.DataFrame({'code': codes})
    df = df.groupby('code', as_index=False).size()
    df = df.sort_values(by='size', ascending=False)
    df = df.reset_index(drop=True)
    return df[:n]


'''
#################### TRAINING MODEL - RNN ####################
'''
def create_training_data_all(n, y_label, start_year, day):
    """Build one shuffled training table from the ``n`` most popular codes.

    For each code returned by ``get_first_n_popular_code(n)`` the file
    ``Dataset_A/Training_Data_Stock/<code>_training.csv`` is read, restricted
    to rows whose index is greater than ``start_year``, stripped of rows with
    missing values, and reduced to the model columns.

    Args:
        n: Number of popular codes to include.
        y_label: Name fragment used in the lag and target column names
            (e.g. ``'close'``).
        start_year: Lower bound compared against the CSV index with ``>``.
            NOTE(review): presumably the index holds date strings so that a
            value like ``'2011'`` compares lexicographically - confirm.
        day: Prediction horizon used in the target column name.

    Returns:
        A DataFrame with ten feature columns followed by the target column
        ``y_label_<y_label>_<day>days_after``, rows in random order.
    """
    import pandas as pd

    codes = get_first_n_popular_code(n)['code']

    # OHLC prices, five lagged values of the label, the NLP score, then the
    # target as the last column (the trainer treats the last column as y).
    columns = (
        ['open', 'high', 'low', 'close']
        + [f'{lag}days_before_{y_label}' for lag in range(1, 6)]
        + ['nlp_lstm', f'y_label_{y_label}_{day}days_after']
    )

    frames = []
    for code in codes:
        filename = f'Dataset_A/Training_Data_Stock/{str(code)}_training.csv'
        df = pd.read_csv(filename, index_col=0)
        df = df[df.index > start_year]
        # Drop incomplete rows before narrowing the columns, as before.
        df = df.dropna()
        frames.append(df[columns])

    # DataFrame.append no longer exists in pandas 2.x; a single concat also
    # avoids re-copying the accumulated table on every iteration.
    df_all = pd.concat(frames) if frames else pd.DataFrame([])
    return df_all.sample(frac=1)

def train_each_model_rnn(df, split, epochs, filename):
    """Train one LSTM regressor on ``df`` and save the model, plots and history.

    Every column of ``df`` except the last is used as an input step and the
    last column is the regression target. All values are scaled together with
    a single ``MinMaxScaler`` (one shared min/max), which is also what allows
    predictions to be mapped back with the same scaler afterwards.

    Args:
        df: Table of numeric values; the last column is the target.
        split: Fraction of rows (taken from the top) used for training; the
            remainder is used for the prediction plot.
        epochs: Number of training epochs.
        filename: Output path prefix. The function writes ``<filename>.h5``,
            ``<filename>_pred.png``, ``<filename>.csv`` and
            ``<filename>_detail.png``.

    Returns:
        None. Results are written to disk and the prediction plot is shown.
    """
    import math
    import os

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from keras.layers import LSTM, Dense
    from keras.models import Sequential
    from sklearn.preprocessing import MinMaxScaler

    # Make sure the output folder exists before spending time on training.
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    n_columns = len(df.columns)
    n_features = n_columns - 1  # last column is the target

    # Scale all values as one long column so features and target share a
    # single min/max, then fold back into (rows, columns, 1).
    flat_values = df.values.astype(float).reshape(-1, 1)
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(flat_values)
    samples = scaled.reshape(-1, n_columns, 1).astype('float64')
    x = samples[:, :-1]
    y = samples[:, -1]

    # Split into train and test sets by row position.
    split_boundary = int(samples.shape[0] * split)
    train_x = x[:split_boundary]
    test_x = x[split_boundary:]
    train_y = y[:split_boundary]
    test_y = y[split_boundary:]

    # Train. The input length follows the data instead of a fixed 10 so the
    # model still fits when the number of feature columns changes.
    model = Sequential()
    model.add(LSTM(batch_size=None,
                   input_shape=(n_features, 1),
                   units=256,
                   unroll=False))
    model.add(Dense(units=1))
    # NOTE(review): "accuracy" is of limited meaning with an MSE loss; it is
    # kept because the saved history below reads the accuracy keys.
    model.compile(loss="mse", optimizer="adam", metrics=["accuracy"])
    history = model.fit(train_x, train_y, batch_size=100, epochs=epochs,
                        validation_split=0.1, verbose=2)

    # Save model
    model.save(filename + '.h5')

    # Validate: map predictions and targets back to the original value range.
    predict = model.predict(test_x)
    predict = scaler.inverse_transform(np.reshape(predict, (-1, 1)))
    real_y = scaler.inverse_transform(test_y)

    plt.plot(real_y, 'r-', label='real')
    plt.plot(predict, 'b:', label='pred')
    # Use the per-line labels; an explicit label list in the wrong order
    # previously swapped the two names in the legend.
    plt.legend(loc='best')
    # Upper limit is based on the plotted (inverse-transformed) values and
    # rounded up to the next 0.1.
    plt.ylim(0, math.ceil(max(np.max(predict), np.max(real_y)) * 10) / 10)
    plt.savefig(filename + '_pred.png')
    plt.show()
    plt.close()

    # Training curves
    epochs_range = range(epochs)
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    # One row per metric (acc, val_acc, loss, val_loss), one column per epoch.
    pd.DataFrame([acc, val_acc, loss, val_loss]).to_csv(filename + '.csv')

    plt.figure(figsize=(16, 9))
    plt.suptitle(filename)

    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, acc, label='Training Accuracy')
    plt.plot(epochs_range, val_acc, label='Validation Accuracy')
    plt.legend(loc='upper left')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')

    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, val_loss, label='Validation Loss')
    plt.legend(loc='upper left')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')

    plt.savefig(filename + '_detail.png')
    plt.close()

def train_model_rnn(n, y_labels, start_year, days, split, epochs):
    """Train and save one RNN model per (label, horizon) combination.

    For every label in ``y_labels`` and every horizon in ``days`` the training
    table is built with ``create_training_data_all`` and handed to
    ``train_each_model_rnn``; the output prefix is printed once a model is done.
    """
    # Enumerate the combinations label-major, i.e. all horizons for the first
    # label before moving on to the next label.
    combinations = [(y_label, day) for y_label in y_labels for day in days]

    for y_label, day in combinations:
        save_prefix = f'Model/Model_Save/rnn_5_nlp/rnn_nlp_{y_label}_{day}_{epochs}'
        training_table = create_training_data_all(n, y_label, start_year, day)
        train_each_model_rnn(training_table, split, epochs, save_prefix)
        print(save_prefix, '--- complete ---')

if __name__ == '__main__':
    # Select the working directory first: 'google' or 'local'.
    connect_to_the_folder('google')

    # Run configuration. The commented-out variant of this script also listed
    # 'increase' as a label next to 'close'.
    n = 50
    start_year = '2011'
    split = 0.8
    epochs = 300
    y_labels = ['close']
    days = [1, 2, 3, 4, 5]

    train_model_rnn(
        n=n,
        y_labels=y_labels,
        start_year=start_year,
        days=days,
        split=split,
        epochs=epochs,
    )
