In [14]:
import pandas as pd
import numpy as np
import math
import datetime as dt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU

In [15]:
def randomForest():
    # Import dataset
    #The code reads a csv file named "SP50.csv" using pandas and stores it in a dataframe called "bist100".
    bist100 = pd.read_csv("/content/drive/MyDrive/dp/SP500.csv")
    # Rename columns
    bist100.rename(columns={"Date": "date", "Open": "open", "High": "high", "Low": "low", "Close": "close"},
                   inplace=True)
    # Checking null value
    bist100.isnull().sum()
    # Checking na value
    bist100.isna().any()
    # The code drops any rows with null values using the "dropna()" method .
    bist100.dropna(inplace=True)
    #The code is used to check if there are any na values in the dataframe after dropping the null values.
    bist100.isna().any()
    # convert date field from string to Date format and make it index
    bist100['date'] = pd.to_datetime(bist100.date)
    # sorting dataset by date format
    bist100.sort_values(by='date', inplace=True)
    print("*******************************************************************************************")
    # Get the duration of dataset
    # The time of the first bar of data
    print("Starting date: ", bist100.iloc[0][0])
    # Time of the last piece of data
    print("Ending date: ", bist100.iloc[-1][0])
    #duration
    print("Duration: ", bist100.iloc[-1][0] - bist100.iloc[0][0])
    # Monthwise High and Low stock price
    bist100.groupby(bist100['date'].dt.strftime('%B'))['low'].min()
    #Keep close date data
    closedf = bist100[['date', 'close']]
    #Make a copy of the data for easy use
    close_stock = closedf.copy()
    #Delete date, leaving only close
    del closedf['date']
    # Maximum minimization normalization
    scaler = MinMaxScaler(feature_range=(0, 1))
    closedf = scaler.fit_transform(np.array(closedf))
    # Training data 0.65 Test data 0.35
    training_size = int(len(closedf) * 0.65)
    test_size = len(closedf) - training_size
    # Divide the data set according to the index divided above
    train_data, test_data = closedf[0:test_size, :], closedf[test_size:len(closedf), :]
    # Divide the data set according to the time window
    # Using two weeks' worth of data to predict one day's worth of data
    def create_dataset(dataset, time_step=5):
        dataX, dataY = [], []
        for i in range(len(dataset) - time_step - 1):
            a = dataset[i:(i + time_step), 0]  ###i=0, 0,1,2,3------15
            dataX.append(a)
            dataY.append(dataset[i + time_step, 0])
        return np.array(dataX), np.array(dataY)

    # Using two weeks' worth of data to predict one day's worth of data
    time_step = 10
    X_train, y_train = create_dataset(train_data, time_step)
    X_test, y_test = create_dataset(test_data, time_step)

    from sklearn.ensemble import RandomForestRegressor
    # Build a random forest model
    regressor = RandomForestRegressor(max_depth=1)
    # Training model
    regressor.fit(X_train, y_train)
    # Lets Do the prediction
    train_predict = regressor.predict(X_train)
    test_predict = regressor.predict(X_test)
    train_predict = train_predict.reshape(-1, 1)
    test_predict = test_predict.reshape(-1, 1)
    # From maximum to minimum normalization to its original form
    train_predict = scaler.inverse_transform(train_predict)
    test_predict = scaler.inverse_transform(test_predict)
    original_ytrain = scaler.inverse_transform(y_train.reshape(-1, 1))
    original_ytest = scaler.inverse_transform(y_test.reshape(-1, 1))
    # Evaluation metrices MSE
    print("Random forest MSE: ", mean_squared_error(original_ytest, test_predict))
    print("*******************************************************************************************\n\n\n\n")


In [16]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import json
import time
import math
from sklearn.ensemble import RandomForestClassifier as RFC
import numpy as np
import pandas as pd
import datetime as dt
from numpy import newaxis
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

from math import pi,sqrt,exp,pow,log
from numpy.linalg import det, inv
from abc import ABCMeta, abstractmethod
from sklearn import cluster

import statsmodels.api as sm
import scipy.stats as scs
import scipy.optimize as sco
import scipy.interpolate as sci
from scipy import stats

def singleInputLSTM():# Both the training set and the test set are one-dimensional data, the stock price
    # The training set size is 0.65
    split = (0.65)
    # The predicted sequence length is 10
    sequence_length = 10;
    # Data normalization
    normalise = True
    # Design the batch size of the neural network
    batch_size = 100;
    # The dimension of the input data is 5
    input_dim = 1
    # The length of input prediction data is 9, which means that the first 9 data are used to predict the 10th data
    input_timesteps = 9
    # Set the number of neurons for the neural network to 50
    neurons = 50
    # As the number of epochs increases, the number of weight update iterations increases, and the curve shifts from the initial unfitting state to the optimized fitting state
    epochs = 5
    # The number of predicted data at a time
    prediction_len = 1
    # dense layer Number of output data
    dense_output = 1
    # Sets the probability of data returning to zero
    drop_out = 0
    # Read the noiseless data for prediction
    dataframe = pd.read_csv("/content/drive/MyDrive/dp/source_price.csv")
    # Just use Adj Close for this column of elements
    cols = ['Adj Close']
    # Gets the number of rows in the table
    len_dataframe = dataframe.shape[0]
    # Gets the size of the test set and divides the data set
    i_split = int(len(dataframe) * split)
    data_train = dataframe.get(cols).values[:i_split]
    data_test = dataframe.get(cols).values[i_split:]
    len_train = len(data_train)
    len_test = len(data_test)
    len_train_windows = None

    # Add a test set to the sliding window
    data_windows = []
    for i in range(len_test - sequence_length):
        data_windows.append(data_test[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    # Save the initial test set for the final MSE calculation
    y_test_ori = data_windows[:, -1, [0]]
    # Create the data container required for maximum minimization
    window_data = data_windows
    # Statistics The number of sliding Windows
    win_num = window_data.shape[0]
    # Statistical column number
    col_num = window_data.shape[2]
    # Normalized data was recorded
    normalised_data = []
    # Record minimum values for calculation purposes
    record_min = []
    # Record the maximum value for calculation
    record_max = []
    # Maximum minimization
    for win_i in range(0, win_num):  # Normalization is carried out for each sliding window
        normalised_window = []
        for col_i in range(0, col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        # Normalized data was recorded
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)

    # normalised_data=window_data
    data_windows = normalised_data  # get_test_data
    # The normalized test set is obtained
    x_test = data_windows[:, :-1]
    y_test = data_windows[:, -1, [0]]

    # Adds a training set to the sliding window
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    # Create the data container required for maximum minimization
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    # Maximum minimization
    normalised_data = []
    for win_i in range(0, win_num):  # Normalization is carried out for each sliding window
        normalised_window = []
        for col_i in range(0, col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        # Normalized data was recorded
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)

    # normalised_data=window_data
    data_windows = normalised_data
    # The normalized test set is obtained
    x_train = data_windows[:, :-1]
    y_train = data_windows[:, -1, [0]]

    # Create the LSTM model
    model = Sequential()
    # Build the LSTM hierarchy
    model.add(LSTM(neurons, input_shape=(input_timesteps, input_dim), return_sequences=True))
    model.add(Dropout(drop_out))
    model.add(LSTM(neurons, return_sequences=True))
    model.add(LSTM(neurons, return_sequences=False))
    model.add(Dropout(drop_out))
    model.add(Dense(dense_output, activation='linear'))
    # Compile model
    model.compile(loss='mean_squared_error',
                  optimizer='adam')
    # Fit the model
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)
    # Use models to make predictions
    model.predict(x_test)

    # multi sequence predict
    data = x_test
    prediction_seqs = []
    # Each sliding window is predicted, and the predicted result of each sliding window is passed to the next sliding window
    window_size = sequence_length
    pre_win_num = int(len(data) / prediction_len)
    for i in range(0, pre_win_num):  # 
        curr_frame = data[i * prediction_len]
        predicted = []
        for j in range(0, prediction_len):
            # Use models to make predictions
            temp = model.predict(curr_frame[newaxis, :, :])[0]
            # Save the prediction
            predicted.append(temp)
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size - 2], predicted[-1], axis=0)
        prediction_seqs.append(predicted)
    # Initializes the container and parameters used for de-normalization
    de_predicted = []
    len_pre_win = int(len(data) / prediction_len)
    len_pre = prediction_len
    # De-normalize the predicted data, that is, remove the maximum and minimum value normalization
    m = 0
    for i in range(0, len_pre_win):
        for j in range(0, len_pre):
            de_predicted.append(prediction_seqs[i][j][0] * record_max[m] + record_min[m])
            m = m + 1
    # Initialize the container and parameters for calculating MSE
    error = []
    diff = y_test.shape[0] - prediction_len * pre_win_num
    # The error was calculated by comparing the original test set with the de-normalized forecast data
    for i in range(y_test_ori.shape[0] - diff):
        error.append(y_test_ori[i,] - de_predicted[i])
    # Calculate the MSE by error
    squaredError = []
    for val in error:
        squaredError.append(val * val)

    MSE = sum(squaredError) / len(squaredError)
    print("*****************************************************************************************************")
    print("LSTM-MSE:{}".format(MSE))
    print("*****************************************************************************************************")

In [17]:
#Experiment 1 Compare the effect of lstm and random forest prediction
randomForest()
singleInputLSTM()
#Conclusion 1: LSTM is stronger than random forest

*******************************************************************************************
Starting date:  2017-12-07 00:00:00
Ending date:  2018-06-01 00:00:00
Duration:  176 days 00:00:00
Random forest MSE:  2420.2892448760804
*******************************************************************************************




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
*****************************************************************************************************
LSTM-MSE:[2259.61468784]
*****************************************************************************************************


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import math
import datetime as dt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, GRU
def gridSearch():
    # Import dataset
    #The code reads a csv file named "SP500.csv" using pandas and stores it in a dataframe called "bist100".
    bist100 = pd.read_csv("/content/drive/MyDrive/dp/SP500.csv")
    # Rename columns
    bist100.rename(columns={"Date": "date", "Open": "open", "High": "high", "Low": "low", "Close": "close"},
                   inplace=True)
    # Checking null value
    bist100.isnull().sum()
    # Checking na value
    bist100.isna().any()
    # The code drops any rows with null values using the "dropna()" method .
    bist100.dropna(inplace=True)
    #The code is used to check if there are any na values in the dataframe after dropping the null values.
    bist100.isna().any()
    # convert date field from string to Date format and make it index
    bist100['date'] = pd.to_datetime(bist100.date)
    # sorting dataset by date format
    bist100.sort_values(by='date', inplace=True)
    # Get the duration of dataset
    # The time of the first bar of data

    # Monthwise High and Low stock price
    bist100.groupby(bist100['date'].dt.strftime('%B'))['low'].min()
    #Keep close date data
    closedf = bist100[['date', 'close']]
    #Make a copy of the data for easy use
    close_stock = closedf.copy()
    #Delete date, leaving only close
    del closedf['date']
    # Maximum minimization normalization
    scaler = MinMaxScaler(feature_range=(0, 1))
    closedf = scaler.fit_transform(np.array(closedf))
    # Training data 0.65 Test data 0.35
    
    training_size = int(len(closedf) * 0.65)
    test_size = len(closedf) - training_size
    # Divide the data set according to the index divided above
    train_data, test_data = closedf[0:test_size, :], closedf[test_size:len(closedf), :]
    # Divide the data set according to the time window
    # Using two weeks' worth of data to predict one day's worth of data
    def create_dataset(dataset, time_step=1):
        dataX, dataY = [], []
        for i in range(len(dataset) - time_step - 1):
            a = dataset[i:(i + time_step), 0]  ###i=0, 0,1,2,3------15
            dataX.append(a)
            dataY.append(dataset[i + time_step, 0])
        return np.array(dataX), np.array(dataY)

    # Using two weeks' worth of data to predict one day's worth of data
    time_step = 10
    X_train, y_train = create_dataset(train_data, time_step)
    X_test, y_test = create_dataset(test_data, time_step)

    from sklearn.ensemble import RandomForestRegressor
############################################Use grid search to find the best parameters###############################################################################################################
    #Iterate through all parameter values in a traversal manner
    param_test1 = [{'n_estimators':[50,120,160,200,250]},{'max_depth':[1,2,3,5,7,9,11,13]},{'min_samples_split':[100,120,150,180,200,300]}]
    #Create a grid search
    gsearch1 = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_test1, cv=5)
    #Use model fit data sets
    gsearch1.fit(X_train,y_train)
    print("*******************************************************************************************")
    print( "By gridSearch The best model is :",gsearch1.best_estimator_)#gridsearch.cv_results_Print the fitting results)
    print("*******************************************************************************************\n\n\n\n")
##########################################################The grid search is complete###############################################################################################################

In [19]:
def Opt_randomForest():
    # Import dataset
    #The code reads a csv file named "SP500.csv" using pandas and stores it in a dataframe called "bist100".
    bist100 = pd.read_csv("/content/drive/MyDrive/dp/SP500.csv")
    # Rename columns
    bist100.rename(columns={"Date": "date", "Open": "open", "High": "high", "Low": "low", "Close": "close"},
                   inplace=True)
    # Checking null value
    bist100.isnull().sum()
    # Checking na value
    bist100.isna().any()
    # The code drops any rows with null values using the "dropna()" method .
    bist100.dropna(inplace=True)
    #The code is used to check if there are any na values in the dataframe after dropping the null values.
    bist100.isna().any()
    # convert date field from string to Date format and make it index
    bist100['date'] = pd.to_datetime(bist100.date)
    # sorting dataset by date format
    bist100.sort_values(by='date', inplace=True)
    # Get the duration of dataset
    # The time of the first bar of data
    print("*******************************************************************************************")
    print("Starting date: ", bist100.iloc[0][0])
    # Time of the last piece of data
    print("Ending date: ", bist100.iloc[-1][0])
    #duration
    print("Duration: ", bist100.iloc[-1][0] - bist100.iloc[0][0])
    # Monthwise High and Low stock price
    bist100.groupby(bist100['date'].dt.strftime('%B'))['low'].min()
    #Keep close date data
    closedf = bist100[['date', 'close']]
    #Make a copy of the data for easy use
    close_stock = closedf.copy()
    #Delete date, leaving only close
    del closedf['date']
    # Maximum minimization normalization
    scaler = MinMaxScaler(feature_range=(0, 1))
    closedf = scaler.fit_transform(np.array(closedf))
    # Training data 0.65 Test data 0.35
    training_size = int(len(closedf) * 0.65)
    test_size = len(closedf) - training_size
    # Divide the data set according to the index divided above
    train_data, test_data = closedf[0:test_size, :], closedf[test_size:len(closedf), :]
    # Divide the data set according to the time window
    # Using two weeks' worth of data to predict one day's worth of data
    def create_dataset(dataset, time_step=5):
        dataX, dataY = [], []
        for i in range(len(dataset) - time_step - 1):
            a = dataset[i:(i + time_step), 0]  ###i=0, 0,1,2,3------15
            dataX.append(a)
            dataY.append(dataset[i + time_step, 0])
        return np.array(dataX), np.array(dataY)

    # Using two weeks' worth of data to predict one day's worth of data
    time_step = 10
    X_train, y_train = create_dataset(train_data, time_step)
    X_test, y_test = create_dataset(test_data, time_step)

    from sklearn.ensemble import RandomForestRegressor
    # Build a random forest model
    regressor = RandomForestRegressor(max_depth=13)
    # Training model
    regressor.fit(X_train, y_train)
    # Lets Do the prediction
    train_predict = regressor.predict(X_train)
    test_predict = regressor.predict(X_test)
    train_predict = train_predict.reshape(-1, 1)
    test_predict = test_predict.reshape(-1, 1)
    # From maximum to minimum normalization to its original form
    train_predict = scaler.inverse_transform(train_predict)
    test_predict = scaler.inverse_transform(test_predict)
    original_ytrain = scaler.inverse_transform(y_train.reshape(-1, 1))
    original_ytest = scaler.inverse_transform(y_test.reshape(-1, 1))
    # Evaluation metrices MSE
    print("Random forest algorithm after grid search optimization MSE: ", mean_squared_error(original_ytest, test_predict))
    print("*******************************************************************************************\n\n\n\n")

In [20]:
gridSearch()

*******************************************************************************************
By gridSearch The best model is : RandomForestRegressor(max_depth=13)
*******************************************************************************************






In [21]:
#Common stochastic forest models predict performance
randomForest()

#Test the predictive performance of the best model, when(max_depth=11)
Opt_randomForest()

#The second conclusion is that gridded search can help provide the effect of machine learning model

*******************************************************************************************
Starting date:  2017-12-07 00:00:00
Ending date:  2018-06-01 00:00:00
Duration:  176 days 00:00:00
Random forest MSE:  2536.899516624479
*******************************************************************************************




*******************************************************************************************
Starting date:  2017-12-07 00:00:00
Ending date:  2018-06-01 00:00:00
Duration:  176 days 00:00:00
Random forest algorithm after grid search optimization MSE:  1684.5183216721784
*******************************************************************************************






In [22]:
import json
import time
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from numpy import newaxis
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from math import pi,sqrt,exp,pow,log
from numpy.linalg import det, inv
from abc import ABCMeta, abstractmethod
from sklearn import cluster
import statsmodels.api as sm 
import scipy.stats as scs
import scipy.optimize as sco
import scipy.interpolate as sci
from scipy import stats
#In order to reduce the interference of false information and increase the robustness of data, Gaussian noise is added as a training set
def add_noise():
  #Read stock price information with emotional information assignment
  df = pd.read_csv("/content/drive/MyDrive/dp/source_price.csv")

  #Calculate the variance of the sentiment analysis value for each company
  wsj_var=np.var(df.wsj_mean_compound)
  cnbc_var=np.var(df.cnbc_mean_compound)
  fortune_var=np.var(df.fortune_mean_compound)
  reuters_var=np.var(df.reuters_mean_compound)

  # The mean of the initialization probability distribution is 0
  mu=0
  # Calculate the standard deviation of the probability distribution of sentiment analysis values for each company
  sigma_wsj=0.1*wsj_var
  sigma_cnbc=0.1*cnbc_var
  sigma_fortune=0.1*fortune_var
  sigma_reuters=0.1*reuters_var
  # shape[0] means the number of rows in the matrix
  n=df.shape[0]
  # Create a new table
  df_noise=pd.DataFrame()
  # Specify the source of data for each column
  df_noise['wsj_noise']=df['wsj_mean_compound']
  df_noise['cnbc_noise']=df['cnbc_mean_compound']
  df_noise['fortune_noise']=df['fortune_mean_compound']
  df_noise['reuters_noise']=df['reuters_mean_compound']
  # Add noise to each column of data
  for i in range(0,n):
    df_noise['wsj_noise'][i]+=np.random.normal(mu,sigma_wsj)
    df_noise['cnbc_noise'][i]+=np.random.normal(mu,sigma_cnbc)
    df_noise['fortune_noise'][i]+=np.random.normal(mu,sigma_fortune)
    df_noise['reuters_noise'][i]+=np.random.normal(mu,sigma_reuters)
  # Save the data you just processed to a file
  df_noise.to_csv("/content/drive/MyDrive/dp/source_price_noise.csv")
  print("*****************************************************************************************************")
  print("Gaussian noise was successfully added to the data set")
  print("*****************************************************************************************************")

In [23]:

import json
import time
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from numpy import newaxis
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

from math import pi,sqrt,exp,pow,log
from numpy.linalg import det, inv
from abc import ABCMeta, abstractmethod
from sklearn import cluster

import statsmodels.api as sm
import scipy.stats as scs
import scipy.optimize as sco
import scipy.interpolate as sci
from scipy import stats
def twoWeekSentimentLSTM():#Training set: Gaussian noise was added to each company, and four tables were aggregated together as the training set test set: the original stock price with sentiment analysis without noise added
    # Read and process generated noise data and raw data
    df = pd.read_csv("/content/drive/MyDrive/dp/source_price.csv")
    dfn = pd.read_csv("/content/drive/MyDrive/dp/source_price_noise.csv", index_col=0)

    # Create a data table for each company. Create four tables in total

    # df_1n indicates adding noise to wsj
    df_1n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_1n['wsj'] = dfn['wsj_noise']
    df_1n['cnbc'] = df['cnbc_mean_compound']
    df_1n['fortune'] = df['fortune_mean_compound']
    df_1n['reuters'] = df['reuters_mean_compound']
    df_1n['price'] = df['Adj Close']

    # df_2n represents the cnbc company to increase noise
    df_2n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_2n['wsj'] = df['wsj_mean_compound']
    df_2n['cnbc'] = dfn['cnbc_noise']
    df_2n['fortune'] = df['fortune_mean_compound']
    df_2n['reuters'] = df['reuters_mean_compound']
    df_2n['price'] = df['Adj Close']

    # df_3n represents fortune to increase noise
    df_3n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_3n['wsj'] = df['wsj_mean_compound']
    df_3n['cnbc'] = df['cnbc_mean_compound']
    df_3n['fortune'] = dfn['fortune_noise']
    df_3n['reuters'] = df['reuters_mean_compound']
    df_3n['price'] = df['Adj Close']

    # df_4n represents the reuters company to increase noise
    df_4n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_4n['wsj'] = df['wsj_mean_compound']
    df_4n['cnbc'] = df['cnbc_mean_compound']
    df_4n['fortune'] = df['fortune_mean_compound']
    df_4n['reuters'] = dfn['reuters_noise']
    df_4n['price'] = df['Adj Close']

    # Copy data table
    df1 = df_1n
    df2 = df_2n
    df3 = df_3n
    df4 = df_4n

    # The training set size is 0.65
    split = (0.65)
    # The predicted sequence length is 10
    sequence_length = 10;
    # Data normalization
    normalise = True
    # Design the batch size of the neural network
    batch_size = 100;
    # The dimension of the input data is 5
    input_dim = 5
    # The length of input prediction data is 9, which means that the first 9 data are used to predict the 10th data
    input_timesteps = 9
    # Set the number of neurons for the neural network to 50
    neurons = 50
    # As the number of epochs increases, the number of weight update iterations increases, and the curve shifts from the initial unfitting state to the optimized fitting state
    epochs = 5
    # The number of predicted data at a time
    prediction_len = 1
    # dense layer Number of output data
    dense_output = 1
    # Sets the probability of data returning to zero
    drop_out = 0
    # Calculate the size of the partition training set
    i_split = int(len(df1) * split)
    # Select the columns to use
    cols = ['price', 'wsj', 'cnbc', 'fortune', 'reuters']
    # Partition training data
    data_train_1 = df1.get(cols).values[:i_split]
    data_train_2 = df2.get(cols).values[:i_split]
    data_train_3 = df3.get(cols).values[:i_split]
    data_train_4 = df4.get(cols).values[:i_split]
    # Gets the data set length
    len_train = len(data_train_1)
    len_train_windows = None
    # Adds a training set to the sliding window
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_1[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    # Set the parameters of the sliding window
    window_data = data_windows
    # Sets the number of sliding Windows
    win_num = window_data.shape[0]
    # Sets the number of columns
    col_num = window_data.shape[2]
    # Create the data container needed to record the maximum and minimum normalized data
    normalised_data = []
    # Record minimum values for calculation purposes
    record_min = []
    # Record the maximum value for calculation
    record_max = []
    # Carry out maximum minimization normalization
    for win_i in range(0, win_num):# Each sliding window is normalized
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        #Normalized data was recorded
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    # Get wsj company added noise training set
    x_train1 = data_windows[:, :-1]
    y_train1 = data_windows[:, -1, [0]]

    ##################################################################################################The following is repeated with the ordinary LSTM######################################################################################################################
    # get_train_data 
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_2[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    x_train2 = data_windows[:, :-1]
    y_train2 = data_windows[:, -1, [0]]

    # get_train_data 
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_3[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    x_train3 = data_windows[:, :-1]
    y_train3 = data_windows[:, -1, [0]]

    # get_train_data 
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_4[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    x_train4 = data_windows[:, :-1]
    y_train4 = data_windows[:, -1, [0]]

    # Put four data tables together
    x_train_t = np.concatenate((x_train1, x_train2, x_train3, x_train4), axis=0)
    # Copy the data for subsequent operations
    x_train = x_train_t
    # Put four data tables together
    y_train_t = np.concatenate((y_train1, y_train2, y_train3, y_train4), axis=0)
    # Copy the data for subsequent operations
    y_train = y_train_t

    # Read the noiseless data for prediction
    dataframe = pd.read_csv("/content/drive/MyDrive/dp/source_price.csv")
    # According to the data in these six columns
    dataframe.columns = ['date', 'wsj', 'cnbc', 'fortune', 'reuters', 'price']
    cols = ['price', 'wsj', 'cnbc', 'fortune', 'reuters']
    # Gets the number of rows in the table
    len_dataframe = dataframe.shape[0]
    # Gets the size of the test set and divides the data set
    i_split = int(len(dataframe) * split)
    data_test = dataframe.get(cols).values[i_split:]
    # Calculates the length of the test set
    len_test = len(data_test)
    len_train_windows = None

    # Add a test set to the sliding window
    data_windows = []
    for i in range(len_test - sequence_length):
        data_windows.append(data_test[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    # Save the initial test set for the final MSE calculation
    y_test_ori = data_windows[:, -1, [0]]
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []

    # normalize
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)

    # normalised_data=window_data
    data_windows = normalised_data  # get_test_data
    x_test = data_windows[:, :-1]
    y_test = data_windows[:, -1, [0]]

    # Create the LSTM model
    model = Sequential()
    # Build the LSTM hierarchy
    model.add(LSTM(neurons, input_shape=(input_timesteps, input_dim), return_sequences=True))
    model.add(Dropout(drop_out))
    model.add(LSTM(neurons, return_sequences=True))
    model.add(LSTM(neurons, return_sequences=False))
    model.add(Dropout(drop_out))
    model.add(Dense(dense_output, activation='linear'))
    # Compile model
    model.compile(loss='mean_squared_error',
                  optimizer='adam')
    # Fit the model
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

    # Initializes the container and parameter structure used by the prediction
    data = x_test
    prediction_seqs = []
    window_size = sequence_length
    pre_win_num = int(len(data) / prediction_len)
    #multi sequence predict
    for i in range(0, pre_win_num):#For each sliding window, make a prediction and add the prediction results to the next sliding window
        curr_frame = data[i * prediction_len]
        predicted = []
        for j in range(0, prediction_len):
            temp = model.predict(curr_frame[newaxis, :, :])[0]
            # Record the predicted results
            predicted.append(temp)
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size - 2], predicted[-1], axis=0)
        prediction_seqs.append(predicted)

    # Initializes the container and parameters used for de-normalization
    de_predicted = []
    len_pre_win = int(len(data) / prediction_len)
    len_pre = prediction_len
    # De-normalize the predicted data, that is, remove the maximum and minimum value normalization
    m = 0
    for i in range(0, len_pre_win):
        for j in range(0, len_pre):
            de_predicted.append(prediction_seqs[i][j][0] * record_max[m] + record_min[m])
            m = m + 1
    # Initialize the container and parameters for calculating MSE
    error = []
    diff = y_test.shape[0] - prediction_len * pre_win_num
    # The error was calculated by comparing the original test set with the de-normalized forecast data
    for i in range(y_test_ori.shape[0] - diff):
        error.append(y_test_ori[i,] - de_predicted[i])
    # Calculate the MSE by error
    squaredError = []
    absError = []
    for val in error:
        squaredError.append(val * val)

    MSE = sum(squaredError) / len(squaredError)
    print("*****************************************************************************************************")
    print("Sentimental-LSTM MSE:{}".format(MSE))
    print("*****************************************************************************************************")

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
dd = pd.read_csv("/content/drive/MyDrive/dp/source_price.csv")
dd.head()
#For data sets with sentiment analysis, positive values represent positive evaluations and negative values represent negative evaluations

Unnamed: 0,date,wsj_mean_compound,cnbc_mean_compound,fortune_mean_compound,reuters_mean_compound,Adj Close
0,2017/12/7,0.296,-0.1366,0.0,0.0,2636.97998
1,2017/12/8,0.0,0.0,-0.2423,0.0,2651.5
2,2017/12/11,0.0,0.0,0.0,0.0,2659.98999
3,2017/12/12,0.0,0.0,0.0,0.0,2664.110107
4,2017/12/13,0.0,0.0,0.0,0.0,2662.850098


In [26]:
# Experiment 3 The optimized LSTM based on Sentiment analysis The optimized LSTM based on the stock price data with sentiment analysis of four financial news
add_noise()# Gaussian noise is added to the data set to reduce the interference of fake news as a training set

*****************************************************************************************************
Gaussian noise was successfully added to the data set
*****************************************************************************************************


In [27]:
singleInputLSTM()#Basic LSTM model
twoWeekSentimentLSTM()#LSTM model based on sentiment analysis
# Conclusion 3: The optimization of LSTM based on sentiment analysis can improve the prediction accuracy

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
*****************************************************************************************************
LSTM-MSE:[1823.25330162]
*****************************************************************************************************
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
*****************************************************************************************************
Sentimental-LSTM MSE:[829.96458568]
*****************************************************************************************************


In [28]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import json
import time
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from numpy import newaxis
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

from math import pi,sqrt,exp,pow,log
from numpy.linalg import det, inv
from abc import ABCMeta, abstractmethod
from sklearn import cluster

import statsmodels.api as sm
import scipy.stats as scs
import scipy.optimize as sco
import scipy.interpolate as sci
from scipy import stats
def oneWeekSentimentLSTM():#Training set: Gaussian noise was added to each company, and four tables were aggregated together as the training set test set: the original stock price with sentiment analysis without noise added
    # Read and process generated noise data and raw data
    df = pd.read_csv("/content/drive/MyDrive/dp/source_price.csv")
    dfn = pd.read_csv("/content/drive/MyDrive/dp/source_price_noise.csv", index_col=0)

    # Create a data table for each company

    # df_1n indicates adding noise to wsj
    df_1n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_1n['wsj'] = dfn['wsj_noise']
    df_1n['cnbc'] = df['cnbc_mean_compound']
    df_1n['fortune'] = df['fortune_mean_compound']
    df_1n['reuters'] = df['reuters_mean_compound']
    df_1n['price'] = df['Adj Close']

    # df_2n represents the cnbc company to increase noise
    df_2n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_2n['wsj'] = df['wsj_mean_compound']
    df_2n['cnbc'] = dfn['cnbc_noise']
    df_2n['fortune'] = df['fortune_mean_compound']
    df_2n['reuters'] = df['reuters_mean_compound']
    df_2n['price'] = df['Adj Close']

    # df_3n represents fortune to increase noise
    df_3n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_3n['wsj'] = df['wsj_mean_compound']
    df_3n['cnbc'] = df['cnbc_mean_compound']
    df_3n['fortune'] = dfn['fortune_noise']
    df_3n['reuters'] = df['reuters_mean_compound']
    df_3n['price'] = df['Adj Close']

    # df_4n represents the reuters company to increase noise
    df_4n = pd.DataFrame()
    # Specifies the source of data for each column of the data table
    df_4n['wsj'] = df['wsj_mean_compound']
    df_4n['cnbc'] = df['cnbc_mean_compound']
    df_4n['fortune'] = df['fortune_mean_compound']
    df_4n['reuters'] = dfn['reuters_noise']
    df_4n['price'] = df['Adj Close']

    # Copy data table
    df1 = df_1n
    df2 = df_2n
    df3 = df_3n
    df4 = df_4n

    # The training set size is 0.65
    split = (0.65)
    ##########################################################################sequence_length changed to 5 and input_timesteps changed to 4, changing to a sliding window of one week, using the first four data to predict the fifth data#######################################################################################
    # The predicted sequence length is 5
    sequence_length = 5;
    # Data normalization
    normalise = True
    # Design the batch size of the neural network
    batch_size = 100;
    # The dimension of the input data is 5
    input_dim = 5
    # The length of input prediction data is 4, which means that the first 4 data are used to predict the fifth data
    input_timesteps = 4
    ############################################################################################################################################################################################################################################################
    # Set the number of neurons for the neural network to 50
    neurons = 50
    # As the number of epochs increases, the number of weight update iterations increases, and the curve shifts from the initial unfitting state to the optimized fitting state
    epochs = 5
    # The number of predicted data at a time
    prediction_len = 1
    # dense layer Number of output data
    dense_output = 1
    # Sets the probability of data returning to zero
    drop_out = 0

    # Calculate the size of the partition training set
    i_split = int(len(df1) * split)
    # Select the columns to use
    cols = ['price', 'wsj', 'cnbc', 'fortune', 'reuters']
    # Partition training data
    data_train_1 = df1.get(cols).values[:i_split]
    data_train_2 = df2.get(cols).values[:i_split]
    data_train_3 = df3.get(cols).values[:i_split]
    data_train_4 = df4.get(cols).values[:i_split]
    # Gets the data set length
    len_train = len(data_train_1)
    len_train_windows = None
    # Adds a training set to the sliding window
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_1[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    # Set the parameters of the sliding window
    window_data = data_windows
    # Sets the number of sliding Windows
    win_num = window_data.shape[0]
    # Sets the number of columns
    col_num = window_data.shape[2]
    # Create the data container needed to record the maximum and minimum normalized data
    normalised_data = []
    # Record minimum values for calculation purposes
    record_min = []
    # Record the maximum value for calculation
    record_max = []
    # Carry out maximum minimization normalization
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    # Get wsj company added noise training set
    x_train1 = data_windows[:, :-1]
    y_train1 = data_windows[:, -1, [0]]


    # get_train_data 
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_2[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    x_train2 = data_windows[:, :-1]
    y_train2 = data_windows[:, -1, [0]]

    # get_train_data 
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_3[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    x_train3 = data_windows[:, :-1]
    y_train3 = data_windows[:, -1, [0]]

    # get_train_data 
    data_windows = []
    for i in range(len_train - sequence_length):
        data_windows.append(data_train_4[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)
    # normalised_data=window_data
    data_windows = normalised_data
    x_train4 = data_windows[:, :-1]
    y_train4 = data_windows[:, -1, [0]]

    # Put four data tables together
    x_train_t = np.concatenate((x_train1, x_train2, x_train3, x_train4), axis=0)
    # Copy the data for subsequent operations
    x_train = x_train_t
    # Put four data tables together
    y_train_t = np.concatenate((y_train1, y_train2, y_train3, y_train4), axis=0)
    # Copy the data for subsequent operations
    y_train = y_train_t

    # Read the noiseless data for prediction
    dataframe = pd.read_csv("/content/drive/MyDrive/dp/source_price.csv")
    # According to the data in these six columns
    dataframe.columns = ['date', 'wsj', 'cnbc', 'fortune', 'reuters', 'price']
    cols = ['price', 'wsj', 'cnbc', 'fortune', 'reuters']
    # Gets the number of rows in the table
    len_dataframe = dataframe.shape[0]
    # Gets the size of the test set and divides the data set
    i_split = int(len(dataframe) * split)
    data_test = dataframe.get(cols).values[i_split:]
    # Calculates the length of the test set
    len_test = len(data_test)
    len_train_windows = None

    # Add a test set to the sliding window
    data_windows = []
    for i in range(len_test - sequence_length):
        data_windows.append(data_test[i:i + sequence_length])
    data_windows = np.array(data_windows).astype(float)
    # Save the initial test set for the final MSE calculation
    y_test_ori = data_windows[:, -1, [0]]

    window_data = data_windows
    win_num = window_data.shape[0]
    col_num = window_data.shape[2]
    normalised_data = []
    record_min = []
    record_max = []

    # normalize
    for win_i in range(0, win_num):
        normalised_window = []
        for col_i in range(0, 1):  # col_num):
            temp_col = window_data[win_i, :, col_i]
            temp_min = min(temp_col)
            if col_i == 0:
                record_min.append(temp_min)  # record min
            temp_col = temp_col - temp_min
            temp_max = max(temp_col)
            if col_i == 0:
                record_max.append(temp_max)  # record max
            temp_col = temp_col / temp_max
            normalised_window.append(temp_col)
        for col_i in range(1, col_num):
            temp_col = window_data[win_i, :, col_i]
            normalised_window.append(temp_col)
        normalised_window = np.array(normalised_window).T
        normalised_data.append(normalised_window)
    normalised_data = np.array(normalised_data)

    # normalised_data=window_data
    data_windows = normalised_data  # get_test_data
    x_test = data_windows[:, :-1]
    y_test = data_windows[:, -1, [0]]
    # Create the LSTM model
    model = Sequential()
    # Build the LSTM hierarchy
    model.add(LSTM(neurons, input_shape=(input_timesteps, input_dim), return_sequences=True))
    model.add(Dropout(drop_out))
    model.add(LSTM(neurons, return_sequences=True))
    model.add(LSTM(neurons, return_sequences=False))
    model.add(Dropout(drop_out))
    model.add(Dense(dense_output, activation='linear'))
    # Compile model
    model.compile(loss='mean_squared_error',
                  optimizer='adam')
    # Fit the model
    model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size)

    # Initializes the container and parameters used by the prediction
    data = x_test
    prediction_seqs = []
    window_size = sequence_length
    pre_win_num = int(len(data) / prediction_len)
    #multi sequence predict
    for i in range(0, pre_win_num):#For each sliding window, make a prediction and add the prediction results to the next sliding window
        curr_frame = data[i * prediction_len]
        predicted = []
        for j in range(0, prediction_len):
            temp = model.predict(curr_frame[newaxis, :, :])[0]
            # Record the predicted results and add them to the next slide window
            predicted.append(temp)
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size - 2], predicted[-1], axis=0)
        prediction_seqs.append(predicted)

    # Initializes the container and parameters used for de-normalization
    de_predicted = []
    len_pre_win = int(len(data) / prediction_len)
    len_pre = prediction_len
    # De-normalize the predicted data, that is, remove the maximum and minimum value normalization
    m = 0
    for i in range(0, len_pre_win):
        for j in range(0, len_pre):
            de_predicted.append(prediction_seqs[i][j][0] * record_max[m] + record_min[m])
            m = m + 1
    # Initialize the container and parameters for calculating MSE
    error = []
    diff = y_test.shape[0] - prediction_len * pre_win_num
    # The error was calculated by comparing the original test set with the de-normalized forecast data
    for i in range(y_test_ori.shape[0] - diff):
        error.append(y_test_ori[i,] - de_predicted[i])
    # Calculate the MSE by error
    squaredError = []
    absError = []
    for val in error:
        squaredError.append(val * val)

    MSE = sum(squaredError) / len(squaredError)
    print("*****************************************************************************************************")
    print("one week Sentimental-LSTM MSE:{}".format(MSE))
    print("*****************************************************************************************************")

# 新段落

In [29]:
# Experiment 4 Change the sliding window from two weeks to one week to see the predicted effect
twoWeekSentimentLSTM()
oneWeekSentimentLSTM()
# Conclusion 4, a week for sliding window can achieve better results

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
*****************************************************************************************************
Sentimental-LSTM MSE:[864.14441897]
*****************************************************************************************************
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
*****************************************************************************************************
one week Sentimental-LSTM MSE:[582.97939972]
*****************************************************************************************************
