In [5]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [6]:
import numpy as np
import pandas as pd
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
# Reading the saved data pickle file
df_stocks = pd.read_pickle('F:/STAT3011/pickled_ten_year_filtered_lead_para.pkl')

df_stocks['prices'] = df_stocks['adj close'].apply(np.int64)

# selecting the prices and articles
df_stocks = df_stocks[['prices', 'articles']]

df_stocks['articles'] = df_stocks['articles'].map(lambda x: x.lstrip('.-'))

df = df_stocks[['prices']].copy()

In [8]:

import nltk
nltk.download('vader_lexicon')
import unicodedata

sid = SentimentIntensityAnalyzer()

# Ensure df_stocks is your DataFrame with an 'articles' column
# Initialize new columns for sentiment scores
df_stocks['compound'] = 0.0
df_stocks['neg'] = 0.0
df_stocks['neu'] = 0.0
df_stocks['pos'] = 0.0

for date, row in df_stocks.iterrows():
    try:
        # Normalize and encode the article text
        sentence = unicodedata.normalize('NFKD', row['articles']).encode('ascii', 'ignore').decode('utf-8')
        ss = sid.polarity_scores(sentence)
        df_stocks.at[date, 'compound'] = ss['compound']
        df_stocks.at[date, 'neg'] = ss['neg']
        df_stocks.at[date, 'neu'] = ss['neu']
        df_stocks.at[date, 'pos'] = ss['pos']
    except TypeError as e:
        print(row['articles'])
        print(date)
        print(e)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


NameError: name 'SentimentIntensityAnalyzer' is not defined

In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Selecting prices and compound sentiment scores for simplicity; you can choose more
features = df_stocks[['prices', 'compound']].values

# Scaling features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_features = scaler.fit_transform(features)


In [None]:
def create_sequences(data, time_step=100):
    X, y = [], []
    for i in range(len(data) - time_step - 1):
        X.append(data[i:(i + time_step), :])
        y.append(data[i + time_step, 0])  # Assuming 'prices' is at index 0
    return np.array(X), np.array(y)

time_step = 100
X, y = create_sequences(scaled_features, time_step)


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from keras_self_attention import SeqSelfAttention

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    SeqSelfAttention(attention_activation='sigmoid'),
    Dropout(0.2),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(25),
    Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Flatten

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Attention and Normalization
    attention = MultiHeadAttention(num_heads=num_heads, key_dim=head_size)(inputs, inputs)
    attention = Dropout(dropout)(attention)
    attention = LayerNormalization(epsilon=1e-6)(attention + inputs)
    
    # Feed Forward Part
    outputs = Dense(ff_dim, activation="relu")(attention)
    outputs = Dense(inputs.shape[-1])(outputs)
    outputs = Dropout(dropout)(outputs)
    outputs = LayerNormalization(epsilon=1e-6)(outputs + attention)
    
    return outputs

def build_transformer_model(time_steps, features, head_size=256, num_heads=4, ff_dim=4, num_transformer_blocks=4, mlp_units=[128], dropout=0.2, mlp_dropout=0.2):
    inputs = Input(shape=(time_steps, features))
    x = inputs
    
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    
    x = GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = Dense(dim, activation="relu")(x)
        x = Dropout(mlp_dropout)(x)
    outputs = Dense(1)(x)
    
    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

# Assuming you have prepared your data as X_train, y_train, X_test, y_test
model = build_transformer_model(X_train.shape[1], X_train.shape[2])
model.summary()

# Train your model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)
