In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from keras import Sequential
from keras.layers import Input, SimpleRNN, Dropout, Dense
from tf.optimizers import Adam
from scikeras.wrappers import KerasRegressor

ModuleNotFoundError: No module named 'tf'

In [19]:
pip install scikeras

Collecting scikeras
  Downloading scikeras-0.11.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.11.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
df = pd.read_pickle('datasets/features-label-text-represented-subset.pkl')
df_copy = df.copy()

In [3]:
train_ids = pd.read_csv('train-test-ids/train-ids.csv')
test_ids = pd.read_csv('train-test-ids/test-ids.csv')

In [4]:
train_data = df_copy[df_copy['file_id'].isin(train_ids['file_id'].astype(str))]
test_data = df_copy[df_copy['file_id'].isin(test_ids['file_id'].astype(str))]

In [5]:
# split train data into train and validation sets
groups = train_data.groupby('CIK')

train_sub = []
val_data = []
random_seed = 42

for _, group in groups:
    if len(group) == 1:
        train_sub.append(group)
        continue
    if len(group) >= 7:
        test_size = 2
    else:
        test_size = 1

    train_group, val_group = train_test_split(group, test_size=test_size, random_state = random_seed)
    train_sub.append(train_group)
    val_data.append(val_group)

train_sub = pd.concat(train_sub)
val_data = pd.concat(val_data)

Model1.2 ROE_t, text_vector_t -> ROE_t+1

Model1.2 RNN

In [6]:
def plot_fit_loss(model_fit):
    plt.plot(model_fit.history['loss'], label='loss')
    plt.plot(model_fit.history['val_loss'], label='val_loss')
    plt.ylim([0.02, 0.05])
    plt.xlabel('Epoch')
    plt.ylabel('Error [ROE_t+1]')
    plt.legend()
    plt.grid(True)

In [7]:
# feature concatenation
# normalisation
train_sub_features_m1_2 = np.concatenate((np.array(train_sub['text_vector'].tolist()), train_sub['roe'].values.reshape(-1, 1)), axis=1)
train_sub_label_m1_2 = train_sub['roe_next_year'].values
val_features_m1_2 = np.concatenate((np.array(val_data['text_vector'].tolist()), val_data['roe'].values.reshape(-1, 1)), axis=1)
val_label_m1_2 = val_data['roe_next_year'].values
test_features_m1_2 = np.concatenate((np.array(test_data['text_vector'].tolist()), test_data['roe'].values.reshape(-1, 1)), axis=1)
test_label_m1_2 = test_data['roe_next_year'].values

train_sub_features_m1_2_norm = (train_sub_features_m1_2 - np.mean(train_sub_features_m1_2, axis=0)) / np.std(train_sub_features_m1_2, axis=0)
val_features_m1_2_norm = (val_features_m1_2 - np.mean(val_features_m1_2, axis=0)) / np.std(val_features_m1_2, axis=0)
test_features_m1_2_norm = (test_features_m1_2 - np.mean(test_features_m1_2, axis=0)) / np.std(test_features_m1_2, axis=0)

In [15]:
# Define RNN parameters
input_shape = (101, 1)

# Create the RNN model using tf.keras
model1_2_rnn = Sequential([
    Input(shape=input_shape),
    SimpleRNN(64), # default activation='tanh'
    Dropout(0.1),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(1)
])
model1_2_rnn.summary()

Model: "sequential_1168"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1168 (SimpleRNN)  (None, 64)               4224      
                                                                 
 dropout_2336 (Dropout)      (None, 64)                0         
                                                                 
 dense_2336 (Dense)          (None, 64)                4160      
                                                                 
 dropout_2337 (Dropout)      (None, 64)                0         
                                                                 
 dense_2337 (Dense)          (None, 1)                 65        
                                                                 
Total params: 8,449
Trainable params: 8,449
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Compile the model
model1_2_rnn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Train the model
batch_size = 128
epochs = 30

model1_2_rnn_fit = model1_2_rnn.fit(
    train_sub_features_m1_2_norm,
    train_sub_label_m1_2,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(val_features_m1_2_norm, val_label_m1_2)
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
plot_fit_loss(model1_2_rnn_fit)

In [12]:
# Evaluate the model on the test set
test_loss = model1_2_rnn.evaluate(test_features_m1_2_norm, test_label_m1_2)
test_loss



0.03738980367779732

Model tuning

In [11]:
def create_rnn_model(rnn_units=64, rnn_activation='tanh', learning_rate=0.001):
    model = Sequential()
    model.add(Input(shape=(101,1)))
    model.add(SimpleRNN(rnn_units, activation=rnn_activation))
    model.add(Dropout(0.1))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1))
    
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    
    return model

In [12]:
model_rnn = KerasRegressor(model=create_rnn_model, verbose=0, rnn_units=64, rnn_activation='tanh', learning_rate=0.001)

In [13]:
# Define hyperparameters to tune
param_grid = {
    'rnn_units': [32, 64, 128],
    'rnn_activation': ['tanh', 'relu'],
    'learning_rate': [0.001, 0.01, 0.1],
    'epochs': [10, 30],
    'batch_size': [64, 128]
}

model_rnn_tune = GridSearchCV(estimator=model_rnn, param_grid=param_grid)
model_rnn_tune.fit() # train data, not train sub

Traceback (most recent call last):
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/scikeras/wrappers.py", line 1117, in score
    return self.scorer(y, y_pred, sample_weight=sample_weight, **score_args)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/scikeras/wrappers.py", line 1714, in scorer
    return sklearn_r2_score(y_true, y_pred, **kwargs)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 911, in r2_score
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/opt/anaconda3/envs/tensorflow_2_

Traceback (most recent call last):
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/scikeras/wrappers.py", line 1117, in score
    return self.scorer(y, y_pred, sample_weight=sample_weight, **score_args)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/scikeras/wrappers.py", line 1714, in scorer
    return sklearn_r2_score(y_true, y_pred, **kwargs)
  File "/opt/anaconda3/envs/tensorflow_2_10_0/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 911, in r2_score
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/opt/anaconda3/envs/tensorflow_2_

KeyboardInterrupt: 