In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from keras.models import Sequential
from keras.layers import LSTM, Dense
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from keras.optimizers import Adam

In [2]:
# Read the pre-processed dataset from a CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='processed_dataset_friday.csv')

In [3]:

# Target vector: the 'Label' column.
y = data['Label']
# Feature matrix: every column except the target and 'Timestamp'.
X = data.drop(columns=['Label', 'Timestamp'])

In [4]:

# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler statistics here are computed over ALL rows of X,
# including rows that a later train/test split holds out, so anything that
# consumes X_scaled after such a split has seen test-set information.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [5]:

# Split data into training (80%) and testing (20%) sets.
# The split is done on the raw features X rather than on X_scaled: X_scaled was
# standardised with statistics computed over every row, which would let the
# held-out rows influence preprocessing. random_state is unchanged, so the row
# partition is the same as splitting X_scaled would have produced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same transform to
# both partitions. Both results are NumPy arrays, as downstream cells expect.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [6]:

# Oversample the training partition only (the test set is left untouched).
# The resampled arrays replace X_train / y_train under the same names.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X=X_train, y=y_train)

In [9]:
# Print the number of training samples for label 0, then for label 1,
# to confirm the classes are balanced after resampling.
class_counts = y_train.value_counts()
for class_label in (0, 1):
    print(class_counts[[class_label]].sum())

1801909
1801909


In [10]:

# Compute the absolute pairwise correlation matrix of the numeric columns.
# numeric_only=True is passed explicitly: relying on the implicit dropping of
# non-numeric columns is deprecated (it emits a FutureWarning) and raises on
# newer pandas versions when a non-numeric column is present.
corr_matrix = data.corr(numeric_only=True).abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal); every other
# entry becomes NaN, so each column pair appears exactly once.
upper_tri_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_tri_mask)

  corr_matrix = data.corr().abs()


In [11]:

# Rank the model's input features by absolute correlation with the label.
# The ranking reads the full 'Label' column of corr_matrix instead of the
# upper-triangle frame: the upper triangle only holds entries for columns
# positioned BEFORE 'Label', so it silently loses features if 'Label' is not
# the last column. Restricting to X.columns also guarantees every selected
# name can be resolved to a column position in X later on.
# dropna() removes features whose correlation is undefined (NaN).
label_corr = corr_matrix['Label'].reindex(X.columns).dropna()

# Top 15 / 20 / 30 features based on correlation with the label.
best_features_15, best_features_20, best_features_30 = (
    label_corr.nlargest(n_top).index.tolist() for n_top in (15, 20, 30)
)

In [12]:


# Iterate through the candidate feature sets; for each one train a fresh LSTM
# classifier, report test accuracy and plot the confusion matrix.
for feature_set in [best_features_15, best_features_20, best_features_30]:
    # Convert feature names to column positions. X_train / X_test are indexed
    # positionally here, in the same column order as X.
    feature_indices = np.array([X.columns.get_loc(feature_name) for feature_name in feature_set])
    X_train_subset = X_train[:, feature_indices]
    X_test_subset = X_test[:, feature_indices]

    # No synthetic "Timestamp" column is appended. The previous np.arange row
    # counter was not the dataset's Timestamp (that column is dropped from X),
    # was numbered independently for train and test, and was left unscaled
    # with values in the millions next to standardised features, so it could
    # only distort training.

    # Reshape to the (samples, timesteps, features) layout the LSTM layer
    # takes; every row is treated as a sequence of length 1.
    n_features = X_train_subset.shape[1]
    X_train_subset = X_train_subset.reshape((X_train_subset.shape[0], 1, n_features))
    X_test_subset = X_test_subset.reshape((X_test_subset.shape[0], 1, n_features))

    # Create and fit LSTM model: one 50-unit LSTM layer and a sigmoid output
    # for binary classification.
    model = Sequential()
    model.add(LSTM(50, input_shape=(1, n_features)))
    model.add(Dense(1, activation='sigmoid'))
    # `learning_rate` is the supported argument name; `lr` is deprecated and
    # rejected by newer Keras releases.
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
    model.fit(X_train_subset, y_train, epochs=10, batch_size=32, verbose=1)

    # Make predictions and calculate accuracy. The (n, 1) probability array is
    # flattened and thresholded into a 1-D integer label vector; "> 0.5"
    # matches what np.round gave (exactly 0.5 rounds down to 0).
    y_pred_prob = model.predict(X_test_subset)
    y_pred = (y_pred_prob.ravel() > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    print('Feature set:', feature_set)
    print('Accuracy:', accuracy)

    # Plot confusion matrix on an explicit Axes, and close the figure after
    # showing it so figures do not accumulate across loop iterations.
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', ax=ax)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title('Confusion matrix')
    plt.show()
    plt.close(fig)

  super().__init__(name, **kwargs)


Epoch 1/10

KeyboardInterrupt: 