In [None]:
import numpy as np
import pandas as pd
import keras
from keras import layers
from matplotlib import pyplot as plt

# Download the IBM tweet-volume CSV from the numenta/NAB GitHub repository.
# The "timestamp" column becomes the index and is parsed as datetimes.
nab_data_url = "https://raw.githubusercontent.com/numenta/NAB/master/data/realTweets/Twitter_volume_IBM.csv"
df_twitter_volume = pd.read_csv(
    nab_data_url,
    index_col="timestamp",
    parse_dates=True,
)

# Draw the raw series on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_twitter_volume.index, df_twitter_volume["value"], label="Twitter Volume")
ax.set_xlabel("Timestamp")
ax.set_ylabel("Volume")
ax.set_title("Twitter Volume Data for IBM")
ax.legend()
plt.show()

# Z-score the "value" column: subtract its mean, then divide by its
# (sample) standard deviation. Both statistics are kept as named variables.
raw_volume = df_twitter_volume["value"]
training_mean = raw_volume.mean()
training_std = raw_volume.std()
df_training_value = raw_volume.sub(training_mean).div(training_std)
print("Number of training samples:", len(df_training_value))

# Create sequences (adjust sequence length as needed)
# 288 = 24 * 60 / 5, i.e. one day per window if samples are 5 minutes apart
# (sampling interval not verified here -- TODO confirm against the index).
sequence_length = 288

# Work on the plain ndarray: slicing a pandas Series once per window and then
# stacking thousands of Series objects is needlessly slow.
training_values = df_training_value.to_numpy()

# Window i covers samples [i, i + sequence_length). The count is deliberately
# kept at len - sequence_length (the final possible window is left out) so that
# code sized to that count keeps lining up with these sequences.
n_sequences = len(training_values) - sequence_length
if n_sequences <= 0:
    raise ValueError(
        f"Need more than {sequence_length} samples to build sequences, "
        f"got {len(training_values)}"
    )

# sliding_window_view returns a read-only, zero-copy view with one row per
# window (stride 1); keep only the first n_sequences rows.
sequences = np.lib.stride_tricks.sliding_window_view(
    training_values, sequence_length
)[:n_sequences]

# Convert sequences to a standalone, writable numpy array of shape
# (n_sequences, sequence_length).
X_train = np.array(sequences)

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling are identical on every
# "Restart Kernel -> Run All".
keras.utils.set_random_seed(42)

# Define the Autoencoder architecture: each window is treated as a flat vector,
# squeezed through a single ReLU bottleneck and linearly decoded back to the
# original width.
input_dim = X_train.shape[1]
encoding_dim = 64

autoencoder = keras.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(encoding_dim, activation="relu"),
    layers.Dense(input_dim, activation="linear")
])

# Compile the model: mean squared error between input and reconstruction.
autoencoder.compile(optimizer="adam", loss="mean_squared_error")

# Train the Autoencoder to reproduce its own input.
# NOTE: Keras takes the validation split from the last 10% of the samples,
# before shuffling; shuffle=True only reorders the remaining training samples.
history = autoencoder.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=64,
    validation_split=0.1,
    shuffle=True,
)

# Compare the per-epoch training and validation loss recorded by fit().
fig, ax = plt.subplots()
ax.plot(history.history["loss"], label="Training Loss")
ax.plot(history.history["val_loss"], label="Validation Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("Autoencoder Training Loss")
ax.legend()
plt.show()

# Detect anomalies: score every window by its mean squared reconstruction error.
reconstructed_X = autoencoder.predict(X_train)
mse = np.mean(np.square(X_train - reconstructed_X), axis=1)

# Windows whose error is above the 95th percentile of all window errors are
# flagged, so roughly 5% of the windows are always reported.
threshold = np.percentile(mse, 95)

# Identify anomalies.
# Window i covers rows i .. i + sequence_length - 1, so its error is attributed
# to the window's last row (i + sequence_length - 1). Indexing from
# sequence_length instead would label the first row *after* each window,
# a sample the window never contained.
first_window_end = sequence_length - 1
window_end_rows = df_twitter_volume.iloc[first_window_end : first_window_end + len(mse)]
assert len(window_end_rows) == len(mse), "window errors and timestamps are misaligned"
anomalies = window_end_rows[mse > threshold]

# Overlay the flagged rows (red markers) on the full series.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df_twitter_volume.index, df_twitter_volume["value"], label="Twitter Volume")
ax.scatter(anomalies.index, anomalies["value"], color="red", label="Anomalies")
ax.set_xlabel("Timestamp")
ax.set_ylabel("Volume")
ax.set_title("Twitter Volume Anomalies for IBM")
ax.legend()
plt.show()


In [None]:
# Display the (rows, columns) shape of the flagged-anomaly frame.
anomalies.shape

(781, 1)