In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import optuna
import json
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import TopKCategoricalAccuracy
from optuna.integration import TFKerasPruningCallback

optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- Configuration ---
N_STEPS = 15            # number of consecutive app tokens in each input window
VOCAB_SIZE = 5000       # cap passed to the tokenizer (num_words)
# The next three are not used in this cell — presumably consumed by a
# hyperparameter search elsewhere in the notebook; verify before removing.
N_TRIALS = 5
N_SPLITS = 3
EPOCHS_PER_FOLD = 30
RAW_DATA_PATH = '../data/cleaned/LSAPP_Processed.csv'
MIN_APP_COUNT = 40      # apps with fewer rows than this are dropped as "rare"

# === 1. Load and Preprocess Data ===
print("Loading and preprocessing data...")
try:
    df = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of calling exit(): exit() shuts the kernel down in a
    # notebook, and the old message did not say which file was missing.
    raise FileNotFoundError(f"Input CSV not found: {RAW_DATA_PATH!r}") from err

# Preprocess timestamps: derive hour/weekday and a cyclic (sin/cos) encoding
# of the hour so that 23:00 and 00:00 end up close to each other.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['hour_of_day'] = df['timestamp'].dt.hour
df['weekday'] = df['timestamp'].dt.weekday
df['hour_sin'] = np.sin(2 * np.pi * df['hour_of_day'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour_of_day'] / 24)

# Add optional feature columns when the CSV does not provide them.
# NOTE(review): a missing 'time_since_last_app' becomes a constant 0 column,
# which carries no information — confirm the processed CSV always has it.
if 'time_since_last_app' not in df.columns:
    df['time_since_last_app'] = 0
if 'is_weekend' not in df.columns:
    df['is_weekend'] = df['weekday'] >= 5  # weekday 5/6 == Saturday/Sunday

# Keep only the columns used downstream.
df = df[['user_id', 'app_name', 'timestamp', 'hour_sin', 'hour_cos', 'time_since_last_app', 'is_weekend', 'weekday']]

# Filter out rare apps
app_counts = df['app_name'].value_counts()
common_apps = app_counts[app_counts >= MIN_APP_COUNT].index
df_filtered = df[df['app_name'].isin(common_apps)].copy()

# === 2. Tokenizer Setup ===
print("Preparing tokenizer...")
# Fit on one *list* of app names per user rather than on space-joined strings.
# When given a list, the Keras Tokenizer treats every element as a single
# (lower-cased) token. Joining with ' ' made it split on whitespace and strip
# punctuation, so a name such as "Google Chrome" was learned as two tokens,
# while create_sequences() later looks up the whole name and got <OOV>.
all_app_sequences = df_filtered.groupby('user_id')['app_name'].apply(list).tolist()
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(all_app_sequences)
word_index = tokenizer.word_index
oov_index = word_index.get(tokenizer.oov_token)
# +1 accounts for index 0, which word_index never assigns; capped at
# VOCAB_SIZE to match the tokenizer's num_words limit.
vocab_size = min(len(word_index) + 1, VOCAB_SIZE)

# === 3. Create Sequences with Additional Features ===
def create_sequences(data, tokenizer, n_steps, oov_index):
    """Build sliding-window training examples from per-user app logs.

    Rows are ordered by ``user_id`` then ``timestamp``. For every user, each
    run of ``n_steps`` consecutive app tokens becomes one input window and the
    token that follows it becomes the target. Windows whose target equals
    ``oov_index`` are dropped.

    Args:
        data: DataFrame with columns ``user_id``, ``timestamp``, ``app_name``,
            ``hour_sin``, ``hour_cos``, ``time_since_last_app``,
            ``is_weekend`` and ``weekday``.
        tokenizer: object exposing ``texts_to_sequences``; it is called with a
            single list of app names per user.
        n_steps: window length.
        oov_index: token id marking out-of-vocabulary targets to skip.

    Returns:
        Tuple ``(sequences, extra_features, targets)`` of NumPy arrays. Each
        ``extra_features`` row is ``[hour_sin, hour_cos, time_since_last_app,
        is_weekend, weekday]`` read from the *target* row.
    """
    windows = []
    labels = []
    context_rows = []

    ordered = data.sort_values(['user_id', 'timestamp'])
    for _, user_rows in tqdm(ordered.groupby('user_id')):
        token_ids = tokenizer.texts_to_sequences([user_rows['app_name'].tolist()])[0]

        # Per-row context columns, kept in the order they appear in a feature row.
        context_columns = (
            user_rows['hour_sin'].values,
            user_rows['hour_cos'].values,
            user_rows['time_since_last_app'].fillna(0).values,
            user_rows['is_weekend'].astype(int).values,
            user_rows['weekday'].values,
        )

        for start in range(len(token_ids) - n_steps):
            end = start + n_steps
            next_token = token_ids[end]
            if next_token != oov_index:
                windows.append(token_ids[start:end])
                labels.append(next_token)
                # Context is taken at the target position, not the window start.
                context_rows.append([column[end] for column in context_columns])

    return np.array(windows), np.array(context_rows), np.array(labels)

import os
import pickle

X_seq, X_extra, y_raw = create_sequences(df_filtered, tokenizer, N_STEPS, oov_index)
if len(X_seq) == 0:
    # Without any window the one-hot encoding and split below cannot work.
    raise ValueError("create_sequences produced no windows; check the input data and N_STEPS.")

# Every window already has exactly N_STEPS tokens, so this mainly normalises
# the array type for Keras.
X_seq_padded = pad_sequences(X_seq, maxlen=N_STEPS, padding='pre', truncating='pre')

# One-hot encode labels (done once; the earlier duplicate encode/split that
# produced identical results has been removed).
y = to_categorical(y_raw, num_classes=vocab_size)

# Split the data
# NOTE(review): windows are built with stride 1 and then split at random, so
# neighbouring (heavily overlapping) windows of the same user can land on both
# sides of the split. Consider a per-user or time-based split — TODO confirm.
X_train_seq, X_test_seq, X_train_extra, X_test_extra, y_train, y_test = train_test_split(
    X_seq_padded, X_extra, y, test_size=0.2, random_state=42)

# np.savez_compressed / open() do not create missing parent directories.
os.makedirs("../data/processed", exist_ok=True)

# Save train set
np.savez_compressed("../data/processed/train_data_2.npz",
                    X_seq=X_train_seq,
                    X_extra=X_train_extra,
                    y=y_train)

# Save test set
np.savez_compressed("../data/processed/test_data_2.npz",
                    X_seq=X_test_seq,
                    X_extra=X_test_extra,
                    y=y_test)

# Save tokenizer separately if needed
with open("../data/processed/tokenizer_2.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

# Also save vocab info
np.savez_compressed("../data/processed/meta_2.npz",
                    vocab_size=np.array([vocab_size]),
                    oov_index=np.array([oov_index]))


Loading and preprocessing data...
Preparing tokenizer...


100%|██████████| 292/292 [00:00<00:00, 339.17it/s]


In [None]:
import json
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import TopKCategoricalAccuracy
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

# Load training data.
# np.load on an .npz returns a lazy NpzFile that keeps the file open; the
# `with` blocks close it once the arrays have been read into memory.
with np.load("../data/processed/train_data_2.npz") as train:
    X_train_seq = train['X_seq']
    X_train_extra = train['X_extra']
    y_train = train['y']

# Load test data
with np.load("../data/processed/test_data_2.npz") as test:
    X_test_seq = test['X_seq']
    X_test_extra = test['X_extra']
    y_test = test['y']

# Load tokenizer
# SECURITY: pickle.load can execute arbitrary code. Only load a tokenizer file
# that was written by this notebook's preprocessing cell.
with open("../data/processed/tokenizer_2.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Load metadata (each value was stored as a one-element array)
with np.load("../data/processed/meta_2.npz") as meta:
    vocab_size = int(meta['vocab_size'][0])
    oov_index = int(meta['oov_index'][0])

# === Load hyperparameters from JSON ===
with open("Hyperparameters.json", "r") as f:
    best_params = json.load(f)

print("Using hyperparameters:", best_params)

# === Fixed model creation function using loaded hyperparameters ===
def create_model_with_params(params=None, n_steps=None, n_extra_features=None):
    """Build and compile the two-input next-app classifier.

    The model embeds the token window, runs it through an LSTM, concatenates
    the LSTM output with the numeric context features, and ends in a softmax
    over ``vocab_size`` classes (``vocab_size`` is read from module scope).

    Args:
        params: mapping with keys ``embedding_dim``, ``lstm_units``,
            ``dense_units``, ``dropout_rate`` and ``learning_rate``.
            Defaults to the module-level ``best_params``.
        n_steps: length of each input token window. Defaults to
            ``X_train_seq.shape[1]`` so the cell does not depend on an
            ``N_STEPS`` constant defined in another cell.
        n_extra_features: width of the numeric feature vector. Defaults to
            ``X_train_extra.shape[1]``.

    Returns:
        A compiled ``Model`` taking ``[sequence_input, extra_features]``.
    """
    if params is None:
        params = best_params
    if n_steps is None:
        n_steps = X_train_seq.shape[1]
    if n_extra_features is None:
        n_extra_features = X_train_extra.shape[1]

    embedding_dim = params['embedding_dim']
    lstm_units = params['lstm_units']
    dense_units = params['dense_units']
    dropout_rate = params['dropout_rate']
    learning_rate = params['learning_rate']

    # Sequence input
    seq_input = Input(shape=(n_steps,), name='sequence_input')
    x_seq = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(seq_input)
    x_seq = LSTM(lstm_units, dropout=dropout_rate)(x_seq)

    # Extra numeric input
    extra_input = Input(shape=(n_extra_features,), name='extra_features')

    # Combine
    x = Concatenate()([x_seq, extra_input])
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    output = Dense(vocab_size, activation='softmax')(x)

    model = Model(inputs=[seq_input, extra_input], outputs=output)
    model.compile(
        optimizer=Adam(learning_rate=learning_rate, clipnorm=1.0),
        loss='categorical_crossentropy',
        # Both top-k metrics would otherwise share the same default name,
        # which makes them indistinguishable (or colliding) in logs.
        metrics=['accuracy',
                 TopKCategoricalAccuracy(k=3, name='top3_accuracy'),
                 TopKCategoricalAccuracy(k=5, name='top5_accuracy')])
    return model

# === Direct model training (skip Optuna) ===
print("Training model with fixed hyperparameters...")
model = create_model_with_params()

# Train the model.
# Early stopping monitors a slice held out from the *training* data. Using the
# test set as validation data would let restore_best_weights pick the epoch
# that happens to score best on the test set, biasing the evaluation below.
model.fit([X_train_seq, X_train_extra], y_train,
          validation_split=0.1,
          epochs=100,
          batch_size=512,
          callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)],
          verbose=1)

# === Evaluation ===
# return_dict=True pairs each value with its metric name directly instead of
# relying on model.metrics_names lining up with the returned list.
eval_metrics = model.evaluate([X_test_seq, X_test_extra], y_test, return_dict=True)
eval_results = list(eval_metrics.values())  # kept as a list for later cells
print("\nFinal Evaluation:")
for name, val in eval_metrics.items():
    print(f"{name}: {val:.4f}")

y_pred_proba = model.predict([X_test_seq, X_test_extra])
y_pred_proba[:, oov_index] = 0  # Mask <OOV>
y_pred = np.argmax(y_pred_proba, axis=1)
y_true = np.argmax(y_test, axis=1)

# Sorted class ids that occur in either the predictions or the ground truth;
# passed explicitly so the report, the matrix and the names stay aligned.
labels = np.union1d(y_true, y_pred)
reverse_word_index = {v: k for k, v in tokenizer.word_index.items()}
target_names = [reverse_word_index.get(int(i), '<UNK>') for i in labels]
print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=labels, target_names=target_names, zero_division=0))

cm = confusion_matrix(y_true, y_pred, labels=labels)
plt.figure(figsize=(10, 8))
# Show the first 15 classes by token id, labelled with their app names.
sns.heatmap(cm[:15, :15], annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names[:15], yticklabels=target_names[:15])
plt.title('Confusion Matrix (Top 15 Classes)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Save the trained model
model.save("App Predictor Model.keras")
print("Model saved as 'App Predictor Model.keras'")

Using hyperparameters: {'embedding_dim': 64, 'lstm_units': 128, 'dense_units': 256, 'dropout_rate': 0.36059802347094455, 'learning_rate': 0.00046817343838393284}
Training model with fixed hyperparameters...
Epoch 1/100


: 