# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

class StockDataProcessor:
    """Prepare a per-stock panel of factor data for binary return classification.

    The methods are designed to be chained: build the label, blank out
    outliers, fill gaps, add lagged factors, then extract a scaled feature
    matrix. Panel-aware steps group rows on a ``permno`` column and rely on
    the rows of each stock already being in chronological order.
    """

    # Columns written by create_target_variable. They carry label information
    # and must never be picked up as model inputs by substring matching.
    _LABEL_COLS = ('return_2c', 'target')

    def __init__(self):
        # Scaler fitted inside prepare_features_and_target.
        self.scaler = RobustScaler()

    def load_and_merge_data(self, df: pd.DataFrame):
        """Return a copy of ``df`` whose ``date`` column is parsed to datetimes.

        The input frame is left untouched.
        """
        data = df.copy()
        data['date'] = pd.to_datetime(data['date'])

        return data

    def create_target_variable(self, df, return_col='ret'):
        """Add the binary direction column and its one-step shifted ``target``.

        ``return_2c`` is 1 where the return is strictly positive and 0
        otherwise. ``target`` is ``return_2c`` shifted by one row; when a
        ``permno`` column is present the shift is done per stock so a label
        never crosses from one stock into the next.

        Args:
            df: Frame holding the return column. Modified in place.
            return_col: Name of the return column. If it is absent but a
                column literally named ``return`` exists, that one is used.

        Returns:
            The same frame with ``return_2c`` and ``target`` added.

        Raises:
            KeyError: If neither ``return_col`` nor ``return`` is a column.
        """
        if return_col not in df.columns:
            if 'return' not in df.columns:
                raise KeyError(
                    f"return column {return_col!r} not found in DataFrame"
                )
            return_col = 'return'

        df['return_2c'] = (df[return_col] > 0).astype(int)

        # NOTE(review): shift(1) pairs each row with the *previous* row's
        # direction. If ``target`` is meant to be the next-period direction
        # this should be shift(-1) - confirm the intended alignment.
        if 'permno' in df.columns:
            df['target'] = df.groupby('permno')['return_2c'].shift(1)
        else:
            df['target'] = df['return_2c'].shift(1)

        return df

    def remove_outliers(self, df, factor_cols, n_std=3):
        """Blank out factor values further than ``n_std`` std-devs from the mean.

        Only the offending cell in the offending column is set to NaN (to be
        filled later by ``handle_missing_values``); all other columns of the
        row are preserved. Mean and standard deviation are computed over the
        whole column.

        Args:
            df: Input frame. Not modified; a copy is returned.
            factor_cols: Columns to screen.
            n_std: Allowed distance from the mean, in standard deviations.

        Returns:
            A copy of ``df`` with outlying factor values replaced by NaN.
        """
        df = df.copy()

        for col in factor_cols:
            center = df[col].mean()
            spread = df[col].std()
            # NaN comparisons are False, so existing gaps are left as they are.
            too_far = (df[col] - center).abs() > n_std * spread
            df[col] = df[col].mask(too_far)

        return df

    def handle_missing_values(self, df, factor_cols):
        """Fill gaps in the factor columns.

        Values are forward-filled then back-filled within each ``permno``;
        whatever is still missing is replaced by the column median. Note that
        the back-fill and the median both draw on later rows.

        Args:
            df: Frame with a ``permno`` column. Modified in place.
            factor_cols: Columns to fill.

        Returns:
            The same frame with the factor columns filled.
        """
        # GroupBy.ffill/bfill replace the deprecated fillna(method=...).
        df[factor_cols] = df.groupby('permno')[factor_cols].ffill()
        df[factor_cols] = df.groupby('permno')[factor_cols].bfill()
        df[factor_cols] = df[factor_cols].fillna(df[factor_cols].median())

        return df

    def create_lagged_features(self, df, factor_cols, lags=(1, 2, 3)):
        """Add per-stock lagged copies of each factor as ``<col>_lag<k>``.

        Args:
            df: Frame with a ``permno`` column. Modified in place.
            factor_cols: Columns to lag.
            lags: Row offsets to shift by within each stock.

        Returns:
            A new frame with every row that contains any NaN (in any column)
            dropped; the original index labels are kept, not renumbered.
        """
        for lag in lags:
            for col in factor_cols:
                df[f'{col}_lag{lag}'] = df.groupby('permno')[col].shift(lag)

        return df.dropna()

    def prepare_features_and_target(self, df, factor_cols):
        """Extract the scaled feature matrix and the label vector.

        A column is a feature when its name contains any factor name as a
        substring (this picks up the ``_lag`` columns). The label columns
        ``return_2c`` and ``target`` are always excluded so that a short
        factor name cannot pull the label into the inputs.

        The scaler is fitted on every row of ``df``; callers that split into
        train/test afterwards should refit it on the training rows only.

        Returns:
            ``(X_scaled, y, feature_cols)``: the scaled 2-D array, the values
            of the ``target`` column, and the list of selected column names.
        """
        feature_cols = [
            col for col in df.columns
            if col not in self._LABEL_COLS
            and any(factor in str(col) for factor in factor_cols)
        ]

        X = df[feature_cols].values
        y = df['target'].values

        X_scaled = self.scaler.fit_transform(X)

        return X_scaled, y, feature_cols

class DeepStockClassifier(nn.Module):
    """Fully connected binary classifier.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout; the head is
    a single-unit Linear layer followed by a Sigmoid, so ``forward`` returns
    one value in (0, 1) per input row.
    """

    def __init__(self, input_dim, hidden_dims=[256, 128, 64, 32], dropout_rate=0.3):
        super().__init__()

        # Consecutive pairs of this list are the (in, out) sizes of each stage.
        widths = [input_dim, *hidden_dims]

        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ]

        # Output head: one unit squashed by a sigmoid.
        stages += [nn.Linear(widths[-1], 1), nn.Sigmoid()]

        self.network = nn.Sequential(*stages)

        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-uniform weights and zero biases for every Linear layer."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Pass ``x`` through the stacked network and return its output."""
        return self.network(x)

class ModelTrainer:
    """Train, evaluate and plot a binary classifier that outputs probabilities.

    The wrapped model must emit one probability per sample (the loss is
    ``nn.BCELoss``). Per-epoch losses and accuracies are kept on the instance
    so ``plot_training_history`` can draw them afterwards.
    """

    def __init__(self, model, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.model = model.to(device)
        self.device = device
        # Per-epoch history, appended to by train_model.
        self.train_losses = []
        self.val_losses = []
        self.train_accuracies = []
        self.val_accuracies = []

    @staticmethod
    def _as_float_tensor(values):
        """Copy array-like ``values`` into a float32 tensor."""
        return torch.tensor(np.asarray(values), dtype=torch.float32)

    def _run_epoch(self, loader, criterion, optimizer=None):
        """Make one pass over ``loader``; update weights iff ``optimizer`` is given.

        Returns:
            ``(mean per-sample loss, accuracy at a 0.5 threshold)``.
        """
        training = optimizer is not None
        self.model.train(training)

        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        with torch.set_grad_enabled(training):
            for batch_X, batch_y in loader:
                batch_X = batch_X.to(self.device)
                batch_y = batch_y.to(self.device)

                if training:
                    optimizer.zero_grad()

                # reshape(-1) rather than squeeze(): a one-sample batch must
                # stay 1-D so it still matches the shape of batch_y.
                outputs = self.model(batch_X).reshape(-1)
                loss = criterion(outputs, batch_y)

                if training:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                    optimizer.step()

                n_batch = batch_y.size(0)
                # Weight by batch size so a short final batch is not over-counted.
                loss_sum += loss.item() * n_batch
                n_correct += ((outputs > 0.5).float() == batch_y).sum().item()
                n_seen += n_batch

        return loss_sum / n_seen, n_correct / n_seen

    def train_model(self, X_train, y_train, X_val, y_val,
                   batch_size=1024, epochs=100, learning_rate=0.001,
                   weight_decay=1e-4, patience=10):
        """Fit the model with Adam, LR reduction on plateau and early stopping.

        After every epoch that improves the validation loss the weights are
        written to ``best_model.pth`` in the working directory and kept in
        memory; the best weights are restored into the model at the end.

        Args:
            X_train, y_train: Training features and 0/1 labels.
            X_val, y_val: Validation features and 0/1 labels.
            batch_size: Mini-batch size for both loaders.
            epochs: Maximum number of epochs.
            learning_rate: Initial Adam learning rate.
            weight_decay: Adam weight decay.
            patience: Epochs without validation improvement before stopping.

        Raises:
            ValueError: If the training or validation set is empty.
        """
        train_dataset = TensorDataset(
            self._as_float_tensor(X_train),
            self._as_float_tensor(y_train)
        )
        val_dataset = TensorDataset(
            self._as_float_tensor(X_val),
            self._as_float_tensor(y_val)
        )

        n_train = len(train_dataset)
        if n_train == 0 or len(val_dataset) == 0:
            raise ValueError("training and validation sets must be non-empty")

        # BatchNorm1d raises in train mode on a batch of one sample, so drop a
        # trailing single-sample batch when at least one full batch exists.
        drop_tail = n_train > batch_size and n_train % batch_size == 1

        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, drop_last=drop_tail)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        optimizer = optim.Adam(self.model.parameters(),
                             lr=learning_rate, weight_decay=weight_decay)
        criterion = nn.BCELoss()
        # The scheduler's ``verbose`` flag is deprecated in recent PyTorch
        # releases, so learning-rate reductions are reported manually below.
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=5
        )

        best_val_loss = float('inf')
        best_state = None
        patience_counter = 0

        for epoch in range(epochs):

            avg_train_loss, train_acc = self._run_epoch(train_loader, criterion, optimizer)
            avg_val_loss, val_acc = self._run_epoch(val_loader, criterion)

            self.train_losses.append(avg_train_loss)
            self.val_losses.append(avg_val_loss)
            self.train_accuracies.append(train_acc)
            self.val_accuracies.append(val_acc)

            lrs_before = [group['lr'] for group in optimizer.param_groups]
            scheduler.step(avg_val_loss)
            for idx, (old_lr, group) in enumerate(zip(lrs_before, optimizer.param_groups)):
                if group['lr'] < old_lr:
                    print(f"Epoch {epoch}: reducing learning rate of group {idx} "
                          f"to {group['lr']:.4e}.")

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                patience_counter = 0
                # Snapshot in memory so the final restore never depends on a
                # stale or missing file; the file is still written for callers.
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in self.model.state_dict().items()
                }
                torch.save(best_state, 'best_model.pth')
            else:
                patience_counter += 1

            if epoch % 10 == 0:
                print(f'Epoch [{epoch}/{epochs}] - '
                      f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}, '
                      f'Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}')

            if patience_counter >= patience:
                print(f'Early stopping at epoch {epoch}')
                break

        # best_state stays None when no epoch improved (e.g. epochs == 0 or a
        # NaN validation loss); the model then keeps its current weights.
        if best_state is not None:
            self.model.load_state_dict(best_state)
        print("Training completed!")

    def evaluate_model(self, X_test, y_test):
        """Score the model on a held-out set and print the metrics.

        Predictions are made in a single forward pass in eval mode; the class
        prediction is the probability thresholded at 0.5.

        Returns:
            Dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``auc``,
            plus the arrays ``y_pred`` and ``y_pred_proba``.
        """
        self.model.eval()

        with torch.no_grad():
            X_test_tensor = self._as_float_tensor(X_test).to(self.device)
            # reshape(-1) keeps a single-row test set 1-D for the metrics.
            outputs = self.model(X_test_tensor).reshape(-1)
            y_pred_proba = outputs.cpu().numpy()
            y_pred = (y_pred_proba > 0.5).astype(int)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred_proba)

        print("Test Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"AUC: {auc:.4f}")

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        }

    def plot_training_history(self):
        """Plot the recorded train/validation loss and accuracy side by side."""
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        ax1.plot(self.train_losses, label='Train Loss', color='blue')
        ax1.plot(self.val_losses, label='Validation Loss', color='red')
        ax1.set_title('Model Loss')
        ax1.set_xlabel('Epoch')
        ax1.set_ylabel('Loss')
        ax1.legend()
        ax1.grid(True)

        ax2.plot(self.train_accuracies, label='Train Accuracy', color='blue')
        ax2.plot(self.val_accuracies, label='Validation Accuracy', color='red')
        ax2.set_title('Model Accuracy')
        ax2.set_xlabel('Epoch')
        ax2.set_ylabel('Accuracy')
        ax2.legend()
        ax2.grid(True)

        plt.tight_layout()
        plt.show()

def main(df=None, factor_cols=None):
    """Run the full pipeline: preprocess, split by date, train and evaluate.

    Args:
        df: Raw panel with ``date`` and ``permno`` columns, the return column
            and the factor columns. When omitted, a module-level ``df`` (for
            example one defined in an earlier notebook cell) is used.
        factor_cols: Names of the factor columns. When omitted, a module-level
            ``factor_cols`` is used.

    Returns:
        ``(model, processor, results)``: the trained network, the fitted
        ``StockDataProcessor`` and the metrics dict from ``evaluate_model``.

    Raises:
        ValueError: If the data or the factor list is neither passed in nor
            defined at module level.
    """
    # Inside a function a bare ``df = df`` would make ``df`` local and fail,
    # so the module-level fallbacks are looked up explicitly.
    if df is None:
        df = globals().get('df')
    if factor_cols is None:
        factor_cols = globals().get('factor_cols')
    if df is None or factor_cols is None:
        raise ValueError(
            "main() needs `df` and `factor_cols`, either as arguments or as "
            "module-level variables"
        )

    processor = StockDataProcessor()

    # Work on a copy with parsed dates so the caller's frame is not modified.
    df = processor.load_and_merge_data(df)

    df = processor.create_target_variable(df)

    df = processor.remove_outliers(df, factor_cols)
    df = processor.handle_missing_values(df, factor_cols)

    df = processor.create_lagged_features(df, factor_cols, lags=[1, 2])

    X, y, feature_names = processor.prepare_features_and_target(df, factor_cols)

    print(f"Data Shape: X={X.shape}, y={y.shape}")

    # Chronological 70/15/15 split. X and y are plain arrays, so rows must be
    # selected by position; the frame's index labels have gaps after dropna()
    # and cannot be used as array indices.
    order = np.argsort(df['date'].values, kind='stable')
    n_total = len(order)
    n_train = int(n_total * 0.7)
    n_val = int(n_total * 0.15)

    train_idx = order[:n_train]
    val_idx = order[n_train:n_train + n_val]
    test_idx = order[n_train + n_val:]

    # Refit the scaler on the training rows only so validation/test statistics
    # do not leak into the scaling, then re-scale every row with it.
    raw_features = df[feature_names].values
    processor.scaler.fit(raw_features[train_idx])
    X = processor.scaler.transform(raw_features)

    X_train, X_val, X_test = X[train_idx], X[val_idx], X[test_idx]
    y_train, y_val, y_test = y[train_idx], y[val_idx], y[test_idx]

    print(f"Train_set: {len(X_train)}, Validation_set: {len(X_val)}, Test_set: {len(X_test)}")

    input_dim = X_train.shape[1]
    model = DeepStockClassifier(
        input_dim=input_dim,
        hidden_dims=[512, 256, 128, 64],
        dropout_rate=0.3
    )

    trainer = ModelTrainer(model)
    trainer.train_model(
        X_train, y_train, X_val, y_val,
        batch_size=2048,
        epochs=100,
        learning_rate=0.001,
        patience=15
    )

    results = trainer.evaluate_model(X_test, y_test)

    return model, processor, results

# Script entry point: run the pipeline and keep the trained model, the data
# processor and the evaluation results as module-level names.
if __name__ == "__main__":
    model, processor, results = main()