# In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the dataset
file_path = 'SP600_AdjClose_Volume_Return.csv'
data = pd.read_csv(file_path)

# Data Cleaning
# Drop rows without a return and exact duplicate rows. The explicit .copy()
# detaches the result from `data`, so the column assignments below write to an
# independent frame and cannot trigger SettingWithCopyWarning.
data_cleaned = data.dropna(subset=['Return']).drop_duplicates().copy()
# Non-numeric return strings become NaN here and are removed two lines below.
data_cleaned['Return'] = pd.to_numeric(data_cleaned['Return'], errors='coerce')
data_cleaned['Date'] = pd.to_datetime(data_cleaned['Date'])
data_cleaned = data_cleaned.dropna(subset=['Return'])

# Rolling statistics are only meaningful over chronologically ordered rows, and
# nothing above guarantees the CSV arrives in that order, so sort each ticker's
# rows by date before any windowed computation.
data_cleaned = data_cleaned.sort_values(['Ticker', 'Date'])

# Adding New Features (rolling return, rolling volatility, RSI, and volume)
# groupby().rolling() returns a (Ticker, original index) MultiIndex; dropping the
# Ticker level restores the original row labels so the assignment aligns by index.
grouped_returns = data_cleaned.groupby('Ticker')['Return']
data_cleaned['rolling_return'] = grouped_returns.rolling(window=10).mean().reset_index(level=0, drop=True)
data_cleaned['rolling_volatility'] = grouped_returns.rolling(window=10).std().reset_index(level=0, drop=True)

# RSI calculation
def calculate_rsi(series, period=14):
    """Return the simple-moving-average Relative Strength Index of *series*.

    Args:
        series: pandas Series of numeric values, ordered oldest to newest.
        period: Number of consecutive one-step changes averaged per RSI value.

    Returns:
        A Series aligned to ``series.index`` with values in [0, 100]. The first
        ``period`` entries are NaN because fewer than ``period`` real changes
        exist there. A window with no losses yields 100; a window with no
        movement at all yields NaN (0 / 0).
    """
    delta = series.diff(1)
    # clip() keeps the leading NaN that diff() produces (and any other missing
    # change), so no window is averaged over a fabricated zero change.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    loss = (-delta).clip(lower=0).rolling(window=period).mean()
    # pandas division yields inf when loss is 0, which maps to an RSI of 100.
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# Apply RSI calculation per ticker.
# transform() always returns a result labelled with data_cleaned's own index,
# whereas apply(...).reset_index(level=0, drop=True) is only aligned when pandas
# prepends the group key to the index, which varies between pandas versions.
data_cleaned['rsi'] = data_cleaned.groupby('Ticker')['Return'].transform(calculate_rsi)

# Raw input columns and the names of their standardized counterparts (same order).
raw_feature_columns = ['rolling_return', 'rolling_volatility', 'rsi', 'Volume']

# Define feature list
features = ['rolling_return_scaled', 'rolling_volatility_scaled', 'rsi_scaled', 'volume_scaled']

# Standardization for the four features
scaler = StandardScaler()
data_cleaned[features] = scaler.fit_transform(data_cleaned[raw_feature_columns])

# Impute missing values (e.g. the warm-up rows of the rolling windows) with the
# column mean of the already standardized features.
imputer = SimpleImputer(strategy='mean')
data_cleaned[features] = imputer.fit_transform(data_cleaned[features])

# Function to train and evaluate a one-class SVM model on the entire dataset
def train_and_evaluate_one_class_svm(features, data, nu=0.05, kernel='rbf', gamma='scale',
                                     threshold_percentile=5, output_path='svm_anomalies.csv'):
    """Fit a One-Class SVM on *data*, flag low-scoring rows, plot and export them.

    Side effects: adds the columns ``svm_anomaly_score`` and ``svm_anomaly`` to
    *data* in place, shows two matplotlib figures, prints summary statistics and
    writes the flagged rows to *output_path*.

    Args:
        features: List of column names in *data* used as model inputs.
        data: DataFrame containing *features* plus ``Date`` and ``Return``.
        nu: Passed to ``OneClassSVM``.
        kernel: Passed to ``OneClassSVM``.
        gamma: Passed to ``OneClassSVM``.
        threshold_percentile: Percentile of the decision scores below which a
            row is labelled an anomaly (-1); all other rows are labelled 1.
        output_path: CSV file the flagged rows are written to.

    Returns:
        The ``value_counts()`` Series of the ``svm_anomaly`` labels.
    """
    # One-Class SVM model
    oc_svm = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)

    # Fit the model on the entire dataset
    feature_matrix = data[features]
    oc_svm.fit(feature_matrix)

    # Get anomaly scores (decision_function)
    data['svm_anomaly_score'] = oc_svm.decision_function(feature_matrix)

    # Label rows whose score falls strictly below the chosen percentile as anomalies
    threshold = np.percentile(data['svm_anomaly_score'], threshold_percentile)
    data['svm_anomaly'] = np.where(data['svm_anomaly_score'] < threshold, -1, 1)

    # Count the anomalies
    svm_anomaly_count = data['svm_anomaly'].value_counts()
    print(f"SVM Anomaly Counts:\n{svm_anomaly_count}")

    # Plot the SVM results
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=data, x='Date', y='Return', hue='svm_anomaly', palette={1: 'blue', -1: 'red'})
    plt.title('Anomaly Detection using One-Class SVM on Entire Dataset')
    plt.xlabel('Date')
    plt.ylabel('Stock Return')
    plt.xticks(rotation=45)
    plt.legend(title='SVM Anomaly')
    plt.show()

    # Plot the distribution of anomaly scores
    plt.figure(figsize=(10, 6))
    sns.histplot(data['svm_anomaly_score'], bins=50, kde=True)
    plt.axvline(threshold, color='red', linestyle='--', label='Anomaly Threshold')
    plt.title('Distribution of One-Class SVM Anomaly Scores')
    plt.xlabel('Anomaly Score')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()

    # Proxy evaluation: negative returns are assumed to be the true anomalies.
    # pos_label=-1 makes the metrics describe the anomaly class; the sklearn
    # default (pos_label=1) would score the "normal" class instead.
    true_anomalies = np.where(data['Return'] < 0, -1, 1)
    precision = precision_score(true_anomalies, data['svm_anomaly'], pos_label=-1)
    recall = recall_score(true_anomalies, data['svm_anomaly'], pos_label=-1)
    f1 = f1_score(true_anomalies, data['svm_anomaly'], pos_label=-1)

    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Export SVM anomalies to CSV
    svm_anomalies = data[data['svm_anomaly'] == -1]
    svm_anomalies_file_path = output_path
    svm_anomalies.to_csv(svm_anomalies_file_path, index=False)
    print(f"SVM anomalies have been saved to {svm_anomalies_file_path}")

    return svm_anomaly_count

# Fit and evaluate the One-Class SVM on every cleaned row, keeping the label counts it returns
print("Running One-Class SVM on the entire dataset:")
svm_anomaly_count = train_and_evaluate_one_class_svm(features=features, data=data_cleaned)
