In [20]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import os

In [23]:
def remove_outliers(data):
    """Return the entries of ``data`` that lie strictly inside the 1.5*IQR fences.

    The fences are placed 1.5 interquartile ranges below the 25th percentile
    and above the 75th percentile. Values exactly on a fence are discarded
    because both comparisons are strict.

    Parameters
    ----------
    data : array-like supporting boolean-mask indexing
        Numeric values, e.g. a ``pandas.Series`` or ``numpy.ndarray``.

    Returns
    -------
    Same type as ``data``
        The subset of ``data`` within the fences; a Series keeps its
        original index labels.
    """
    first_quartile, third_quartile = np.percentile(data, [25, 75])
    spread = third_quartile - first_quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread
    inside_fences = (data > fence_low) & (data < fence_high)
    return data[inside_fences]

In [25]:
def load_and_process_data(directory):
    """Load every CSV file in ``directory`` into one frame of filtered returns.

    For each ``*.csv`` file the function:

    1. parses ``TradingDate`` (``dd/mm/YYYY``) into a ``Date`` column,
    2. orders the rows chronologically,
    3. computes ``Returns`` as the percentage change of ``Close``,
    4. drops rows with a missing return and rows rejected by
       ``remove_outliers``.

    Returns are computed file by file, so no return spans two files.

    Parameters
    ----------
    directory : str or path-like
        Folder containing the CSV files. Each file must provide
        ``TradingDate`` and ``Close`` columns.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all processed files with a fresh integer
        index. An empty frame is returned when no file contributes rows.
    """
    frames = []
    # os.listdir yields names in arbitrary order; sorting makes the row
    # order of the result reproducible across machines and runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".csv"):
            continue
        data = pd.read_csv(os.path.join(directory, filename))
        data['Date'] = pd.to_datetime(data['TradingDate'], format='%d/%m/%Y')
        # pct_change compares each row with the row before it, so the rows
        # must be chronological. mergesort is stable: rows sharing a date
        # keep their file order, and an already-sorted file is unchanged.
        data = data.sort_values('Date', kind='mergesort')
        data['Returns'] = data['Close'].pct_change()
        data = data.dropna(subset=['Returns'])  # first row has no previous close
        if data.empty:
            # Fewer than two usable rows: no returns to compute percentiles on.
            continue
        clean_returns = remove_outliers(data['Returns'])
        frames.append(data.loc[clean_returns.index])

    if not frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop copies it every time.
    # Every collected frame already had its NaN returns dropped.
    return pd.concat(frames, ignore_index=True)

def analyze_anomalies(data):
    """Test each weekday's returns against all other weekdays and print a report.

    For every distinct weekday in ``data`` a Welch t-test
    (``equal_var=False``) compares that day's ``Returns`` with the returns of
    all remaining days. The p-values are Bonferroni-corrected at
    ``alpha=0.05`` and one line per weekday is printed in ascending weekday
    order (pandas numbering: Monday is 0).

    Side effect: ``DayOfWeek``, ``Week`` and ``Month`` columns are added to
    ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``Date`` column and a numeric ``Returns``
        column.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data['Week'] = data['Date'].dt.isocalendar().week
    data['Month'] = data['Date'].dt.month

    # One t-test per weekday: that day versus every other day.
    returns = data['Returns']
    test_results = []
    # unique() yields first-seen order; sort so the report reads Day 0, 1, 2, ...
    for day in np.sort(data['DayOfWeek'].unique()):
        is_day = data['DayOfWeek'] == day
        stat, p = ttest_ind(returns[is_day], returns[~is_day], equal_var=False)
        test_results.append((day, stat, p))

    if not test_results:
        # Nothing to correct or report; avoid handing zero p-values to multipletests.
        print("No weekday groups found; nothing to test.")
        return

    # Correct for multiple testing
    p_values = [p for _, _, p in test_results]
    rejections, corrected_p_values = multipletests(p_values, alpha=0.05, method='bonferroni')[:2]

    # Print the results
    for (day, stat, p), corrected_p, reject in zip(test_results, corrected_p_values, rejections):
        print(f"Day {day}: T-statistic = {stat:.3f}, Original P-value = {p:.3f}, Corrected P-value = {corrected_p:.3f}, Anomaly Detected: {reject}")


# Main execution
# The data folder can be overridden with the TIMESERIES_DATA_DIR environment
# variable so the notebook runs on other machines; the default is the
# original hard-coded location.
directory = os.environ.get(
    'TIMESERIES_DATA_DIR',
    '/Users/juss/Documents/GitHub/data_mining_project/Data/Timeseries_Data',
)
data = load_and_process_data(directory)
analyze_anomalies(data)

Day 3: T-statistic = -3.717, Original P-value = 0.000, Corrected P-value = 0.001, Anomaly Detected: True
Day 4: T-statistic = -4.364, Original P-value = 0.000, Corrected P-value = 0.000, Anomaly Detected: True
Day 0: T-statistic = -13.443, Original P-value = 0.000, Corrected P-value = 0.000, Anomaly Detected: True
Day 1: T-statistic = 5.745, Original P-value = 0.000, Corrected P-value = 0.000, Anomaly Detected: True
Day 2: T-statistic = 16.215, Original P-value = 0.000, Corrected P-value = 0.000, Anomaly Detected: True
