In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import ttest_ind, chi2_contingency

# Read the GLD CSV and replace its 'Date' column with parsed datetime values,
# so that later cells can compare and split rows by date.
data_gld = pd.read_csv('../dataset/cleaned_GLD.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


In [3]:
# Partition the 'Close' values into two groups around the median 'Date':
# rows dated on or before the median form group 1, rows dated after it form group 2.
midpoint_date_gld = data_gld['Date'].median()
on_or_before_midpoint_gld = data_gld['Date'] <= midpoint_date_gld
after_midpoint_gld = data_gld['Date'] > midpoint_date_gld
group1_gld = data_gld.loc[on_or_before_midpoint_gld, 'Close']
group2_gld = data_gld.loc[after_midpoint_gld, 'Close']

# Two-sample t-test; equal_var=False selects Welch's variant, which does not
# assume the two groups share a variance.
welch_result_gld = ttest_ind(group1_gld, group2_gld, equal_var=False)
t_stat_gld = welch_result_gld.statistic
t_p_value_gld = welch_result_gld.pvalue

# Report the unrounded statistic and p-value
print("T-test Results for GLD:")
print("T-statistic:", t_stat_gld)
print("P-value:", t_p_value_gld)


T-test Results for GLD:
T-statistic: -27.61580370509633
P-value: 1.0287941974352259e-134


In [4]:
# Label every row 'High' when its 'Close' is at or above the overall median
# 'Close', and 'Low' otherwise. The label is stored as a new column on data_gld.
median_close_gld = data_gld['Close'].median()
at_or_above_median_gld = data_gld['Close'].ge(median_close_gld)
data_gld['Close_Category'] = np.where(at_or_above_median_gld, 'High', 'Low')

# Count rows per (calendar year, High/Low label) and run a chi-square test
# on that table; only the statistic and p-value are kept.
contingency_table_gld = pd.crosstab(
    index=data_gld['Date'].dt.year,
    columns=data_gld['Close_Category'],
)
chi2_stat_gld, chi2_p_value_gld, _, _ = chi2_contingency(
    contingency_table_gld
)

# Report the unrounded statistic and p-value
print("\nChi-square Test Results for GLD:")
print("Chi-square Statistic:", chi2_stat_gld)
print("P-value:", chi2_p_value_gld)



Chi-square Test Results for GLD:
Chi-square Statistic: 732.4625749354674
P-value: 4.694045253021661e-156


In [5]:
# 95% confidence interval for the mean 'Close' value, taken from the Student t
# distribution with n - 1 degrees of freedom, centred on the sample mean and
# scaled by the standard error of the mean.
# `stats` is scipy.stats, imported in the notebook's top import cell.
close_gld = data_gld['Close']
mean_close_gld = close_gld.mean()
std_error_gld = stats.sem(close_gld)

# The confidence level is passed positionally and the degrees of freedom follow it.
# The result is a (lower, upper) pair.
confidence_interval_gld = stats.t.interval(
    0.95,
    len(close_gld) - 1,
    loc=mean_close_gld,
    scale=std_error_gld,
)

print("95% Confidence Interval for the mean 'Close' value of GLD:")
print("Lower bound:", confidence_interval_gld[0])
print("Upper bound:", confidence_interval_gld[1])


95% Confidence Interval for the mean 'Close' value of GLD:
Lower bound: 169.44788924200165
Upper bound: 172.187151091694


In [6]:
# Re-print both test results rounded to four decimals (p-values in scientific
# notation), each followed by a one-sentence reading of its p-value against
# the 0.05 threshold.

# T-test: pick the sentence first, then print the block. The sentence ends
# with a newline so a blank line separates it from the chi-square block.
t_test_interpretation_gld = (
    "Interpretation: The p-value is below 0.05, indicating a statistically significant difference in 'Close' prices between the two periods.\n"
    if t_p_value_gld < 0.05
    else "Interpretation: The p-value is above 0.05, indicating no statistically significant difference in 'Close' prices between the two periods.\n"
)
print("T-test Results for GLD:")
print(f"T-statistic: {t_stat_gld:.4f}")
print(f"P-value: {t_p_value_gld:.4e}")
print(t_test_interpretation_gld)

# Chi-square test: same layout, without the trailing blank line.
chi2_interpretation_gld = (
    "Interpretation: The p-value is below 0.05, suggesting a statistically significant association between the years and high/low 'Close' categories."
    if chi2_p_value_gld < 0.05
    else "Interpretation: The p-value is above 0.05, indicating no statistically significant association between the years and high/low 'Close' categories."
)
print("Chi-square Test Results for GLD:")
print(f"Chi-square Statistic: {chi2_stat_gld:.4f}")
print(f"P-value: {chi2_p_value_gld:.4e}")
print(chi2_interpretation_gld)


T-test Results for GLD:
T-statistic: -27.6158
P-value: 1.0288e-134
Interpretation: The p-value is below 0.05, indicating a statistically significant difference in 'Close' prices between the two periods.

Chi-square Test Results for GLD:
Chi-square Statistic: 732.4626
P-value: 4.6940e-156
Interpretation: The p-value is below 0.05, suggesting a statistically significant association between the years and high/low 'Close' categories.
