# Exploratory data analysis
#### In this notebook, we carry out exploratory data analysis (EDA) on the features in the dataset.

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Read the dataset CSV (path is relative to the notebook's directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='../data/dataset.csv')


In [None]:
from scipy.stats import shapiro
from sklearn.preprocessing import StandardScaler

# Understand the data structure
print("Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()
print("\nSummary Statistics:")
print(data.describe())

# Names of the numeric columns; every later step in this cell uses this list.
# It is derived from the frame's dtypes so the cell runs on a fresh kernel
# instead of relying on a variable left over from an earlier session.
numerical_columns = data.select_dtypes(include=np.number).columns.tolist()

# Detect anomalies or outliers
plt.figure(figsize=(12, 6))
sns.boxplot(data=data[numerical_columns])
plt.title("Boxplot of Numerical Columns")
plt.xticks(rotation=45)
plt.show()

# Test assumptions: Check for normality in numerical columns
# The Shapiro-Wilk test needs at least this many observations.
min_shapiro_samples = 3
# Significance level used to decide whether normality is rejected.
alpha = 0.05

for col in numerical_columns:
    # Missing values are dropped per column before testing.
    sample = data[col].dropna()
    if len(sample) < min_shapiro_samples:
        print(
            f"Shapiro-Wilk Test for {col}: skipped "
            f"(fewer than {min_shapiro_samples} non-missing values).\n"
        )
        continue
    stat, p = shapiro(sample)
    print(f"Shapiro-Wilk Test for {col}: Statistic={stat}, p-value={p}")
    if p > alpha:
        print(f"{col} appears to be normally distributed.\n")
    else:
        print(f"{col} does not appear to be normally distributed.\n")

# Spot patterns, trends, and relationships
sns.pairplot(data[numerical_columns])
# pairplot creates its own figure, which is the current one here, so the
# super-title is attached to it; y > 1 places it above the top row of axes.
plt.suptitle("Pairplot of Numerical Columns", y=1.02)
plt.show()

# Correlation heatmap
plt.figure(figsize=(10, 8))
correlation_matrix = data[numerical_columns].corr()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

# Prepare for further analysis: Check for missing values
missing_values = data.isnull().sum()
print("Missing Values:")
# Only columns that actually contain missing values are listed.
print(missing_values[missing_values > 0])

# Prepare for further analysis: Feature scaling
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[numerical_columns])
# Reuse the source index so scaled rows stay aligned with `data`.
scaled_df = pd.DataFrame(scaled_data, columns=numerical_columns, index=data.index)
print("\nScaled Data (First 5 Rows):")
print(scaled_df.head())