In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Read the raw CSV; the 'Timestamp' column is parsed into datetimes on load
df = pd.read_csv('../data/sierraleone-bumbuna.csv', parse_dates=['Timestamp'])

# --- Summary Statistics & Missing Values ---
# `display` is supplied by the IPython/Jupyter session, not imported here
display(df.describe())

# Per-column NaN counts, and the fraction of all rows they represent
missing = df.isna().sum()
missing_share = missing / len(df)

print("Columns with missing values > 5%:")
# Shown as percentages, restricted to columns above the 5% threshold
print((missing_share * 100).loc[missing_share > 0.05])

# --- Outlier Detection ---
# A row is flagged when at least one of these columns lies more than
# 3 standard deviations away from that column's mean.
z_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
# nan_policy='omit' is essential: with scipy's default ('propagate') a single
# NaN turns every z-score of that column into NaN, so no outlier in the column
# would ever be flagged. Missing cells still yield NaN z-scores, which compare
# False against the threshold below and are therefore never flagged themselves.
z_scores = df[z_cols].apply(zscore, nan_policy='omit')
df['outlier'] = (z_scores.abs() > 3).any(axis=1)

# --- Cleaning: Impute or drop ---
# Fill gaps with the column median. The result is assigned back instead of
# calling fillna(inplace=True) on df[col]: that chained in-place form is
# deprecated and does not modify df when pandas Copy-on-Write is active.
for col in z_cols:
    df[col] = df[col].fillna(df[col].median())

# Keep only the rows that were not flagged; .copy() detaches the result from df
df_clean = df[~df['outlier']].copy()

# --- Export Cleaned Data ---
# The helper 'outlier' column (all False in df_clean) is written out as well
df_clean.to_csv('../data/sierraleone_clean.csv', index=False)

# --- Time Series Plot ---
# Irradiance components and ambient temperature, indexed by timestamp.
# This uses df (flagged rows included), not df_clean.
ts_view = df.set_index('Timestamp')[['GHI', 'DNI', 'DHI', 'Tamb']]
ts_view.plot(
    figsize=(14,6),
    title='Solar Irradiance and Temp Over Time',
)

# --- Cleaning Effect ---
# 'cleaned' is simply the inverse of the outlier flag.
# NOTE(review): the chart therefore compares flagged rows (cleaned=False) with
# kept rows (cleaned=True), not the full data before vs. after cleaning --
# confirm this is the comparison the title intends.
df['cleaned'] = ~df['outlier']
module_means = df.groupby('cleaned')[['ModA', 'ModB']].mean()
module_means.plot(kind='bar', title='Average ModA and ModB Pre/Post Cleaning')

# --- Correlation Heatmap ---
# Pairwise correlations between irradiance and module-temperature columns,
# computed on the cleaned rows only.
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
# Open a fresh figure first: sns.heatmap draws on the current axes, so without
# this it would be painted over whichever plot was created last.
plt.figure()
sns.heatmap(df_clean[corr_cols].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')

# --- Scatter Plots ---
# Each scatter gets its own figure. sns.scatterplot draws on the current axes,
# so the first one also needs a fresh figure or it lands on the previous plot.
plt.figure()
sns.scatterplot(x='WS', y='GHI', data=df_clean)
plt.title('Wind Speed vs GHI')

plt.figure()
sns.scatterplot(x='RH', y='Tamb', data=df_clean)
plt.title('RH vs Tamb')

# --- Histograms ---
# Series.hist reuses the current figure when one is open, so every histogram
# (including the first) is given a fresh figure to avoid drawing over the
# previous plot.
plt.figure()
df_clean['GHI'].hist(bins=30)
plt.title('GHI Histogram')

plt.figure()
df_clean['WS'].hist(bins=30)
plt.title('WS Histogram')

# --- Bubble Chart ---
# GHI against Tamb, with the marker area taken from the RH column.
# plt.scatter draws on the current axes, so start a new figure to keep the
# bubbles from being overlaid on the previous plot.
plt.figure()
plt.scatter(df_clean['GHI'], df_clean['Tamb'], s=df_clean['RH'], alpha=0.5)
plt.title('GHI vs Tamb (Bubble = RH)')
plt.xlabel('GHI')
plt.ylabel('Tamb')