In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go


In [None]:
# Read the raw CSV into `df` and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../data/sierraleone-bumbuna.csv')
df.head(n=5)


In [None]:
# count / mean / std / min / quartiles / max for the numeric columns
# (the quartiles listed here are the same ones describe() uses by default)
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Number of NaN entries per column; display only the columns that have any
missing = df.isnull().sum()
missing.loc[missing.gt(0)]

In [None]:
# Share of NaN entries per column; display the columns where it exceeds 5%
missing_ratio = df.isnull().mean()
missing_ratio.loc[missing_ratio.gt(0.05)]

In [None]:
# Columns screened for outliers (the same list drives the imputation cell).
cols_to_check = [
    'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'Tamb', 'RH', 'WS', 'WSgust',
    'WSstdev', 'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation',
    'TModA', 'TModB',
]

# Compute |z| per column on the FULL frame, skipping NaNs inside each column.
# Dropping NaN rows before scoring would yield a mask with fewer rows than
# `df`, which then fails (or mis-selects) when used to index `df` below.
z_values = stats.zscore(df[cols_to_check].to_numpy(dtype=float), nan_policy='omit')

# Re-attach df's index/columns so the boolean mask is aligned row-for-row.
z_scores = pd.DataFrame(np.abs(z_values), index=df.index, columns=cols_to_check)

# NaN z-scores (missing inputs, constant columns) compare False, i.e. they
# are never flagged.
outlier_flags = z_scores > 3

# Rows where at least one screened column is more than 3 std devs from its mean
df_outliers = df[outlier_flags.any(axis=1)]
print(f'Outlier rows: {len(df_outliers)}')

In [None]:
# Work on a copy so the raw frame stays untouched, then fill each gap with
# that column's median (demo only — use domain knowledge in real cases).
df_clean = df.copy()
cols_with_gaps = [name for name in cols_to_check if df_clean[name].isna().any()]
for name in cols_with_gaps:
    column_median = df_clean[name].median()
    df_clean[name] = df_clean[name].fillna(column_median)


In [None]:
# Persist the imputed frame next to the raw data; the row index is not written
df_clean.to_csv(path_or_buf='../data/sierraleone_clean.csv', index=False)

In [None]:
# Parse the timestamps so they can serve as a datetime index for plotting.
df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])

# DataFrame.plot() opens its own figure unless it is handed an Axes, so a
# bare plt.figure(figsize=...) beforehand leaves an empty figure behind and
# the requested size never reaches the chart. Create the Axes explicitly and
# pass it in instead.
fig, ax = plt.subplots(figsize=(14, 6))
df_clean.set_index('Timestamp')[['GHI', 'DNI', 'DHI', 'Tamb']].plot(ax=ax)
ax.set_title('GHI,DNI,DHI, Tamb over Time')
ax.set_ylabel('Value')
ax.set_xlabel('Time')
ax.grid(True)
plt.show()

In [None]:
# Mean ModA / ModB reading for each value of the Cleaning column, as bars.
# Skipped entirely when the frame has no Cleaning column.
if 'Cleaning' in df_clean.columns:
    module_means = df_clean.groupby('Cleaning')[['ModA', 'ModB']].mean()
    ax = module_means.plot.bar()
    ax.set_title('ModA & ModB by Cleaning')
    ax.set_ylabel('Module Reading')
    ax.grid(True)
    plt.show()

In [None]:
# Pairwise correlations between the irradiance and module-temperature columns
corr_matrix = df_clean[['GHI', 'DNI', 'DHI', 'TModA', 'TModB']].corr()

plt.figure(figsize=(8, 6))
heat_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heat_ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Scatter of GHI against WS, one point per row of the cleaned frame
ws_ax = sns.scatterplot(x='WS', y='GHI', data=df_clean)
ws_ax.set_title('WS vs GHI')
ws_ax.grid(True)
plt.show()


In [None]:
# Scatter of Tamb against RH, one point per row of the cleaned frame
rh_ax = sns.scatterplot(x='RH', y='Tamb', data=df_clean)
rh_ax.set_title('RH vs Tamb')
rh_ax.grid(True)
plt.show()

In [None]:
# windrose is a separate install: pip install windrose
from windrose import WindroseAxes

# Bars binned by the WD column, coloured by the WS column; normed=True shows
# each bar as a share of all observations rather than a raw count.
rose_ax = WindroseAxes.from_ax()
rose_ax.bar(
    df_clean['WD'],
    df_clean['WS'],
    normed=True,
    opening=0.8,
    edgecolor='white',
)
rose_ax.set_legend()
rose_ax.set_title('Wind Rose')
plt.show()


In [None]:
# Distribution of the GHI column over 30 bins
ghi_ax = df_clean['GHI'].hist(bins=30, figsize=(6, 4))
ghi_ax.set_title('Histogram of GHI')
ghi_ax.grid(True)
plt.show()


In [None]:
# Distribution of the WS column over 30 bins
ws_hist_ax = df_clean['WS'].hist(bins=30, figsize=(6, 4))
ws_hist_ax.set_title('Histogram of WS')
ws_hist_ax.grid(True)
plt.show()

In [None]:
# Interactive bubble chart: GHI on x, Tamb on y, with RH driving both the
# marker size and the marker colour.
bubble_options = dict(
    x='GHI',
    y='Tamb',
    size='RH',
    color='RH',
    title='GHI vs Tamb (Bubble size = RH)',
)
fig = px.scatter(df_clean, **bubble_options)
fig.show()
