# Exploratory Data Analysis for the Togo dataset

In [None]:
import sys
sys.path.append('../scripts')


In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from scipy import stats
from utils import negative_values, missing_values

In [None]:
# Load the raw Togo dataset from CSV (path is relative to the notebook's directory)
df = pd.read_csv('../data/raw/togo.csv')
# Preview the first rows to confirm the file was parsed
df.head()


## Summary Statistics 

In [None]:
# Summary statistics of the raw DataFrame via pandas `describe`
df.describe()

In [None]:
# Dimensions of the raw dataset as (rows, columns)
df.shape

In [None]:
# Data type pandas inferred for each column
df.dtypes

## Data Quality Check 

In [None]:
# Missing values: delegate to the project helper `missing_values` imported from utils
# (its exact output format is defined in that module)
missing_values(df)




In [None]:
# Incorrect entries

# Columns with negative values, as reported by the project helper `negative_values`
# imported from utils (presumably the offending column names — verify in utils)
column_negative =negative_values(df)
column_negative

In [None]:
# Check for Outliers 

def detect_outliers(data, threshold=3):
    """Return the values of *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value is ``(value - mean) / std``, where ``mean`` and
    ``std`` are computed over *data* with ``np.mean`` and ``np.std``.

    Args:
        data: Iterable of numeric values (e.g. a list or a DataFrame column).
        threshold: Minimum absolute z-score for a value to count as an
            outlier. Defaults to 3.

    Returns:
        A new list holding the outlying values in their original order.
        Each call returns its own list, so results from different columns
        never leak into one another.
    """
    mean = np.mean(data)
    std = np.std(data)

    # All values identical: no value can deviate from the mean, and dividing
    # by a zero std would only produce NaNs and runtime warnings.
    if std == 0:
        return []

    return [value for value in data if np.abs((value - mean) / std) > threshold]

# Columns screened for outliers
check_list = ['ModA', 'ModB', 'WS', 'WSgust']

# Iterate the column names directly; no need to build a sub-DataFrame just to
# loop over its labels.
for column in check_list:
    outlier_pt = detect_outliers(df[column])
    sorted_outliers = sorted(outlier_pt)
    print('number of outliers in ', column, len(outlier_pt))
    # Print the sorted copy so the extreme values are easy to read off.
    print(sorted_outliers)


## Data Cleaning 

In [None]:
# Missing values: drop every column that contains at least one NaN.
# The explicit .copy() makes `dff` an independent frame, so the in-place
# assignment below cannot be treated by pandas as a write to a slice of `df`
# (the usual source of SettingWithCopyWarning).
dff = df.dropna(axis=1).copy()

# Negative values: the negative-value check above identified GHI as the column
# with negative entries, so those entries are replaced with 0.
dff.loc[dff['GHI'] < 0, 'GHI'] = 0


In [None]:
# Preview the cleaned frame
dff.head()

In [None]:
# Time-series analysis needs real datetimes, so parse the Timestamp column
# (read from CSV as text) into datetime values.
timestamps = pd.to_datetime(dff['Timestamp'])

# Replace Timestamp with the parsed values, add a minute-resolution string
# version as 'formatted_date', then use Timestamp as the index.
dff = dff.assign(
    Timestamp=timestamps,
    formatted_date=timestamps.dt.strftime('%Y-%m-%d %H:%M'),
).set_index('Timestamp')


In [None]:
# Preview the frame now that Timestamp is the index
dff.head()

## Time Series Analysis 

In [None]:
# GHI plotted against the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dff.index, dff['GHI'], label='Solar GHI')
ax.set(title='Solar Radiation Over Time', xlabel='Time', ylabel='GHI')
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)  # slanted tick labels
fig.tight_layout()
plt.show()

In [None]:
# DNI plotted against the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dff.index, dff['DNI'], label='Solar DNI')
ax.set(title='Solar Radiation Over Time', xlabel='Time', ylabel='DNI')
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)  # slanted tick labels
fig.tight_layout()
plt.show()

In [None]:
# DHI plotted against the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dff.index, dff['DHI'], label='Solar DHI')
ax.set(title='Solar Radiation Over Time', xlabel='Time', ylabel='DHI')
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)  # slanted tick labels
fig.tight_layout()
plt.show()

In [None]:
# Tamb plotted against the frame's index
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dff.index, dff['Tamb'], label='Ambient Temperature')
ax.set(title='Temperature Over Time', xlabel='Time', ylabel='Tamb')
ax.legend()
ax.grid(True)
plt.xticks(rotation=45)  # slanted tick labels
fig.tight_layout()
plt.show()

## Correlation Analysis

In [None]:

# Irradiance and module-temperature columns used for the correlation study
correlation_columns = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
subset_df = dff[correlation_columns]

# Pairwise correlation between the selected columns
correlation_matrix = subset_df.corr()

# Heatmap of the correlation matrix, colour scale fixed to [-1, 1]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Heatmap of Solar Radiation and Temperature')
plt.show()

# Pair plot of the same columns; pairplot opens its own figure, which then
# receives the suptitle.
sns.pairplot(subset_df)
plt.suptitle('Pair Plot of Solar Radiation and Temperature', y=1.02)
plt.show()

In [None]:
# Relationship between wind conditions (WS, WSgust, WD) and solar irradiance,
# shown as a scatter matrix.

# Wind columns first, irradiance columns second
wind_solar_columns = ['WS', 'WSgust', 'WD', 'GHI', 'DNI', 'DHI']
wind_solar_rn = dff.loc[:, wind_solar_columns]

# Pairwise scatter plots with a KDE on the diagonal
scatter_matrix(wind_solar_rn, alpha=0.5, figsize=(10, 10), diagonal='kde')
plt.suptitle('Scatter Matrix of Wind Conditions and Solar Irradiance', y=1.02)
plt.show()

## Wind Analysis 

In [None]:
# Identify trends and significant wind events using a polar plot.

# Extract wind speed, wind direction and their recorded standard-deviation
# columns as NumPy arrays
ws = dff['WS'].values
wd = dff['WD'].values
wdstd = dff['WDstdev'].values
wsstd = dff['WSstdev'].values

# Rolling standard deviation of wind direction over the preceding
# `window_size` samples. A float array is allocated explicitly so the result
# is never truncated to wd's dtype.
# NOTE(review): wd_std is not used by the plot below, which colours points by
# the WDstdev column instead.
wd_std = np.zeros(wd.shape, dtype=float)
window_size = 24  # samples per window (24 hours only if the data is hourly — TODO confirm)

for i in range(window_size, len(wd)):
    wd_std[i] = np.std(wd[i - window_size:i])

# Create polar plot
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='polar')

# Angle = wind direction, radius = wind speed, colour = WDstdev.
# Keep the returned artist so the colorbar can be built from it.
points = ax.scatter(np.deg2rad(wd), ws, c=wdstd, cmap='viridis', alpha=0.7, s=10)

# Set plot properties
ax.set_theta_zero_location('N')  # Set north at the top
ax.set_theta_direction(-1)  # Clockwise direction
ax.set_title('Wind Speed and Direction ')
ax.set_xlabel('Wind Direction')
ax.set_ylabel('Wind Speed')

# Colorbar tied to the scatter artist, so its scale matches the WDstdev values
# actually plotted (a standalone ScalarMappable would always show 0-1).
cbar = fig.colorbar(points, ax=ax)
cbar.set_label('WDstdev')

plt.show()

## Temperature Analysis

In [None]:
# Scatter plots with regression lines: one panel per quantity, each plotted
# against relative humidity (RH).
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (y column, panel title, y-axis label) for each panel, left to right
panels = [
    ('Tamb', 'Temperature vs. Relative Humidity', 'Temperature'),
    ('GHI', 'GHI vs. Relative Humidity', 'GHI'),
    ('DNI', 'DNI vs. Relative Humidity', 'DNI'),
]

for ax, (y_column, title, y_label) in zip(axes, panels):
    sns.regplot(x='RH', y=y_column, data=dff, ax=ax, scatter_kws={'alpha': 0.5})
    ax.set_title(title)
    ax.set_xlabel('Relative Humidity')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

## Histograms

In [None]:
# Columns whose distributions are plotted
variables = ['GHI', 'DNI', 'DHI', 'WS', 'TModA', 'TModB']

# 2 x 3 grid of axes, flattened so each variable gets one panel in order
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.ravel()

for ax, var in zip(axes, variables):
    # Histogram with a kernel-density overlay
    sns.histplot(dff[var], ax=ax, kde=True)
    ax.set(title=f'Distribution of {var}', xlabel=var, ylabel='Frequency')

plt.tight_layout()
plt.show()