<a href="https://colab.research.google.com/github/syphax/solar-data/blob/feb01/nb/Solar_Viz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro

This notebook analyses and visualizes output from my VT PV system.

To run with the provided data, you need Google Drive, and you need to copy the data from https://github.com/syphax/solar-data/tree/main/data to `/My Drive/Data/Solar`

_TODO: Load the data directly from the GitHub repo._

# Setup

In [None]:
import os
from datetime import datetime

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytz
import seaborn as sns
from pytz import timezone


In [None]:
# Mount Google Drive at /content/drive so the data files can be read from this runtime.
# The import stays in this cell because `google.colab` is a Colab-specific module.
from google.colab import drive
drive.mount('/content/drive')

# Load Data

This loads data that was downloaded from [Green Mountain Power's website](https://greenmountainpower.com/account/usage/).

GMP has an excellent UI for reporting usage, and provides downloadable data in 15-minute increments (either CSV or Green Button XML). *Unfortunately*, it only supports manual downloads in chunks of at most 15 days.

This data gets concatenated and cleaned in the `Clean_GMP_Solar_Data.ipynb` notebook.

Fields in the cleaned up dataset are:
* `Service`: Whether the record is consumption or generation (solar generation rows are tagged `NGEN`)
* `IntervalStart`: Timestamp; format is `yyyy-MM-dd-hh:mm:ss`
* `IntervalEnd`: Same, 15 minutes later. Redundant but explicit!
* `dt_start`: IntervalStart, converted to EST (correcting for DST)
* `dt_end`: IntervalEnd, converted to EST (correcting for DST)
* `Quantity`: Amount of electricity generated
* `UnitOfMeasure`: I love that they have an explicit UoM field!
* `kWh`: Quantity converted to kWh


## Prep

In [None]:
# Folder on the mounted Drive that holds the solar data files
path = '/content/drive/MyDrive/Data/Solar/'

In [None]:
# Sanity check: this should list the data files that you copied from https://github.com/syphax/solar-data/tree/main/data
# (`!` runs a shell command; `$path` is filled in from the Python variable `path`.)
!ls $path 

In [None]:
# Use the joined dataset (a single CSV) rather than the individual downloads.
# NOTE(review): presumably this file is the output of Clean_GMP_Solar_Data.ipynb - confirm.
joined_input_file = os.path.join(path, 'full_dataset.csv')


## Load

In [None]:
# Read the joined dataset. No date parsing is requested here; dt_start / dt_end
# are converted to datetimes in a separate step.
df_energy_data = pd.read_csv(joined_input_file)

In [None]:
# Parse the timestamp columns and express them in US Eastern time.
# pd.to_datetime(..., utc=True) yields timezone-aware UTC values, which are then
# converted to the Eastern zone, so both columns end up timezone-aware.
# NOTE(review): the pre-process script is said to have already adjusted for DST;
# confirm the CSV timestamps carry offsets (or are UTC) so they are not shifted twice.
est = pytz.timezone('US/Eastern')

for f in ['dt_start', 'dt_end']:
    df_energy_data[f] = pd.to_datetime(df_energy_data[f], utc=True).dt.tz_convert(est)


In [None]:
# Keep only the rows whose Service is 'NGEN' (treated as the solar data from here on),
# as an independent copy so new columns can be added without touching df_energy_data.
df_solar_data = df_energy_data.loc[df_energy_data['Service'] == 'NGEN'].copy()

In [None]:
# Derive gap-to-previous-record and calendar features for both interval timestamps.
for col in ['dt_start', 'dt_end']:
    stamps = df_solar_data[col]
    iso_week = stamps.dt.isocalendar().week

    # Difference vs the prior record (NaT on the first row)
    df_solar_data['diff_' + col] = stamps.diff()

    # Year, month, week, fortnight, day
    df_solar_data[col + '_year'] = stamps.dt.year
    df_solar_data[col + '_month'] = stamps.dt.month
    df_solar_data[col + '_week'] = iso_week
    df_solar_data[col + '_2weeks'] = np.ceil(iso_week / 2.0)  # Useful resolution
    df_solar_data[col + '_day'] = stamps.dt.day

    # Altair doesn't like date types very much, so store the calendar date
    # as a datetime (i.e. the timestamp rounded down to the day)
    df_solar_data[col + '_dt'] = pd.to_datetime(stamps.dt.date)

    # Time of day, plus a fractional-hour version (e.g. 13:45 -> 13.75)
    df_solar_data[col + '_hour'] = stamps.dt.hour
    df_solar_data[col + '_minute'] = stamps.dt.minute
    df_solar_data[col + '_hrmin'] = (
        df_solar_data[col + '_hour'] + df_solar_data[col + '_minute'] / 60.0
    )


In [None]:
# Expected spacing between consecutive records: one 15 minute increment,
# built as the difference between two timestamps a quarter of an hour apart.

fmt = '%Y-%m-%d-%H:%M:%S'

d1, d2 = (
    datetime.strptime(stamp, fmt)
    for stamp in ('2022-01-01-00:00:00', '2022-01-01-00:15:00')
)

diff_mins = d2 - d1

In [None]:
# Rows whose gap to the previous record is not exactly one 15 minute increment.
# Expect the start of the series (its diff is NaT) plus, per the original note,
# some daylight-savings-impaired records.

df_solar_data.loc[df_solar_data['diff_dt_start'] != diff_mins]

# Review

# Summarize

## Output by Day

In [None]:
# Total Quantity per calendar day
df_solar_data_sums_by_day = (
    df_solar_data
    .groupby(['dt_start_dt'], as_index=False)
    .agg({'Quantity': 'sum'})
)

# Centered rolling means over 7, 14, 28 and 60 days -> rolling_07 ... rolling_60.
# Centered windows leave NaN at both ends of the series.
df_solar_data_sums_by_day = df_solar_data_sums_by_day.assign(**{
    f'rolling_{days:02d}': (
        df_solar_data_sums_by_day['Quantity'].rolling(window=days, center=True).mean()
    )
    for days in (7, 14, 28, 60)
})

In [None]:
# Daily time series with the centered 60 day rolling average drawn on top

# Lift Altair's row limit before handing it the DataFrame
alt.data_transformers.disable_max_rows()

# Raw daily totals
ch_raw = (
    alt.Chart(df_solar_data_sums_by_day)
    .mark_line()
    .encode(
        alt.X('dt_start_dt:T'),
        alt.Y('Quantity:Q'),
    )
)

# Rolling average, in a fixed dark-blue colour so it stands out from the raw line
ch_roll = (
    alt.Chart(df_solar_data_sums_by_day)
    .mark_line()
    .encode(
        alt.X('dt_start_dt:T'),
        alt.Y('rolling_60:Q'),
        color=alt.value("#220099"),
    )
)

# Layer the two lines into a single chart
ch = ch_raw + ch_roll

ch.display()

In [None]:
# Distribution of kWh by day, narrow (1 kWh) bins.
# Axis labels match the Altair version of this chart (kWh / Days) so the figure
# stands alone; the trailing `;` keeps the return value out of the cell output.

sns.histplot(data=df_solar_data_sums_by_day, x="Quantity", binwidth=1).set(
    xlabel='kWh', ylabel='Days', title='Daily output distribution (1 kWh bins)'
);

In [None]:
# Distribution of kWh by day, wide (4 kWh) bins.
# Axis labels match the Altair version of this chart (kWh / Days) so the figure
# stands alone; the trailing `;` keeps the return value out of the cell output.

sns.histplot(data=df_solar_data_sums_by_day, x="Quantity", binwidth=4).set(
    xlabel='kWh', ylabel='Days', title='Daily output distribution (4 kWh bins)'
);

In [None]:
# Total Quantity per (date, hour of day); the two grouping keys become the index
df_solar_data_sums_by_hour = (
    df_solar_data
    .groupby(['dt_start_dt', 'dt_start_hour'])[['Quantity']]
    .sum()
)

In [None]:
# Histogram of production by hour (0.2 kWh bins), one observation per (date, hour).
# Labelled so the figure stands alone; the trailing `;` keeps the return value
# out of the cell output.

sns.histplot(data=df_solar_data_sums_by_hour, x="Quantity", binwidth=0.2).set(
    xlabel='kWh', ylabel='Hours', title='Hourly output distribution'
);

In [None]:
# Lots of zero hours (night, etc.) - keep only the hours with some production.
# Labelled so the figure stands alone; the trailing `;` keeps the return value
# out of the cell output.

sns.histplot(
    data=df_solar_data_sums_by_hour[df_solar_data_sums_by_hour['Quantity'] > 0.0],
    x="Quantity",
    binwidth=0.2,
).set(xlabel='kWh', ylabel='Hours', title='Hourly output distribution (non-zero hours only)');

In [None]:
# Pivot to a wide table: one row per hour of day, one column per date.
# (date, hour) pairs that are missing from the data come out as NaN.
df_solar_hourly_table = df_solar_data_sums_by_hour['Quantity'].unstack('dt_start_dt')

In [None]:
# Plot heatmap of date (columns) x hour of day (rows)
# TODO: Need to format dates on x-axis!

fig, ax = plt.subplots(figsize=(18, 6))

# Draw on the axes created above and leave `fig` bound to the Figure:
# sns.heatmap returns the Axes, so assigning its result to `fig` would lose the Figure handle.
sns.heatmap(df_solar_hourly_table, ax=ax)

# Flip the y-axis so the earliest hour sits at the bottom
ax.invert_yaxis()
ax.set(xlabel='Date', ylabel='Hour of day', title='Solar output by date and hour of day');


In [None]:
# Turn the index of the hourly sums back into ordinary columns ("un"-indexed copy)
df_solar_data_sums_by_hour_un = df_solar_data_sums_by_hour.reset_index()

In [None]:
# Average production by hour of day; no adjustments for start, end dates of series.

# Two seasons: Summer = April - September, Winter = October - March, inclusive
# (not quite aligned to equinoxes)
month_of_row = df_solar_data_sums_by_hour_un['dt_start_dt'].dt.month
df_solar_data_sums_by_hour_un['season_ws'] = np.where(
    (month_of_row > 3) & (month_of_row <= 9),
    'Summer',
    'Winter',
)

# Overall mean by hour of day, labelled as its own pseudo-season 'All'
df_avg_by_hour_all = (
    df_solar_data_sums_by_hour_un
    .groupby('dt_start_hour', as_index=False)
    .agg({'Quantity': 'mean'})
)
df_avg_by_hour_all['season_ws'] = 'All'

# Mean by season and hour of day
df_avg_by_hour_ws = (
    df_solar_data_sums_by_hour_un
    .groupby(['season_ws', 'dt_start_hour'], as_index=False)
    .agg({'Quantity': 'mean'})
)

# Stack the overall rows on top of the per-season rows
df_avg_by_hour_ws = pd.concat([df_avg_by_hour_all, df_avg_by_hour_ws], axis=0)

# Show hours as rows and seasons as columns. Each (hour, season) pair has a
# single row at this point, so the 'sum' only reshapes - it does not add anything up.
display(
    df_avg_by_hour_ws
    .groupby(['dt_start_hour', 'season_ws'])
    .agg({'Quantity': 'sum'})
    .unstack(-1)
)

In [None]:
# Plot average daily trend: one line per season, plus the overall average

# Fresh 0..n-1 index (the concatenated frame repeats index values)
df_chart = df_avg_by_hour_ws.reset_index(drop=True)

fig, ax = plt.subplots(figsize=(18, 6))

# Draw on the axes created above and leave `fig` bound to the Figure:
# sns.lineplot returns the Axes, so assigning its result to `fig` would lose the Figure handle.
sns.lineplot(data=df_chart, x='dt_start_hour', y='Quantity', hue='season_ws', ax=ax)
ax.set(xlabel='Hour of day', ylabel='Average kWh', title='Average output by hour of day and season');

In [None]:
# Display the table that feeds the average-daily-trend plot
df_chart

# Extras

In [None]:
# Narrow per-interval view: the start timestamp, its calendar parts, and the output
df_ch = df_solar_data.loc[:, [
    'dt_start',
    'dt_start_year', 'dt_start_month', 'dt_start_day',
    'dt_start_hour', 'dt_start_minute', 'dt_start_hrmin',
    'Quantity',
]]

In [None]:
# Last 7 days of daily totals; the centered rolling columns are NaN at the tail
# because their windows extend past the end of the series.
df_solar_data_sums_by_day.tail(7)

In [None]:
# Altair version (doesn't render in GitHub)

# Distribution of outputs by day (small bins): 1 kWh wide, covering 0-40 kWh

ch_daily_hist = (
    alt.Chart(df_solar_data_sums_by_day)
    .mark_bar()
    .encode(
        alt.X('Quantity', bin=alt.Bin(extent=[0, 40], step=1), title='kWh'),
        alt.Y('count()', title='Days'),
    )
)

ch_daily_hist.display()

In [None]:
# Distribution of outputs by day (larger bins): 4 kWh wide, covering 0-40 kWh

ch_daily_hist = (
    alt.Chart(df_solar_data_sums_by_day)
    .mark_bar()
    .encode(
        alt.X('Quantity', bin=alt.Bin(extent=[0, 40], step=4), title='kWh'),
        alt.Y('count()', title='Days'),
    )
)

ch_daily_hist.display()