# FMI-Kumpula | Solar Power

In [1]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objs as go
import plotly.express as ex
from plotly.offline import iplot, init_notebook_mode
import cufflinks
# Switch cufflinks and plotly into notebook/offline rendering mode so that
# figures are drawn inline in this notebook.
# NOTE(review): connected=True presumably makes the notebook load plotly.js
# from the network instead of embedding it -- confirm against the plotly docs.
# NOTE(review): cufflinks.go_offline may already initialise notebook mode
# itself, which would make the second call redundant -- verify before removing.
cufflinks.go_offline(connected = True)
init_notebook_mode(connected = True)

## 1. Read and Preprocess Data

In [2]:
# Load the FMI-Kumpula CSV into the notebook-wide frame X and preview a few rows.
X = pd.read_csv('FMI-Kumpula.csv')
# A fixed random_state keeps the preview identical across "Restart & Run All";
# an unseeded sample would display different rows on every execution.
X.sample(10, random_state=0)

Unnamed: 0,request_id,forecast_time,power_output_w,power_output_f0_w,power_output_f10_w,power_output_f25_w,power_output_f50_w,power_output_f75_w,power_output_f90_w,power_output_f100_w,...,cloud_cover_total,cloud_cover_high,cloud_cover_medium,cloud_cover_low,system_radiation_global_wm2,system_radiation_direct_wm2,system_radiation_diffuse_wm2,radiation_global_wm2,radiation_direct_wm2,radiation_diffuse_wm2
200,4405,2020-10-11 12:00:00+03,901.8,75.6,117.2,189.8,313.2,541.5,822.3,1664.6,...,100.0,100.0,88.9,99.9,59.14,3.91,55.11,58.45,7.0,56.07
261,4405,2020-10-14 01:00:00+03,,,,,,,,,...,18.9,0.0,0.0,18.9,,,,0.0,0.0,0.0
240,4405,2020-10-13 04:00:00+03,,,,,,,,,...,100.0,100.0,100.0,53.4,,,,0.0,0.0,0.0
107,4447,2020-10-15 06:00:00+03,,,,,,,,,...,27.5,0.0,0.0,27.5,,,,0.0,0.0,0.0
264,4357,2020-10-06 01:00:00+03,,,,,,,,,...,100.0,24.1,0.1,100.0,,,,0.0,0.0,0.0
297,4357,2020-10-07 10:00:00+03,1676.5,287.8,449.9,689.1,1114.7,2057.1,2659.1,3127.3,...,99.4,0.0,99.4,8.1,98.16,11.29,86.62,82.72,25.46,77.56
65,4492,2020-10-22 09:00:00+03,0.0,,,,,,,,...,100.0,100.0,100.0,100.0,0.13,0.0,0.13,0.13,0.0,0.13
21,4492,2020-10-20 13:00:00+03,7869.9,6959.5,7757.5,7822.9,7870.0,7914.5,7932.8,7974.6,...,0.4,0.0,0.0,0.3,380.34,113.65,265.77,300.8,222.43,228.61
244,4405,2020-10-13 08:00:00+03,,,,,,,,,...,100.0,0.0,100.0,99.7,,,,0.0,0.0,0.0
228,4405,2020-10-12 16:00:00+03,2305.8,867.8,1269.4,1703.3,2201.7,2904.5,3476.5,3903.2,...,100.0,99.8,100.0,54.4,128.49,6.92,121.28,128.61,22.57,122.2


In [3]:
# Show the dtype pandas inferred for each column, under a banner line.
print('------------- COLUMN TYPE -------------', X.dtypes, sep='\n')

------------- COLUMN TYPE -------------
request_id                        int64
forecast_time                    object
power_output_w                  float64
power_output_f0_w               float64
power_output_f10_w              float64
power_output_f25_w              float64
power_output_f50_w              float64
power_output_f75_w              float64
power_output_f90_w              float64
power_output_f100_w             float64
system_temperature_c            float64
nominal_output_efficiency       float64
air_temperature_c               float64
cloud_cover_total               float64
cloud_cover_high                float64
cloud_cover_medium              float64
cloud_cover_low                 float64
system_radiation_global_wm2     float64
system_radiation_direct_wm2     float64
system_radiation_diffuse_wm2    float64
radiation_global_wm2            float64
radiation_direct_wm2            float64
radiation_diffuse_wm2           float64
dtype: object


In [4]:
# Convert forecast_time from strings to timezone-aware datetimes.
# The raw values carry a UTC offset (e.g. "2020-10-11 12:00:00+03"), which the
# previously hard-coded format "%Y-%m-%d %H:%M:%S" did not describe. Older
# pandas tolerated the mismatch, but strict format matching (pandas >= 2.0)
# rejects the trailing offset. The strings are ISO-8601-like, so letting
# pandas infer the format parses them, offset included.
X['forecast_time'] = pd.to_datetime(X['forecast_time'])
print('--------------------- COLUMN TYPE AFTER CHANGING --------------------')
print(X.dtypes)

--------------------- COLUMN TYPE AFTER CHANGING --------------------
request_id                                                      int64
forecast_time                   datetime64[ns, pytz.FixedOffset(180)]
power_output_w                                                float64
power_output_f0_w                                             float64
power_output_f10_w                                            float64
power_output_f25_w                                            float64
power_output_f50_w                                            float64
power_output_f75_w                                            float64
power_output_f90_w                                            float64
power_output_f100_w                                           float64
system_temperature_c                                          float64
nominal_output_efficiency                                     float64
air_temperature_c                                             float64
cloud_cover_total   

In [5]:
# The request_id column seems to hold no information for this analysis, so
# remove it and use forecast_time as the index.
# Both steps are guarded and rebind X to a new frame (no inplace=True), so the
# cell can be re-run on an already-processed X without raising KeyError.
if 'request_id' in X.columns:
    X = X.drop(columns='request_id')
if 'forecast_time' in X.columns:
    X = X.set_index('forecast_time')
# Fixed seed -> the same preview rows on every run.
X.sample(5, random_state=0)

Unnamed: 0_level_0,power_output_w,power_output_f0_w,power_output_f10_w,power_output_f25_w,power_output_f50_w,power_output_f75_w,power_output_f90_w,power_output_f100_w,system_temperature_c,nominal_output_efficiency,...,cloud_cover_total,cloud_cover_high,cloud_cover_medium,cloud_cover_low,system_radiation_global_wm2,system_radiation_direct_wm2,system_radiation_diffuse_wm2,radiation_global_wm2,radiation_direct_wm2,radiation_diffuse_wm2
forecast_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-21 23:00:00+03:00,,,,,,,,,7.1,,...,100.0,100.0,98.8,100.0,,,,0.0,0.0,0.0
2020-10-15 06:00:00+03:00,,,,,,,,,5.1,,...,27.5,0.0,0.0,27.5,,,,0.0,0.0,0.0
2020-10-22 00:00:00+03:00,,,,,,,,,7.5,,...,100.0,100.0,68.2,100.0,,,,0.0,0.0,0.0
2020-10-15 23:00:00+03:00,,,,,,,,,3.45,,...,100.0,100.0,79.9,0.1,,,,0.0,0.0,0.0
2020-10-07 19:00:00+03:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.96,0.0,...,100.0,47.3,100.0,0.1,0.5,0.0,0.5,0.51,0.01,0.51


## 2. Explore Data

In [6]:
# Report the frame's dimensions, then the count of missing values per column.
print(f'Shape = {X.shape}')
print('\nNaN value at each column:')
print(X.isnull().sum())

Shape = (330, 21)

NaN value at each column:
power_output_w                  195
power_output_f0_w               213
power_output_f10_w              213
power_output_f25_w              213
power_output_f50_w              213
power_output_f75_w              213
power_output_f90_w              213
power_output_f100_w             213
system_temperature_c              0
nominal_output_efficiency       195
air_temperature_c                 0
cloud_cover_total                 0
cloud_cover_high                  0
cloud_cover_medium                0
cloud_cover_low                   0
system_radiation_global_wm2     195
system_radiation_direct_wm2     195
system_radiation_diffuse_wm2    195
radiation_global_wm2              0
radiation_direct_wm2              0
radiation_diffuse_wm2             0
dtype: int64


There are 330 rows in total, but many of them have missing values (for example, `power_output_w` is missing in 195 rows).

The rows are not in chronological order, so they need to be sorted by `forecast_time`.

In [7]:
# Put the rows in chronological order; forecast_time is the index level here.
X = X.sort_values(by='forecast_time', ascending=True)

In [8]:
# Summary statistics for every numeric column, rounded to three decimals.
X.describe().round(decimals=3)

Unnamed: 0,power_output_w,power_output_f0_w,power_output_f10_w,power_output_f25_w,power_output_f50_w,power_output_f75_w,power_output_f90_w,power_output_f100_w,system_temperature_c,nominal_output_efficiency,...,cloud_cover_total,cloud_cover_high,cloud_cover_medium,cloud_cover_low,system_radiation_global_wm2,system_radiation_direct_wm2,system_radiation_diffuse_wm2,radiation_global_wm2,radiation_direct_wm2,radiation_diffuse_wm2
count,135.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,330.0,135.0,...,330.0,330.0,330.0,330.0,135.0,135.0,135.0,330.0,330.0,330.0
mean,2620.769,1005.557,1425.832,1928.747,2533.238,3122.935,3479.615,3718.249,9.749,0.738,...,72.67,41.888,31.789,51.182,139.237,24.556,114.352,50.876,22.888,43.78
std,2460.875,1504.317,1699.427,1937.933,2231.488,2542.896,2715.003,2811.739,5.165,0.315,...,36.554,44.123,43.148,41.459,119.367,34.203,89.91,88.537,51.137,73.66
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.6,0.0,...,0.0,0.0,0.0,0.0,0.13,0.0,0.13,0.0,0.0,0.0
25%,427.5,107.8,171.4,258.0,399.0,541.5,716.8,903.6,6.2,0.629,...,43.2,0.0,0.0,5.65,34.005,0.035,33.41,0.0,0.0,0.0
50%,2020.6,407.4,886.1,1508.9,2201.7,2793.9,3395.2,3779.5,9.175,0.888,...,96.4,17.25,0.3,47.0,114.22,6.92,102.5,0.0,0.0,0.0
75%,4533.8,1222.8,1936.5,2918.1,4058.4,5349.5,5920.9,6484.4,13.435,0.97,...,100.0,96.525,90.7,98.6,235.005,31.895,199.915,61.8,12.635,58.892
max,8327.5,6959.5,7757.5,7822.9,8157.7,8295.6,8354.3,8384.9,21.83,1.034,...,100.0,100.0,100.0,100.0,407.28,132.8,362.73,326.25,246.78,319.94


## 3. Visualizing Data