In [1]:
#Importing libraries
import pandas as pd
import os
from datetime import datetime
import numpy as np

In [2]:
#Importing constants used in the analysis
# The working directory is switched to the project's Scripts folder before the import below;
# presumably this is what makes the project-local `constants` module importable - TODO confirm.
# NOTE(review): the absolute path is machine specific, so this cell only runs on that machine.
os.chdir(r'/Users/mac_air/Documents/Documents/Side Projects/Kaggle_Anomaly_Detection/Scripts/')
# IDENTIFIERS is used further down as the set of column labels that get forward filled
# at the start of the missing value treatment.
from constants import IDENTIFIERS


<h1> Merging Data </h1>

<h3> In this step, we are merging generation and weather data to make one ADS </h3>

In [5]:
def merge_data(plant_name, data_root=None):
    '''
    This function creates the ads for a plant by joining its generation data with its
    weather sensor data

    Input:
    1. plant_name: Takes in the plant name as specified in the csv files for both the plants
       (the files read are data/<plant_name>_Generation_Data.csv and
       data/<plant_name>_Weather_Sensor_Data.csv)
    2. data_root: Optional project folder that contains the data/ directory. When left as
       None the original hard-coded project folder is used, so existing calls are unaffected

    Return:
    ads: Generation data left-joined with the weather sensor data on DATE_TIME and PLANT_ID,
         with DATE_TIME parsed to datetimes. The SOURCE_KEY of the generation file is
         renamed to INVERTER_ID and the SOURCE_KEY of the sensor file to PANEL_ID
    '''
    if data_root is None:
        # Original location, kept as the default for backward compatibility
        data_root = r'/Users/mac_air/Documents/Documents/Side Projects/Kaggle_Anomaly_Detection/'
    data_dir = os.path.join(data_root, 'data')

    # reading in the generation file and parsing its day-first timestamps in one vectorised call
    # NOTE(review): assumes every plant's generation file uses "%d-%m-%Y %H:%M" - confirm for each plant
    gen_data = pd.read_csv(os.path.join(data_dir, '{}_Generation_Data.csv'.format(plant_name)))
    gen_data['DATE_TIME'] = pd.to_datetime(gen_data['DATE_TIME'], format="%d-%m-%Y %H:%M")

    # reading in the weather sensor file, whose timestamps are year-first with seconds
    sns_weather_data = pd.read_csv(os.path.join(data_dir, '{}_Weather_Sensor_Data.csv'.format(plant_name)))
    sns_weather_data['DATE_TIME'] = pd.to_datetime(sns_weather_data['DATE_TIME'], format="%Y-%m-%d %H:%M:%S")

    # left join keeps every generation row even when no sensor reading exists for that timestamp;
    # the sensor columns are NaN for such rows
    ads = pd.merge(gen_data, sns_weather_data, on=['DATE_TIME', 'PLANT_ID'], how='left')

    # both files carry a SOURCE_KEY column, so the merge suffixes them with _x (left) and _y (right)
    return ads.rename(columns={'SOURCE_KEY_x': 'INVERTER_ID', 'SOURCE_KEY_y': 'PANEL_ID'})


<h1> Creating target variable </h1>

<h3> In this step we are creating the target variable that is the amount of power generated per 15 min interval </h3>

In [8]:
def create_daily_yield(df):
    '''
    This function creates the per timestamp yield of every inverter from the total yield column

    Input:
    1. df: pandas dataframe with at least the INVERTER_ID and TOTAL_YIELD columns. The
       difference is taken between consecutive rows of the same inverter in the order in
       which they appear in df

    Return:
    1. df: the same dataframe object (it is modified in place) with a PER_TS_YIELD column
       added. The first row of every inverter has no previous reading and is NaN; rows
       whose INVERTER_ID is missing are left as NaN as well
    '''
    # A single grouped diff replaces the earlier "list per group + loop over inverters" approach.
    # The result is aligned on the index, so every row receives the difference to the previous
    # row of its own inverter, and a missing INVERTER_ID no longer raises an IndexError.
    df['PER_TS_YIELD'] = df.groupby('INVERTER_ID')['TOTAL_YIELD'].diff()

    return df


<h1> Feature Generation </h1>

<h3> Creating some helping features </h3>

In [11]:
def create_features(df):
    '''
    This function builds the helper features used by the later steps

    Input:
    1. df: pandas dataframe holding a DATE_TIME column of timestamps plus whatever
       create_daily_yield needs

    Return:
    1. pandas dataframe with the per timestamp yield added by create_daily_yield, the
       DATE_TIME column renamed to DATE, and a TIME_OF_DAY flag (1 = day, 0 = night)
    '''
    def _day_flag(stamp):
        # day covers the hours 6 to 17 inclusive, every other hour counts as night
        if 6 <= stamp.hour < 18:
            return 1
        return 0

    # per timestamp yield first, then the date column gets its shorter name
    with_yield = create_daily_yield(df)
    featured = with_yield.rename(columns={'DATE_TIME': 'DATE'})

    # night is 0, day is 1
    featured['TIME_OF_DAY'] = featured['DATE'].apply(_day_flag)

    return featured


<h1> Missing Value Treatment </h1>

<h3> This is a very important step wherein we performed missing value treatment as well as undertook data cleaning </h3>
<h3> We followed the below steps for MVT and data cleaning </h3>
-----------------------------

<h3> 1. Whenever it is night time, i.e. 6pm to 6am, Irradiation (power from the sun), AC Power, DC Power and the power generated per 15 min interval should be 0 </h3>
<h3> 2. For missing values in Daily Yield, Total Yield, Module Temperature and Ambient Temperature, the last known value is retained using forward fill </h3>
<h3> 3. During day time, if Irradiation or Per TS Yield is missing, then Irradiation, AC Power, DC Power and Per TS Yield are all imputed using the average values of these fields </h3>

In [14]:
def missing_value_treatment(df):
    '''
    This function fills the missing values of one inverter

    Input:
    1. df: pandas dataframe that is subsetted for one inverter ID only. It needs the
       IDENTIFIERS columns, TIME_OF_DAY, IRRADIATION, DC_POWER, AC_POWER, PER_TS_YIELD,
       AMBIENT_TEMPERATURE, MODULE_TEMPERATURE, DAILY_YIELD and TOTAL_YIELD

    Return:
    1. df: the same dataframe object (it is modified in place) with
       - the identifier columns forward filled
       - irradiation, ac/dc power and per timestamp yield set to 0 on night rows
       - temperatures, daily yield and total yield forward filled
       - irradiation, ac/dc power and per timestamp yield replaced by their column average
         on every row where irradiation or per timestamp yield is still missing
    '''
    power_columns = ['IRRADIATION', 'DC_POWER', 'AC_POWER', 'PER_TS_YIELD']
    carried_columns = ['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'DAILY_YIELD', 'TOTAL_YIELD']

    #Forward filling all the identifier columns
    df[IDENTIFIERS] = df[IDENTIFIERS].ffill()

    #At night time, irradiation, ac & dc power are 0
    #.loc with a boolean mask is used because .at only addresses a single row/column cell
    is_night = df['TIME_OF_DAY'] == 0
    df.loc[is_night, power_columns] = 0

    #When there are no values, we assume that daily and total yield remains the same
    df[carried_columns] = df[carried_columns].ffill()

    #all the incidents when irradiation or per time stamp yield is na
    #night rows were set to 0 above, so only day time rows can still be flagged here
    needs_average = df['IRRADIATION'].isna() | df['PER_TS_YIELD'].isna()

    #replacing the flagged rows with the average
    #Making the assumption that during day time irradiation can be the average amount
    #consequently the power generated would also be average, which is why all four columns
    #are overwritten on a flagged row and not only the ones that are missing.
    #The average is taken over every row of df, including the night rows that were set to 0
    for attribute in ['IRRADIATION', 'AC_POWER', 'DC_POWER', 'PER_TS_YIELD']:
        df.loc[needs_average, attribute] = df[attribute].mean()

    return df


<h1> Bringing it all together </h1>

In [16]:
def create_ads():
    '''
    This is a wrapper function for all the other functions in this module to ease ads creation
    in other modules

    Return:
    ads: pandas dataframe holding the merged data of Plant_1 and Plant_2 with the helper
         features added and the missing values treated separately for every inverter.
         Rows are ordered by INVERTER_ID and carry a fresh 0..n-1 index
    '''
    #merging sensor and weather data for plant 1 and plant 2 and stacking the two results
    plant_frames = [merge_data(plant_name) for plant_name in ('Plant_1', 'Plant_2')]
    ads = pd.concat(plant_frames).reset_index(drop=True)

    #creating date and per time stamp yields
    ads = create_features(ads)

    #performing missing value treatment one inverter at a time
    #The groups are iterated explicitly instead of going through groupby().apply(), so the
    #treatment always receives the INVERTER_ID column; each group is copied because the
    #treatment writes into the frame it is given
    treated = [missing_value_treatment(group.copy()) for _, group in ads.groupby('INVERTER_ID')]
    if not treated:
        #nothing to treat (no rows with an inverter id), pd.concat would fail on an empty list
        return ads.reset_index(drop=True)

    return pd.concat(treated).reset_index(drop=True)