<a href="https://colab.research.google.com/github/wangyouzhong/MLStudy/blob/master/Power_Prediction_MultiTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Data

In [1]:
import io
import pandas as pd

from google.colab import files

import numpy as np
from datetime import date, timedelta

In [2]:
def data_preprocess(df, level):
  """Rename the columns of a raw table and min-max scale its 96 time buckets.

  The frame is modified in place and also returned.

  Args:
    df: DataFrame with exactly 98 columns: an id, a date, then 96 values.
    level: Suffix written into the bucket column names (`TB{i}_L{level}`).

  Returns:
    A tuple `(df, df_max, df_min)`: the scaled frame, plus the column-wise
    maximum and minimum taken before scaling (needed to undo the scaling).
  """
  bucket_cols = [f'TB{i}_L{level}' for i in range(1, 97)]
  df.columns = ['Id', 'Date'] + bucket_cols
  df['Date'] = pd.to_datetime(df['Date']).dt.date
  df_max = df.max()
  df_min = df.min()
  for col in bucket_cols:
    col_min = df_min[col]
    col_range = df_max[col] - col_min
    # A constant column has zero range; dividing by it would turn the whole
    # column into NaN. Divide by 1 instead so the column becomes all zeros,
    # which still inverts correctly through min + value * (max - min).
    if col_range == 0:
      col_range = 1
    df[col] = (df[col] - col_min) / col_range
  return df, df_max, df_min

def data_shift_days(df, days_delta, level):
  """Move every date forward by `days_delta` days and tag the bucket columns.

  The frame is changed in place and returned. Bucket columns are renamed to
  `TB{i}_L{level}_d{days_delta}`; the first two columns stay `Id` and `Date`.
  """
  offset = pd.Timedelta(days=days_delta)
  renamed = ['Id', 'Date']
  for bucket in range(1, 97):
    renamed.append(f'TB{bucket}_L{level}_d{days_delta}')
  df['Date'] = df['Date'] + offset
  df.columns = renamed
  return df

In [3]:
# Ask Colab for a file upload; the next cell looks the file up in `uploaded` by name.
uploaded = files.upload()

Saving 131151000000000965.csv to 131151000000000965 (1).csv


In [4]:
# Level-1 table. header=None makes pandas treat the first row as data.
L1_CSV_NAME = '131151000000000965.csv'
# Do not depend on the exact upload key: if the file arrived under another
# name, use the single uploaded file instead of failing with a bare KeyError.
if L1_CSV_NAME in uploaded:
  l1_csv_bytes = uploaded[L1_CSV_NAME]
elif len(uploaded) == 1:
  l1_csv_bytes = next(iter(uploaded.values()))
else:
  raise KeyError(f'{L1_CSV_NAME!r} not found among uploaded files: {sorted(uploaded)}')
df_l1 = pd.read_csv(io.BytesIO(l1_csv_bytes), header=None)
df_l1, df_l1_max, df_l1_min = data_preprocess(df_l1, 1)

In [5]:
# Second upload; this rebinds `uploaded`, which the next cell indexes by file name.
uploaded = files.upload()

Saving L3.csv to L3 (1).csv


In [6]:
# Level-3 table. Read with the pandas default, so the first row is used as the header.
L3_CSV_NAME = 'L3.csv'
# Do not depend on the exact upload key: if the file arrived under another
# name, use the single uploaded file instead of failing with a bare KeyError.
if L3_CSV_NAME in uploaded:
  l3_csv_bytes = uploaded[L3_CSV_NAME]
elif len(uploaded) == 1:
  l3_csv_bytes = next(iter(uploaded.values()))
else:
  raise KeyError(f'{L3_CSV_NAME!r} not found among uploaded files: {sorted(uploaded)}')
df_l3 = pd.read_csv(io.BytesIO(l3_csv_bytes))
df_l3, df_l3_max, df_l3_min = data_preprocess(df_l3, 3)

In [7]:
def build_train_data(df1, df3, start_ds, end_ds, shift_days, l3_min=None, l3_max=None):
  """Assemble one sample per (Date, Id, time bucket) for the window.

  Steps:
    1. Take the L3 rows dated within [start_ds, end_ds] (both inclusive).
    2. Join the same-day L1 columns on Date.
    3. Join the L1 columns of each of the previous `shift_days` days on Date.
    4. Join the L3 columns of each of the previous `shift_days` days on
       (Date, Id).
    5. Replicate the joined frame once per time bucket `tb` (1..96). In each
       replica the label is the de-normalised `TB{tb}_L3` value, the
       `TB*_L3` columns become a one-hot encoding of `tb`, and the same-day
       `TB*_L1` columns for buckets >= `tb` are set to 0.

  Args:
    df1: Preprocessed L1 frame (`Id`, `Date`, `TB1_L1`..`TB96_L1`).
    df3: Preprocessed L3 frame (`Id`, `Date`, `TB1_L3`..`TB96_L3`).
    start_ds: First date of the window (inclusive).
    end_ds: Last date of the window (inclusive).
    shift_days: Number of previous days to join as extra features.
    l3_min: Per-column minimum used to undo the L3 scaling. Defaults to the
      notebook-level `df_l3_min`.
    l3_max: Per-column maximum used to undo the L3 scaling. Defaults to the
      notebook-level `df_l3_max`.

  Returns:
    DataFrame indexed by (Date, Id) holding the 96 replicas concatenated,
    with `label` as the last column.
  """
  # Keep the old behaviour (notebook globals) when no statistics are passed,
  # but let callers supply the statistics that match their own `df3`.
  if l3_min is None:
    l3_min = df_l3_min
  if l3_max is None:
    l3_max = df_l3_max

  def date_window(df, lag_days=0):
    """Copy the rows of `df` dated within the window moved back by `lag_days`."""
    lag = timedelta(days=lag_days)
    in_window = (df['Date'] >= start_ds - lag) & (df['Date'] <= end_ds - lag)
    return df[in_window].copy()

  # Join L3 data with L1 data
  features = date_window(df3).set_index('Date')
  features = features.join(date_window(df1).set_index('Date').drop(columns=['Id']))

  # Join L3 data with L1 previous days data
  for lag in range(1, shift_days + 1):
    lagged_l1 = data_shift_days(date_window(df1, lag), lag, 1).set_index('Date')
    features = features.join(lagged_l1.drop(columns=['Id']))

  # Join L3 data with L3 previous days data
  features = features.reset_index().set_index(['Date', 'Id'])
  for lag in range(1, shift_days + 1):
    lagged_l3 = data_shift_days(date_window(df3, lag), lag, 3).set_index(['Date', 'Id'])
    features = features.join(lagged_l3)

  # Expand training data for 96 time buckets
  train_data_list = []
  for tb in range(1, 97):
    target_col = f'TB{tb}_L3'
    train_data = features.copy()
    # Read the target before its column is overwritten by the one-hot code.
    train_data['label'] = l3_min[target_col] + train_data[target_col] * (l3_max[target_col] - l3_min[target_col])
    for tb2 in range(1, 97):
      train_data[f'TB{tb2}_L3'] = 1 if tb2 == tb else 0
      if tb2 >= tb:
        # Presumably hides same-day L1 values that are not yet known when
        # bucket `tb` is predicted -- TODO confirm.
        train_data[f'TB{tb2}_L1'] = 0
    train_data_list.append(train_data)
  return pd.concat(train_data_list)

In [8]:
# Training window; build_train_data treats both ends as inclusive.
TRAIN_DS_START, TRAIN_DS_END = date(2019, 2, 1), date(2019, 2, 28)
# Validation window, same convention.
VALIDATE_DS_START, VALIDATE_DS_END = date(2019, 3, 10), date(2019, 3, 13)

# How many preceding days are joined in as extra features.
NUM_PREVIOUS_DAYS = 7

In [9]:
train_data = build_train_data(
    df1=df_l1,
    df3=df_l3,
    start_ds=TRAIN_DS_START,
    end_ds=TRAIN_DS_END,
    shift_days=NUM_PREVIOUS_DAYS,
)
# build_train_data adds 'label' last, so every column before it is a feature.
num_features = train_data.shape[1] - 1
train_x = train_data.values[:, :num_features].astype('float32')
train_y = train_data.values[:, num_features].astype('float32')

In [10]:
validate_data = build_train_data(
    df1=df_l1,
    df3=df_l3,
    start_ds=VALIDATE_DS_START,
    end_ds=VALIDATE_DS_END,
    shift_days=NUM_PREVIOUS_DAYS,
)
# Same layout as the training frame: the first num_features columns, then the label.
validate_x = validate_data.values[:, :num_features].astype('float32')
validate_y = validate_data.values[:, num_features].astype('float32')

# Build a DNN Model

In [11]:
import tensorflow as tf 


In [12]:
# Fully connected regressor: four ReLU hidden layers (512, 128, 128, 128 units),
# each followed by 50% dropout, then a single linear output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(num_features,)))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(1))

In [13]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               786944    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               1

In [14]:
# Train on mean squared error; also report mean absolute error.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.mean_squared_error,
    metrics=['mae'],
)

In [15]:
# Left as a bare expression so the cell still displays the returned History object.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=100,
    epochs=10,
    validation_data=(validate_x, validate_y),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fda200f3b38>

# Evaluate the Model


In [30]:
# Model prediction
predicted_validate_y = model.predict(validate_x, batch_size=100).flatten()
# Remove any prediction column left by an earlier run so the cell can be
# re-executed, then put the fresh predictions in the first column.
validate_data = validate_data.drop(columns=['predicted_value'], errors='ignore')
validate_data.insert(loc=0, column='predicted_value', value=predicted_validate_y)


In [None]:
# Reduce to Date-Id level
# Each row of validate_data is one (Date, Id, time bucket) sample whose bucket
# is one-hot encoded in the TB{i}_L3 columns. Multiplying that one-hot block
# by a per-row value and summing per (Date, Id) lays the value out as one
# column per bucket.
#
# The product is taken positionally on NumPy arrays on purpose: (Date, Id)
# labels repeat 96 times in the index, and a label-aligned Series product
# would pair every row with every other row sharing its key, mixing the
# values of all 96 buckets into each result.
bucket_cols = [f'TB{i}_L3' for i in range(1, 97)]
bucket_onehot = validate_data[bucket_cols].to_numpy()


def spread_over_buckets(value_col, out_cols):
  """Return one row per (Date, Id) with `value_col` laid out across `out_cols`."""
  per_row = validate_data[value_col].to_numpy()[:, None]
  spread = pd.DataFrame(bucket_onehot * per_row, index=validate_data.index, columns=out_cols)
  return spread.groupby(level=['Date', 'Id']).sum()


# Predicted columns first, then the raw (label) columns.
reduced_validated_data = spread_over_buckets(
    'predicted_value', [f'Predicted_{col}' for col in bucket_cols]
).join(spread_over_buckets('label', bucket_cols))
reduced_validated_data

Join predicted value 2
Join predicted value 3
Join predicted value 4


In [None]:
# Show the first (Date, Id) row: predicted and raw values for all time buckets.
reduced_validated_data.iloc[0]

In [None]:
import matplotlib.pyplot as plt

# Position of the (Date, Id) row of reduced_validated_data to plot.
PLOT_ROW = 20

x = list(range(1, 97))
row0 = reduced_validated_data.iloc[PLOT_ROW]
y0 = [row0[f'TB{i}_L3'] for i in x]
y1 = [row0[f'Predicted_TB{i}_L3'] for i in x]

# Label the curves so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(x, y0, 'b', label='actual')
ax.plot(x, y1, 'r', label='predicted')
ax.set(
    title=f'Actual vs. predicted L3 values (row {PLOT_ROW})',
    xlabel='Time bucket',
    ylabel='Value',
)
ax.legend()
plt.show()


In [None]:
# Display the reduced frame (one row per Date/Id).
reduced_validated_data