**Data Cleaning and Pre-processing**



---

This notebook cleans and preprocesses the daily energy demand data in the following steps:

- Checking for Missing Values
- Checking for Null Values
- Splitting the data into training and validation sets
- Saving the resulting sets to CSV



---



# Importing Dependencies

In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Loading the Data

In [109]:
# Read the raw daily energy-demand CSV into a DataFrame named `data`.
# The file is addressed by an absolute path, so it must exist at exactly this location.
data = pd.read_csv(filepath_or_buffer='/content/DailyEnergyDemandData.csv')

In [110]:
# Show (number of rows, number of columns) of the freshly loaded frame.
data.shape

(2440, 2)

In [111]:
# Display the loaded frame for a visual check of its contents.
data

Unnamed: 0,Date,Demand
0,2015-07-01,20243
1,2015-07-02,19275
2,2015-07-03,17682
3,2015-07-04,16523
4,2015-07-05,17620
...,...,...
2435,2022-03-01,18069
2436,2022-03-02,16941
2437,2022-03-03,17313
2438,2022-03-04,17953


In [112]:
# Convert 'Demand' to a numeric dtype.
# pd.to_numeric(errors='coerce') is used rather than astype(int) because this
# cell runs before the placeholder clean-up further down: astype(int) raises
# ValueError on entries such as '?' or '-', whereas coercion turns anything
# unparseable into NaN so the later isna()/dropna() cells can count and drop it.
# When every value is already integer-like the resulting dtype is still int64.
data['Demand'] = pd.to_numeric(data['Demand'], errors='coerce')

In [113]:
# Display the frame again after the 'Demand' conversion above.
data

Unnamed: 0,Date,Demand
0,2015-07-01,20243
1,2015-07-02,19275
2,2015-07-03,17682
3,2015-07-04,16523
4,2015-07-05,17620
...,...,...
2435,2022-03-01,18069
2436,2022-03-02,16941
2437,2022-03-03,17313
2438,2022-03-04,17953


In [114]:
# Label the frame's two columns positionally: first 'Date', then 'Demand'.
column_labels = ['Date', 'Demand']
data.columns = column_labels


# Data Cleaning



> Removing Missing and Null Values





In [115]:
# Count missing entries per column before any placeholder handling.
data.isna().sum()

Date      0
Demand    0
dtype: int64

In [116]:
# Map common "missing value" placeholder tokens to NaN so that the
# isna()/dropna() cells that follow treat them as missing.
placeholder_tokens = ['?', '', ' ', '-', '--']
data = data.replace(to_replace=placeholder_tokens, value=np.nan)

In [117]:
# Re-count missing entries per column now that placeholder tokens have been
# converted to NaN by the previous cell.
data.isna().sum()

Date      0
Demand    0
dtype: int64

In [118]:
# Remove, in place, every row that has a missing value in any column.
data.dropna(axis='index', how='any', inplace=True)

# Data Preprocessing





## Split Data into Training and Validation Sets

In [121]:
# Parse the 'Date' column into datetime values; the year-based split below
# relies on the .dt accessor, which requires a datetime dtype.
data = data.assign(Date=pd.to_datetime(data['Date']))

In [123]:
# Hold out every 2022 row as the validation set; all other rows form the
# training set. The year mask is computed once and negated for the training rows.
in_validation_year = data['Date'].dt.year.eq(2022)
dataset = data.loc[~in_validation_year]
validation = data.loc[in_validation_year]

## Set 'Date' Column as Index

In [124]:
# Re-index both subsets by their 'Date' column (the column is moved into the index).
dataset, validation = (
    frame.set_index('Date') for frame in (dataset, validation)
)

## Converting Data into Series

In [125]:
# Collapse the single-column training frame into a Series.
# The axis is given explicitly because a bare squeeze() also collapses the row
# axis: a frame holding exactly one row would come back as a scalar rather
# than a one-element Series, breaking the len() and to_csv() calls further down.
dataset = dataset.squeeze(axis='columns')
dataset

Date
2015-07-01    20243
2015-07-02    19275
2015-07-03    17682
2015-07-04    16523
2015-07-05    17620
              ...  
2021-12-27    17661
2021-12-28    17164
2021-12-29    17162
2021-12-30    16732
2021-12-31    15852
Name: Demand, Length: 2376, dtype: int64

In [126]:
# Collapse the single-column validation frame into a Series.
# The axis is given explicitly because a bare squeeze() also collapses the row
# axis: a frame holding exactly one row would come back as a scalar rather
# than a one-element Series, breaking the len() and to_csv() calls further down.
validation = validation.squeeze(axis='columns')
validation

Date
2022-01-01    15223
2022-01-02    15914
2022-01-03    18620
2022-01-04    18918
2022-01-05    18288
              ...  
2022-03-01    18069
2022-03-02    16941
2022-03-03    17313
2022-03-04    17953
2022-03-05    16260
Name: Demand, Length: 64, dtype: int64

In [127]:
# Report how many observations ended up in each subset.
split_sizes = (len(dataset), len(validation))
print('Dataset %d, Validation %d' % split_sizes)


Dataset 2376, Validation 64


# Saving Cleaned Data to CSV

In [129]:
# Write each subset to its own CSV file in the current working directory.
# Only the two splits are saved; the combined cleaned frame is not written out.
for series, filename in (
    (dataset, 'energydemand_training.csv'),
    (validation, 'energydemand_validation.csv'),
):
    series.to_csv(filename)