In [1]:
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline
plt.style.use('ggplot')


%store -r WORKDIR

if 'WORKDIR' not in dir():
    WORKDIR = 'C:/Users/thewr/git/mit_data_science.git/'


data_raw_file = WORKDIR + '/Data/Raw/energy_consumption_data_modeling.parquet'
data_proc_file = WORKDIR + '/Data/Processed/energy_consumption_data_modeling.parquet'

dataset_max_date  = '2006-12-30'
dataset_min_date = '2006-01-01'


pd.plotting.register_matplotlib_converters()

# Leitura dos Dados 

In [2]:
# Load the raw parquet file and show its size, column labels and first rows.
pjme = pd.read_parquet(data_raw_file)

for label, value in (('shape:', pjme.shape), ('columns:', pjme.columns)):
    print(label, value)

pjme.head(5)

shape: (145366, 2)
columns: Index(['Datetime', 'PJME_MW'], dtype='object')


Unnamed: 0,Datetime,PJME_MW
0,2002-01-01 01:00:00,30393
1,2002-01-01 02:00:00,29265
2,2002-01-01 03:00:00,28357
3,2002-01-01 04:00:00,27899
4,2002-01-01 05:00:00,28057


In [3]:
# Move the 'Datetime' column into the index (the column itself is dropped).
pjme = pjme.set_index(keys='Datetime', drop=True)

In [4]:
# Preview the first five rows now that 'Datetime' is the index.
pjme.head(5)

Unnamed: 0_level_0,PJME_MW
Datetime,Unnamed: 1_level_1
2002-01-01 01:00:00,30393
2002-01-01 02:00:00,29265
2002-01-01 03:00:00,28357
2002-01-01 04:00:00,27899
2002-01-01 05:00:00,28057


In [5]:
# Keep only the rows inside the half-open window
# [dataset_min_date, dataset_max_date).
#
# The bounds are compared as timestamps rather than against the raw index
# values. The displayed output of the original strict '>' / '<' comparison
# includes 2006-01-01 00:00:00, which suggests the index holds strings at this
# point and was being compared lexicographically; with a datetime index the
# same expression would silently drop that first row. Parsing the index and
# using an explicit '>=' lower bound yields the same window for either dtype.
_stamps = pd.to_datetime(pjme.index)
_in_window = (
    (_stamps >= pd.Timestamp(dataset_min_date))
    & (_stamps < pd.Timestamp(dataset_max_date))
)
# Boolean-mask selection leaves the index values themselves untouched.
pjme = pjme[_in_window]

In [6]:
# First five rows after filtering: checks the lower edge of the window.
pjme.head(5)

Unnamed: 0_level_0,PJME_MW
Datetime,Unnamed: 1_level_1
2006-01-01 00:00:00,30293
2006-01-01 01:00:00,28884
2006-01-01 02:00:00,27556
2006-01-01 03:00:00,26484
2006-01-01 04:00:00,25822


In [7]:
# Last five rows after filtering: checks the upper edge of the window.
pjme.tail(5)

Unnamed: 0_level_0,PJME_MW
Datetime,Unnamed: 1_level_1
2006-12-29 19:00:00,36728
2006-12-29 20:00:00,36168
2006-12-29 21:00:00,35405
2006-12-29 22:00:00,34117
2006-12-29 23:00:00,32157


In [8]:
# Turn the 'Datetime' index back into a regular column with a fresh integer index.
pjme = pjme.reset_index(drop=False)

In [9]:
# Cast 'Datetime' to datetime64[ns]; every other column is carried over as-is.
pjme = pjme.assign(Datetime=pjme['Datetime'].astype('datetime64[ns]'))

In [10]:
# Rows whose 'Datetime' repeats an earlier row (the first occurrence is not flagged).
_repeated_stamp = pjme.duplicated(subset='Datetime')
pjme_duplicados = pjme.loc[_repeated_stamp]

In [11]:
# Per-column count of duplicated rows; all zeros means no repeated timestamps.
pjme_duplicados.count(axis=0)

Datetime    0
PJME_MW     0
dtype: int64

In [12]:
# Keep a single row per timestamp -- the first one encountered (pandas' default).
pjme = pjme.drop_duplicates(subset='Datetime')

In [13]:
# Re-run the duplicate search after de-duplication as a sanity check.
_still_repeated = pjme.duplicated(subset='Datetime')
pjme_duplicados = pjme.loc[_still_repeated]

In [14]:
# Per-column count of rows still flagged as duplicates (expected: zero).
pjme_duplicados.count(axis=0)


Datetime    0
PJME_MW     0
dtype: int64

# Estatística dos dados

In [57]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric column.
pjme.describe()

Unnamed: 0,PJME_MW
count,8710.0
mean,32425.722618
std,6597.217655
min,19690.0
25%,28016.0
50%,31938.5
75%,35368.0
max,62009.0


In [58]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
pjme.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8710 entries, 0 to 8709
Data columns (total 2 columns):
Datetime    8710 non-null datetime64[ns]
PJME_MW     8710 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 204.1 KB


# Correção do tipo dos dados

In [59]:
# Enforce the expected dtype of every column.
#
# 'Datetime' was already converted in an earlier cell, so this cast is
# presumably a no-op kept as a guard in case this cell is run on its own.
pjme['Datetime'] = pjme['Datetime'].astype('datetime64[ns]')

# Target dtype for each value column; add entries here to cast more columns.
data_types = {
    'PJME_MW': float
}
for cname, dtype in data_types.items():
    if dtype == int:
        # Blank strings cannot be cast to int, so replace them with '0' first.
        # Fix: the mask used to be built from an undefined name (`data`),
        # which raised NameError as soon as an int target was configured.
        pjme.loc[pjme[cname].astype(str) == '', cname] = '0'

    pjme[cname] = pjme[cname].astype(dtype)

# Show the summary statistics again to confirm the values are unchanged.
pjme.describe()

Unnamed: 0,PJME_MW
count,8710.0
mean,32425.722618
std,6597.217655
min,19690.0
25%,28016.0
50%,31938.5
75%,35368.0
max,62009.0


In [60]:
# Print the frame summary again to confirm the column dtypes after the cast.
pjme.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8710 entries, 0 to 8709
Data columns (total 2 columns):
Datetime    8710 non-null datetime64[ns]
PJME_MW     8710 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 204.1 KB


In [61]:
# Last five rows after the dtype correction.
pjme.tail(5)

Unnamed: 0,Datetime,PJME_MW
8705,2006-12-29 19:00:00,36728.0
8706,2006-12-29 20:00:00,36168.0
8707,2006-12-29 21:00:00,35405.0
8708,2006-12-29 22:00:00,34117.0
8709,2006-12-29 23:00:00,32157.0


# Verificar Valores Nulos

In [62]:
# Number of missing values per column (isna is pandas' alias for isnull).
pjme.isna().sum()

Datetime    0
PJME_MW     0
dtype: int64

# Exportar Base de Dados 

In [63]:
# Write the cleaned frame to the processed-data parquet path set in the setup cell.
pjme.to_parquet(data_proc_file)