In [0]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
pyo.init_notebook_mode()

In [0]:
# Load the semicolon-separated readings as a time series: the 'Date' and 'Time'
# columns are parsed together into a single 'date_time' column, which becomes
# the index. Both 'nan' and '?' are read as missing values.
data = pd.read_csv(
    "/household_power_consumption/household_power_consumption.txt",
    sep=";",
    parse_dates={'date_time': ['Date', 'Time']},
    infer_datetime_format=True,
    index_col='date_time',
    na_values=['nan', '?'],
)

In [0]:
# Preview the first rows of the loaded frame (rendered as the cell's output).
data.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [0]:
 # Report the frame's shape as a (rows, columns) tuple.
 print("[OUT]:dataset shape:",data.shape)

[OUT]:dataset shape: (2075259, 7)


In [0]:
 # DataFrame.info() writes its report straight to stdout and returns None, so it
 # is called on its own line; passing it to print() would emit the report first
 # and then a stray "[OUT]:dataset info: None".
 print("[OUT]:dataset info:")
 data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2075259 entries, 2006-12-16 17:24:00 to 2010-11-26 21:02:00
Data columns (total 7 columns):
Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dtypes: float64(7)
memory usage: 126.7 MB
[OUT]:dataset info: None


In [0]:
# Count missing values per column. sep="\n" puts the Series on its own lines
# without the leading space that print's default " " separator would insert
# before the first row (which misaligns it against the other rows).
print("[OUT]: Null value stats before preprocess:",data.isnull().sum(),sep="\n")

[OUT]: Null value stats before preprocess:
 Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64


In [0]:
# Impute missing readings: each NaN is replaced by the mean of its own column.
# data.mean() yields one mean per column and fillna() matches them by column label.
data = data.fillna(data.mean())


In [0]:
# Re-count missing values per column after imputation. sep="\n" puts the Series
# on its own lines without the leading space that print's default " " separator
# would insert before the first row.
print("[OUT]: Null value stats after preprocess:",data.isnull().sum(),sep="\n")

[OUT]: Null value stats after preprocess:
 Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64


In [0]:
#class to plot resampled summary statistics (mean / standard deviation) of each feature

class plot_summary():
  """Plotly charts of resampled summary statistics, one public method per feature.

  Each public method resamples one column, aggregates it, builds a
  plotly-express figure and displays it with ``fig.show()``; nothing is
  returned.

  Parameters
  ----------
  frame : pandas.DataFrame, optional
      Frame to read the columns from. Its index must support ``.resample``.
      When omitted, the notebook-level ``data`` frame is looked up every time
      a plot method is called, so a later re-assignment of ``data`` is
      picked up.
  """

  # Resample rule -> word used for the period in axis labels and titles.
  _PERIOD_NAMES={'D':'Day','M':'Month'}
  # Resampler method name -> wording used for the statistic in labels and titles.
  _STAT_NAMES={'mean':'Mean','std':'Standard Deviation'}

  def __init__(self,frame=None):
    self._frame=frame

  def _source(self):
    """Return the frame given at construction, or the global ``data`` if none was given."""
    return data if self._frame is None else self._frame

  def _plot_resampled(self,column,label,stat='mean',rule='D',kind='line'):
    """Resample one column, aggregate it and show the resulting chart.

    Parameters
    ----------
    column : str
        Name of the column to read from the source frame.
    label : str
        Human-readable feature name used in the y-axis label and the title.
    stat : str
        Resampler method to call; must be a key of ``_STAT_NAMES``.
    rule : str
        Resample rule; must be a key of ``_PERIOD_NAMES``.
    kind : str
        ``'bar'`` draws with ``px.bar``; any other value draws with ``px.line``.

    Raises
    ------
    KeyError
        If ``rule`` or ``stat`` is not a known key, or ``column`` is missing.
    """
    period=self._PERIOD_NAMES[rule]
    stat_name=self._STAT_NAMES[stat]
    y_name='{} Per {} {}'.format(label,period,stat_name)
    title='{} of {} Resampled over {}'.format(stat_name,label,period)

    resampler=self._source()[column].resample(rule)
    # The datetime index becomes a regular 'Date' column so plotly can use it as x.
    table=getattr(resampler,stat)().rename_axis('Date').reset_index(name=y_name)

    draw=px.bar if kind=='bar' else px.line
    fig=draw(table,x='Date',y=y_name,title=title)
    fig.show()

  def Global_active_power(self):
    """Show the daily mean, then the daily standard deviation, of Global_active_power."""
    self._plot_resampled('Global_active_power','Global Active Power')
    self._plot_resampled('Global_active_power','Global Active Power',stat='std')

  def Reactive_power(self):
    """Show the daily mean of Global_reactive_power."""
    self._plot_resampled('Global_reactive_power','Global Reactive Power')

  def Voltage(self):
    """Show the daily mean of Voltage as a line, then its monthly mean as bars."""
    self._plot_resampled('Voltage','Voltage')
    self._plot_resampled('Voltage','Voltage',rule='M',kind='bar')

  def Global_intensity(self):
    """Show the daily mean of Global_intensity."""
    self._plot_resampled('Global_intensity','Intensity')

  def Sub_metering_1(self):
    """Show the daily mean of Sub_metering_1."""
    self._plot_resampled('Sub_metering_1','Sub_metering_1')

  def Sub_metering_2(self):
    """Show the daily mean of Sub_metering_2."""
    self._plot_resampled('Sub_metering_2','Sub_metering_2')

  def Sub_metering_3(self):
    """Show the daily mean of Sub_metering_3."""
    self._plot_resampled('Sub_metering_3','Sub_metering_3')

# No frame is passed, so this instance plots from the notebook-level `data`.
obj=plot_summary()

    

In [0]:
# Daily mean and daily standard deviation of Global_active_power (two figures).
obj.Global_active_power()

In [0]:
# Daily mean of Global_reactive_power.
obj.Reactive_power()

Voltage remains nearly constant from month to month.

In [17]:
# Daily mean of Voltage as a line chart, then its monthly mean as a bar chart.
obj.Voltage()

In [11]:
# Daily mean of each of the three sub-metering channels, one figure per channel.
obj.Sub_metering_1()
obj.Sub_metering_2()
obj.Sub_metering_3()

In [13]:
# Feature correlation: Spearman (rank-based) coefficient for every pair of
# columns, computed on the frame as-is, i.e. without any resampling.
corr_feat = data.corr(method='spearman')

print(
    "[OUT]: Correlation between features without Sampling",
    corr_feat,
)

[OUT]: Correlation between features without Sampling                        Global_active_power  ...  Sub_metering_3
Global_active_power               1.000000  ...        0.611548
Global_reactive_power             0.266843  ...        0.074071
Voltage                          -0.327121  ...       -0.187378
Global_intensity                  0.995525  ...        0.608248
Sub_metering_1                    0.324621  ...        0.160727
Sub_metering_2                    0.187453  ...        0.054649
Sub_metering_3                    0.611548  ...        1.000000

[7 rows x 7 columns]


In [16]:
#correlation graph of features

# Axis labels come from the correlation matrix itself, so they always match the
# rows and columns of z instead of relying on a hand-typed copy of the column names.
fig=go.Figure(data=go.Heatmap(z=corr_feat.values,
              x=list(corr_feat.columns),
              y=list(corr_feat.index)
             ))
# Give the figure a title so it can be understood on its own.
fig.update_layout(title='Correlation Between Features')
fig.show()

Global_intensity and Global_active_power are almost perfectly correlated (Spearman coefficient ≈ 0.996).