In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import warnings 
import pickle
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

In [2]:
data = pd.read_csv("./data/household_power_consumption.txt", sep=";",parse_dates={'datetime' : ['Date', 'Time']}, infer_datetime_format=True, low_memory=False, na_values=['nan','?'],index_col='datetime')
df = data.sample(n=15000, random_state=42)
# df.drop('index', axis=1, inplace=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15000 entries, 2010-07-07 18:10:00 to 2009-07-24 14:04:00
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Global_active_power    14803 non-null  float64
 1   Global_reactive_power  14803 non-null  float64
 2   Voltage                14803 non-null  float64
 3   Global_intensity       14803 non-null  float64
 4   Sub_metering_1         14803 non-null  float64
 5   Sub_metering_2         14803 non-null  float64
 6   Sub_metering_3         14803 non-null  float64
dtypes: float64(7)
memory usage: 937.5 KB


In [4]:
df['time'] = df.index.time
df['date'] = df.index.date

df.sort_values(by=['date','time'])



Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,time,date
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-12-16 17:44:00,5.894,0.000,232.69,25.4,0.0,0.0,16.0,17:44:00,2006-12-16
2006-12-16 18:22:00,2.934,0.000,235.51,12.4,0.0,1.0,17.0,18:22:00,2006-12-16
2006-12-16 20:41:00,3.254,0.074,234.43,13.8,0.0,0.0,17.0,20:41:00,2006-12-16
2006-12-16 22:52:00,0.754,0.086,241.27,4.4,0.0,0.0,0.0,22:52:00,2006-12-16
2006-12-17 00:32:00,2.376,0.056,241.86,9.8,0.0,0.0,0.0,00:32:00,2006-12-17
...,...,...,...,...,...,...,...,...,...
2010-11-25 20:32:00,1.426,0.000,241.32,6.0,0.0,0.0,0.0,20:32:00,2010-11-25
2010-11-26 07:44:00,2.848,0.286,237.52,12.0,0.0,2.0,17.0,07:44:00,2010-11-26
2010-11-26 11:00:00,1.242,0.064,236.71,5.2,0.0,0.0,17.0,11:00:00,2010-11-26
2010-11-26 15:27:00,1.274,0.058,239.85,5.2,0.0,0.0,18.0,15:27:00,2010-11-26


In [7]:
df.drop(['date','time'], axis=1, inplace=True)

for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column] = df[column].fillna(df[column].mean())



In [8]:
eq1 = (df['Global_active_power']*1000/60)
eq2 = df['Sub_metering_1']+df['Sub_metering_2']+df['Sub_metering_3']

df['Power_Consumption'] = eq1 - eq2

df.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Power_Consumption
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-07-07 18:10:00,0.256,0.106,242.0,1.2,0.0,0.0,1.0,3.266667
2007-05-14 06:50:00,0.466,0.352,237.22,2.4,0.0,2.0,0.0,5.766667
2007-09-26 18:10:00,0.758,0.194,238.66,3.2,0.0,1.0,0.0,11.633333
2007-06-19 07:30:00,1.29,0.046,240.64,5.4,1.0,0.0,18.0,2.5
2010-05-10 04:43:00,0.428,0.202,242.23,1.8,0.0,2.0,1.0,4.133333


In [9]:
scaler = StandardScaler()
X = df.drop('Power_Consumption', axis=1)
y = df['Power_Consumption']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)


In [11]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression()

In [13]:
print(linreg.intercept_)

9.41148970520408


In [14]:
linreg_pred = linreg.predict(X_test)


In [15]:
print(f"Mean Squared Error:{mean_squared_error(y_test,linreg_pred)}")
print(f"Mean Absolute Error:{mean_absolute_error(y_test,linreg_pred)}")
print(f"RMSE : {np.sqrt(mean_squared_error(y_test,linreg_pred))}")

Mean Squared Error:5.704470341609371e-29
Mean Absolute Error:6.1081555077639724e-15
RMSE : 7.55279441108347e-15


In [16]:
score = r2_score(y_test, linreg_pred)
print(score)

1.0


In [17]:
1 - (1-score) * (len(y_test)-1) / (len(y_test)-X_test.shape[1]-1)

1.0