In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import warnings 
import pickle
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

In [2]:
data = pd.read_csv("./data/household_power_consumption.txt", sep=";",parse_dates={'datetime' : ['Date', 'Time']}, infer_datetime_format=True, low_memory=False, na_values=['nan','?'],index_col='datetime')
df = data.sample(n=15000, random_state=42)
# df.drop('index', axis=1, inplace=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15000 entries, 2010-07-07 18:10:00 to 2009-07-24 14:04:00
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Global_active_power    14803 non-null  float64
 1   Global_reactive_power  14803 non-null  float64
 2   Voltage                14803 non-null  float64
 3   Global_intensity       14803 non-null  float64
 4   Sub_metering_1         14803 non-null  float64
 5   Sub_metering_2         14803 non-null  float64
 6   Sub_metering_3         14803 non-null  float64
dtypes: float64(7)
memory usage: 937.5 KB


In [4]:
df.isnull().sum()

Global_active_power      197
Global_reactive_power    197
Voltage                  197
Global_intensity         197
Sub_metering_1           197
Sub_metering_2           197
Sub_metering_3           197
dtype: int64

In [5]:
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column] = df[column].fillna(df[column].mean())

df.isnull().sum()

Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64

In [10]:
df.head() 



Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-07-07 18:10:00,0.256,0.106,242.0,1.2,0.0,0.0,1.0
2007-05-14 06:50:00,0.466,0.352,237.22,2.4,0.0,2.0,0.0
2007-09-26 18:10:00,0.758,0.194,238.66,3.2,0.0,1.0,0.0
2007-06-19 07:30:00,1.29,0.046,240.64,5.4,1.0,0.0,18.0
2010-05-10 04:43:00,0.428,0.202,242.23,1.8,0.0,2.0,1.0


In [11]:
eq1 = (df['Global_active_power']*1000/60)
eq2 = df['Sub_metering_1']+df['Sub_metering_2']+df['Sub_metering_3']

df['Power_Consumption'] = eq1 - eq2

df.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Power_Consumption
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-07-07 18:10:00,0.256,0.106,242.0,1.2,0.0,0.0,1.0,3.266667
2007-05-14 06:50:00,0.466,0.352,237.22,2.4,0.0,2.0,0.0,5.766667
2007-09-26 18:10:00,0.758,0.194,238.66,3.2,0.0,1.0,0.0,11.633333
2007-06-19 07:30:00,1.29,0.046,240.64,5.4,1.0,0.0,18.0,2.5
2010-05-10 04:43:00,0.428,0.202,242.23,1.8,0.0,2.0,1.0,4.133333


In [17]:
scaler = StandardScaler()
X = df.drop('Power_Consumption', axis=1)
y = df['Power_Consumption']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)


In [19]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

LinearRegression()

In [22]:
print(linreg.intercept_)

9.19714997574942


In [25]:
linreg_pred = linreg.predict(X_test)


In [26]:
print(f"Mean Squared Error:{mean_squared_error(y_test,linreg_pred)}")
print(f"Mean Absolute Error:{mean_absolute_error(y_test,linreg_pred)}")
print(f"RMSE : {np.sqrt(mean_squared_error(y_test,linreg_pred))}")

Mean Squared Error:1.6228953173002127e-28
Mean Absolute Error:9.075394052355949e-15
RMSE : 1.2739290864487758e-14


In [27]:
score = r2_score(y_test, linreg_pred)
print(score)

1.0


In [28]:
1 - (1-score) * (len(y_test)-1) / (len(y_test)-X_test.shape[1]-1)

1.0