***Importing the necessary libraries to import the dataset***

In [122]:
import pandas as pd
import numpy as np 
import matplotlib as plt


*importing the dataset*

In [123]:
#Exploring the dataset
df=pd.read_csv("/kaggle/input/powerconsumptiondataset/household_power_consumption.csv")
print(df.head())
print(df.info())
print(df.isna().sum())
print(df.describe())
print(df["Global_active_power"])

   index    Date     Time Global_active_power Global_reactive_power Voltage  \
0      0  1/1/07  0:00:00                2.58                 0.136  241.97   
1      1  1/1/07  0:01:00               2.552                   0.1  241.75   
2      2  1/1/07  0:02:00                2.55                   0.1  241.64   
3      3  1/1/07  0:03:00                2.55                   0.1  241.71   
4      4  1/1/07  0:04:00               2.554                   0.1  241.98   

  Global_intensity Sub_metering_1 Sub_metering_2  Sub_metering_3  
0             10.6              0              0             0.0  
1             10.4              0              0             0.0  
2             10.4              0              0             0.0  
3             10.4              0              0             0.0  
4             10.4              0              0             0.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260640 entries, 0 to 260639
Data columns (total 10 columns):
 #   Column 

***Cleaning and preprocessing the data***

In [124]:
# Drop the index column
if "index" in df.columns:
    df.drop("index", axis=1, inplace=True)

# Check for strange or inconsistent values
print("sub_metering1:", df["Sub_metering_1"].unique())
print("submetering2:", df["Sub_metering_2"].unique())
print("submetering3:", df["Sub_metering_3"].unique())

# Replace '?' with None (or np.nan)
df.loc[df["Sub_metering_1"] == "?", "Sub_metering_1"] = None
df.loc[df["Sub_metering_2"] == "?", "Sub_metering_2"] = None

# Optionally, convert to numeric
df["Sub_metering_1"] = pd.to_numeric(df["Sub_metering_1"], errors='coerce')
df["Sub_metering_2"] = pd.to_numeric(df["Sub_metering_2"], errors='coerce')

# Check null values before dropping
print("sub1 nan values before:", df["Sub_metering_1"].isnull().sum())
print("sub2 nan values before:", df["Sub_metering_2"].isnull().sum())

# Drop rows with nulls in those columns
df.dropna(subset=["Sub_metering_1", "Sub_metering_2"], inplace=True)

# Check null values after dropping
print("sub1 nan values after:", df["Sub_metering_1"].isnull().sum())
print("sub2 nan values after:", df["Sub_metering_2"].isnull().sum())
#PreProcessing the data
df["datetime"] = pd.to_datetime(df["Date"] + " " + df["Time"], format="%d/%m/%Y %H:%M:%S", errors="coerce")



# 2. Extract useful numeric features from datetime
df["hour"] = df["datetime"].dt.hour
df["minute"] = df["datetime"].dt.minute
df["day"] = df["datetime"].dt.day
df["month"] = df["datetime"].dt.month
df["year"] = df["datetime"].dt.year
df.dropna(subset=["hour","minute","day","month","year"],inplace=True)

# 3. (Optional) Drop original Date, Time, and datetime columns if not needed
df.drop(["Date", "Time", "datetime"], axis=1, inplace=True)
#converting the other columns to numeric 
df["Global_active_power"]=pd.to_numeric(df["Global_active_power"],errors="coerce")
df["Global_reactive_power"]=pd.to_numeric(df["Global_reactive_power"],errors="coerce")
df["Voltage"]=pd.to_numeric(df["Voltage"],errors="coerce")
df["Global_intensity"]=pd.to_numeric(df["Global_intensity"],errors="coerce")
#Checking all the columns types after preprocessing
print(df.info())

sub_metering1: ['0' '1' '2' '11' '39' '38' '37' '4' '34' '3' '14' '25' '8' '31' '21' '9'
 '35' '36' '13' '28' '12' '17' '6' '7' '29' '10' '22' '24' '20' '5' '27'
 '23' '30' '18' '19' '16' '33' '40' '26' '64' '71' '58' '56' '72' '32'
 '15' '?' '45' '42' '75' '59' '66' '43' '41' '44' '48' '46' '73' '55' '74'
 '60' '70' '76' '68' '63' '67' '47' '65' '51' '50' '69' '78' '77' '57'
 '62' '49' '61' '53' '52']
submetering2: ['0' '1' '2' '6' '39' '38' '10' '13' '4' '5' '8' '37' '29' '48' '72' '73'
 '74' '42' '36' '24' '28' '40' '41' '30' '22' '23' '27' '25' '34' '17'
 '19' '16' '32' '15' '7' '3' '31' '64' '75' '63' '26' '20' '35' '21' '18'
 '14' '12' '33' '9' '50' '65' '47' '43' '11' '67' '69' '70' '62' '71' '46'
 '58' '57' '60' '66' '61' '?' '44' '78' '77' '76' '53' '56' '51' '49' '68'
 '59' '55' '52' '45' '54']
submetering3: [ 0. 16. 18. 17.  9. 12. 19.  5.  1. 14. 15. 13.  4.  6.  8.  7.  3. 11.
 10.  2. nan 20.]
sub1 nan values before: 3771
sub2 nan values before: 3771
sub1 nan values after

***Splitting the data into training and testing***

In [128]:
###### Defining the X and y variables
#Here we are trying to predict the global active power column 
y=df["Global_active_power"]
X=df.drop("Global_active_power",axis=1)

#print(X)
#print(y)
#Splitting the data into training and testing 
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)


***Fitting a simple linear regression model***

In [134]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))


MAE: 0.028143306283951355
MSE: 0.002013313595849925
R² Score: 0.9985509613869029
