# UC Irvine - Energy Efficiency EDA

The goal of this project is to showcase my exploratory data analysis (EDA) skills and to show how insights can be drawn from the data.

#### Import data and set up packages

In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from  IPython.display import display

# Additional packages for further analysis 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

""" COPIED FROM UC IRVINE TO FETCH DATA """
from ucimlrepo import fetch_ucirepo 
  
# Download dataset 242 ("Energy Efficiency") from the UCI ML Repository
energy_efficiency = fetch_ucirepo(id=242)

# Unpack the payload: feature columns into X, target columns into y
X, y = energy_efficiency.data.features, energy_efficiency.data.targets

# Show the dataset-level metadata that came with the download
display(energy_efficiency.metadata)

{'uci_id': 242,
 'name': 'Energy Efficiency',
 'repository_url': 'https://archive.ics.uci.edu/dataset/242/energy+efficiency',
 'data_url': 'https://archive.ics.uci.edu/static/public/242/data.csv',
 'abstract': 'This study looked into assessing the heating load and cooling load requirements of buildings (that is, energy efficiency) as a function of building parameters.',
 'area': 'Computer Science',
 'tasks': ['Classification', 'Regression'],
 'characteristics': ['Multivariate'],
 'num_instances': 768,
 'num_features': 8,
 'feature_types': ['Integer', 'Real'],
 'demographics': [],
 'target_col': ['Y1', 'Y2'],
 'index_col': None,
 'has_missing_values': 'no',
 'missing_values_symbol': None,
 'year_of_dataset_creation': 2012,
 'last_updated': 'Mon Feb 26 2024',
 'dataset_doi': '10.24432/C51307',
 'creators': ['Athanasios Tsanas', 'Angeliki Xifara'],
 'intro_paper': {'ID': 379,
  'type': 'NATIVE',
  'title': 'Accurate quantitative estimation of energy performance of residential buildings us

In [None]:
# Variable information: one row per column of the dataset, giving its short
# code (X1..X8, Y1, Y2), role (Feature/Target), type, human-readable
# description, units and whether it has missing values.
display(energy_efficiency.variables) 

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,X1,Feature,Continuous,,Relative Compactness,,no
1,X2,Feature,Continuous,,Surface Area,,no
2,X3,Feature,Continuous,,Wall Area,,no
3,X4,Feature,Continuous,,Roof Area,,no
4,X5,Feature,Continuous,,Overall Height,,no
5,X6,Feature,Integer,,Orientation,,no
6,X7,Feature,Continuous,,Glazing Area,,no
7,X8,Feature,Integer,,Glazing Area Distribution,,no
8,Y1,Target,Continuous,,Heating Load,,no
9,Y2,Target,Continuous,,Cooling Load,,no


In [19]:
# Place features and targets side by side in a single analysis frame
# (rows are matched on the shared index).
df = pd.concat([X, y], axis="columns")
df

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61


#### Basic Information Check 

In [24]:
# Dimensions as (rows, columns), then a preview of the first five records
print(df.shape)
df.head(n=5)

(768, 10)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1,Y2
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,20.84,28.28


In [None]:
# Summary statistics for every numeric column.
# Transposed so each variable is a row and each statistic a column,
# which is easier to read for a ten-column frame.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
X1,768.0,0.764167,0.105777,0.62,0.6825,0.75,0.83,0.98
X2,768.0,671.708333,88.086116,514.5,606.375,673.75,741.125,808.5
X3,768.0,318.5,43.626481,245.0,294.0,318.5,343.0,416.5
X4,768.0,176.604167,45.16595,110.25,140.875,183.75,220.5,220.5
X5,768.0,5.25,1.75114,3.5,3.5,5.25,7.0,7.0
X6,768.0,3.5,1.118763,2.0,2.75,3.5,4.25,5.0
X7,768.0,0.234375,0.133221,0.0,0.1,0.25,0.4,0.4
X8,768.0,2.8125,1.55096,0.0,1.75,3.0,4.0,5.0
Y1,768.0,22.307201,10.090196,6.01,12.9925,18.95,31.6675,43.1
Y2,768.0,24.58776,9.513306,10.9,15.62,22.08,33.1325,48.03


In [26]:
# Structural overview: column dtypes, non-null counts and memory footprint
print("Check info and missing values: ")
df.info()

# Cross-check with an explicit per-column tally of missing entries
print("Second confirmation on null values: ")
df.isna().sum()

Check info and missing values: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      768 non-null    float64
 1   X2      768 non-null    float64
 2   X3      768 non-null    float64
 3   X4      768 non-null    float64
 4   X5      768 non-null    float64
 5   X6      768 non-null    int64  
 6   X7      768 non-null    float64
 7   X8      768 non-null    int64  
 8   Y1      768 non-null    float64
 9   Y2      768 non-null    float64
dtypes: float64(8), int64(2)
memory usage: 60.1 KB
Second confirmation on null values: 


X1    0
X2    0
X3    0
X4    0
X5    0
X6    0
X7    0
X8    0
Y1    0
Y2    0
dtype: int64

#### Data Cleaning 

In [None]:
# Data-cleaning gate: recompute the per-column count of missing values and
# stop the notebook here if any are present, so that later steps never run
# on incomplete data. The counts are still displayed as the cell's output.
null_counts = df.isnull().sum()
assert null_counts.sum() == 0, (
    f"Missing values found:\n{null_counts[null_counts > 0]}"
)
null_counts

In [29]:
# Build a {short code -> descriptive label} map from the dataset's variable
# table (e.g. 'X1' -> 'Relative Compactness').
# Variables without a description are dropped from the map so that their
# columns keep the original code instead of being relabelled NaN/None.
rename_map = (
    energy_efficiency.variables
    .set_index('name')['description']
    .dropna()
    .to_dict()
)

# rename() returns a new frame; `df` keeps its short column codes.
df_renamed = df.rename(columns=rename_map)

# Two variables sharing one description would make column selection ambiguous,
# so fail loudly rather than continue with duplicate labels.
assert df_renamed.columns.is_unique, "Renaming produced duplicate column labels"

df_renamed


Unnamed: 0,Relative Compactness,Surface Area,Wall Area,Roof Area,Overall Height,Orientation,Glazing Area,Glazing Area Distribution,Heating Load,Cooling Load
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,15.55,21.33
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,15.55,21.33
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,15.55,21.33
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,15.55,21.33
4,0.90,563.5,318.5,122.50,7.0,2,0.0,0,20.84,28.28
...,...,...,...,...,...,...,...,...,...,...
763,0.64,784.0,343.0,220.50,3.5,5,0.4,5,17.88,21.40
764,0.62,808.5,367.5,220.50,3.5,2,0.4,5,16.54,16.88
765,0.62,808.5,367.5,220.50,3.5,3,0.4,5,16.44,17.11
766,0.62,808.5,367.5,220.50,3.5,4,0.4,5,16.48,16.61
