In [1]:
import pandas as pd 
import numpy as np 
data = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

In [2]:
energy_df=pd.DataFrame(data)

print("Original with missing values : ")
energy_df.head()

Original with missing values : 


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,,400.0
2,Hydropower,2900.0,
3,Geothermal,,150.0
4,Biomass,2500.0,250.0


In [3]:
cleaned_df=energy_df.dropna()
print("Afte the nan values are dropped : ")
cleaned_df.head()

Afte the nan values are dropped : 


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
4,Biomass,2500.0,250.0


In [4]:
ec_mean=energy_df["Energy Consumption (MWh)"].mean()
print("Mean of Energy consumtion : ",ec_mean)
cost_mean=energy_df["Cost (Million $)"].mean()
print("Mean of cost : ",cost_mean)

Mean of Energy consumtion :  2450.0
Mean of cost :  250.0


In [15]:
energy_df["Energy Consumption (MWh)"].fillna(ec_mean,inplace=True)
energy_df["Cost (Million $)"].fillna(cost_mean,inplace=True)
energy_df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Energy Consumption (MWh)"].fillna(ec_mean,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  energy_df["Cost (Million $)"].fillna(cost_mean,inplace=True)


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [16]:
data2 = {
    "Energy Source": ["Solar", "Wind", "Hydropower", "Geothermal", "Biomass", "Nuclear"],
    "Energy Consumption (MWh)": [1200, np.nan, 2900, np.nan, 2500, 3200],
    "Cost (Million $)": [200, 400, np.nan, 150, 250, np.nan]
}

In [17]:

fwd_fil_fd=energy_df.fillna(method="ffill")
print("After the data is filled by forward fill")
fwd_fil_fd.head()

After the data is filled by forward fill


  fwd_fil_fd=energy_df.fillna(method="ffill")


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [18]:
new_energy_df=pd.DataFrame(data2)

# Create a flag column indicating missing values in 'Energy Consumption (MWh)'
new_energy_df["Missing Consumption"] = new_energy_df["Energy Consumption (MWh)"].isna().astype(int)
new_energy_df["Missing Cost"] = new_energy_df["Cost (Million $)"].isna().astype(int)

print("\nData with Missing Values Flagged:")
new_energy_df.head()


Data with Missing Values Flagged:


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $),Missing Consumption,Missing Cost
0,Solar,1200.0,200.0,0,0
1,Wind,,400.0,1,0
2,Hydropower,2900.0,,0,1
3,Geothermal,,150.0,1,0
4,Biomass,2500.0,250.0,0,0


In [20]:
arr=[]
for x in energy_df["Energy Consumption (MWh)"] :
    print(x)
    # x1 = (x - energy_df["Energy Consumption (MWh)"].min())/(energy_df["Energy Consumption (MWh)"].min()- energy_df["Energy Consumption (MWh)"].min())
    arr.append(x)
print(arr)

1200.0
2450.0
2900.0
2450.0
2500.0
3200.0
[1200.0, 2450.0, 2900.0, 2450.0, 2500.0, 3200.0]


In [21]:
from sklearn.preprocessing import MinMaxScaler
scalar=MinMaxScaler()
energy_df[["Energy Consumption (MWh)","Cost (Million $)"]]=scalar.fit_transform(
    energy_df[["Energy Consumption (MWh)","Cost (Million $)"]]
)
print("After normalisation : ")
energy_df.head()

After normalisation : 


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,0.0,0.2
1,Wind,0.625,1.0
2,Hydropower,0.85,0.4
3,Geothermal,0.625,0.0
4,Biomass,0.65,0.4


In [22]:
from sklearn.preprocessing import StandardScaler

scalar=StandardScaler()
fwd_fil_fd.head()


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,1200.0,200.0
1,Wind,2450.0,400.0
2,Hydropower,2900.0,250.0
3,Geothermal,2450.0,150.0
4,Biomass,2500.0,250.0


In [23]:
fwd_fil_fd[["Energy Consumption (MWh)","Cost (Million $)"]]=scalar.fit_transform(
    fwd_fil_fd[["Energy Consumption (MWh)","Cost (Million $)"]]
)
print("After normalisation : ")
fwd_fil_fd.head()

After normalisation : 


Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,-2.005893,-0.654654
1,Wind,0.0,1.963961
2,Hydropower,0.722121,0.0
3,Geothermal,0.0,-1.309307
4,Biomass,0.080236,0.0


In [24]:
energy_df.head() # min max scaled 

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,0.0,0.2
1,Wind,0.625,1.0
2,Hydropower,0.85,0.4
3,Geothermal,0.625,0.0
4,Biomass,0.65,0.4


In [25]:
fwd_fil_fd.head() # std scaled  scaled 

Unnamed: 0,Energy Source,Energy Consumption (MWh),Cost (Million $)
0,Solar,-2.005893,-0.654654
1,Wind,0.0,1.963961
2,Hydropower,0.722121,0.0
3,Geothermal,0.0,-1.309307
4,Biomass,0.080236,0.0


In [26]:
energy_encoded_df=pd.get_dummies(energy_df,columns=["Energy Source"]) # one hot encoder 
energy_encoded_df.head()

Unnamed: 0,Energy Consumption (MWh),Cost (Million $),Energy Source_Biomass,Energy Source_Geothermal,Energy Source_Hydropower,Energy Source_Nuclear,Energy Source_Solar,Energy Source_Wind
0,0.0,0.2,False,False,False,False,True,False
1,0.625,1.0,False,False,False,False,False,True
2,0.85,0.4,False,False,True,False,False,False
3,0.625,0.0,False,True,False,False,False,False
4,0.65,0.4,True,False,False,False,False,False
