In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Toy single-column frame (with repeated cities) for the one-column encoding demo.
df = pd.Series(
    ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Chicago', 'Houston'],
    name='City',
).to_frame()
# Also load the cars dataset for this demo; "cars.csv" is resolved relative
# to the notebook's working directory.
df_cars = pd.read_csv(filepath_or_buffer="cars.csv")

In [2]:
# Preview the first five rows of the cars data.
df_cars.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [3]:
# Keep only the columns whose dtype is object or category; these are the
# ones that will be one-hot encoded.
df_cars_categorical = df_cars.select_dtypes(
    include=['object', 'category'],
)

In [4]:
# Preview the first five rows of the categorical-only frame.
df_cars_categorical.head(n=5)

Unnamed: 0,brand,fuel,owner
0,Maruti,Diesel,First Owner
1,Skoda,Diesel,Second Owner
2,Honda,Petrol,Third Owner
3,Hyundai,Diesel,First Owner
4,Maruti,Petrol,First Owner


In [5]:
from sklearn.preprocessing import OneHotEncoder
#The OneHotEncoder class is a fit/transform object, so it plugs directly into ML pipelines.
#That makes it a better choice than the pandas get_dummies() function
#when implementing ML pipelines,
#which we will learn in next week's demos.

In [7]:
# Configure the encoder: dense (non-sparse) output, and drop the first
# category of every column.
encoder = OneHotEncoder(drop='first', sparse_output=False)
# First fit() the OHE on the categorical frame.
encoder.fit(X=df_cars_categorical)

In [8]:
# Now use the fitted OHE to transform the categorical frame.
# Before training ML models we can perform the train/test split first and
# then use this same "encoder" object to transform the test data, which
# streamlines the pre-processing steps for ML model deployment.
encoded_data = encoder.transform(X=df_cars_categorical)

In [9]:
encoded_data  # result of encoder.transform(); it's a NumPy array, not a DataFrame

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
# Wrap the encoded array in a DataFrame for better readability.
df_cars_categorical_encoded = pd.DataFrame(data=encoded_data)

In [11]:
# No column names were passed, so the columns are just 0, 1, 2, ...
# (there are no feature names).
df_cars_categorical_encoded.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
# Now you will get feature names too.
# The encoder was fitted on a DataFrame, so it already knows the input column
# names; get_feature_names_out() with no argument derives the output names
# from them. This avoids hard-coding ['brand', 'fuel', 'owner'], which would
# raise a ValueError if the categorical columns of the CSV ever differ.
# Reusing the source index keeps the encoded rows aligned with
# df_cars_categorical for any later join/concat.
df_encoded_col_name = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
    index=df_cars_categorical.index,
)

In [13]:
# Preview the first five encoded rows, now with readable feature names.
df_encoded_col_name.head(n=5)

Unnamed: 0,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,brand_Force,brand_Ford,brand_Honda,...,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
# Show the toy City frame before it is encoded.
df

Unnamed: 0,City
0,New York
1,Los Angeles
2,Chicago
3,Houston
4,Chicago
5,Houston


In [16]:
# Demo of OHE for just one column, or a list of columns: [['col1','col2',...]].
# NOTE: this rebinds `encoder` and `encoded_data`, replacing the objects
# created for the cars data earlier in the notebook.
#
# drop='first' avoids the dummy variable trap.
# sparse_output=False is used because the older sparse=False parameter of
# sklearn.preprocessing.OneHotEncoder was deprecated starting from
# Scikit-Learn v1.2.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_data = encoder.fit_transform(X=df[['City']])

In [17]:
# Encoded City column as a NumPy array: one dummy column per city category,
# minus the first category dropped by drop='first'.
encoded_data

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 0., 0.],
       [1., 0., 0.]])

In [18]:
# Convert the encoded array to a DataFrame.
# - Column names come from the fitted encoder (it was fitted on a DataFrame,
#   so it already knows the input column is 'City').
# - index=df.index keeps the dummy rows aligned with df, because join()
#   matches rows on the index.
df_encoded = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
    index=df.index,
)

# Drop any dummy columns left over from a previous run of this cell before
# joining; otherwise re-running raises "columns overlap but no suffix
# specified". This makes the cell idempotent.
df = df.drop(columns=df_encoded.columns, errors='ignore').join(df_encoded)

print(df)

          City  City_Houston  City_Los Angeles  City_New York
0     New York           0.0               0.0            1.0
1  Los Angeles           0.0               1.0            0.0
2      Chicago           0.0               0.0            0.0
3      Houston           1.0               0.0            0.0
4      Chicago           0.0               0.0            0.0
5      Houston           1.0               0.0            0.0
