Data encoding is the process of converting categorical variables (non-numeric data) into numerical values so that machine learning algorithms can process them. Most ML models require input data to be numeric because mathematical computations on strings aren’t possible.


In [2]:
""" Implementing One Hot Encoding in Python """

import pandas as pd 
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({
    'language':['nepali','hindi','english','bangali']
})

# create the object or instance of OneHotEncoder
encoder = OneHotEncoder()
encoded = encoder.fit_transform(df[['language']]) #fit_transform need 2D array [n_samples, n_features]
encoded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4 stored elements and shape (4, 4)>

In [3]:
# it produce the output in array
# let's talk about what does it means 
# """
# ['nepali','hindi','english']
# first they are short in alphabetically
# ['english','hindi','nepali']
# now it show in this order
# when use decode it using OneHotEncoder
# english:[1,0,0]
# hindi:[0,1,0]
# nepali:[0,0,1] 
# this means where value is present that value will be 1 and other will be 0
# """
encoded.toarray() 

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])

In [4]:
encoder.get_feature_names_out() 

array(['language_bangali', 'language_english', 'language_hindi',
       'language_nepali'], dtype=object)

In [5]:
# let's see the this data in dataframe
import pandas as pd 
en_df = pd.DataFrame(encoded.toarray(),columns=encoder.get_feature_names_out())
en_df

Unnamed: 0,language_bangali,language_english,language_hindi,language_nepali
0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0


In [6]:
# this is how column and row we can combine in pandas
x = [[1,2,3,4],[11,12,13,14]]
student = ['Bhaskr','Aayush','Manish','Shiva']
d = pd.DataFrame(x,columns=student)
d

Unnamed: 0,Bhaskr,Aayush,Manish,Shiva
0,1,2,3,4
1,11,12,13,14


In [7]:
# now concat the en_df and df
new_df = pd.concat([df,en_df],axis=1)
new_df

Unnamed: 0,language,language_bangali,language_english,language_hindi,language_nepali
0,nepali,0.0,0.0,0.0,1.0
1,hindi,0.0,0.0,1.0,0.0
2,english,0.0,1.0,0.0,0.0
3,bangali,1.0,0.0,0.0,0.0


In [None]:
# sometime we need to add the new data 
# so while adding the new data is should be from this four language ['nepali','hindi','english','bangali']

In [None]:
# Label encoding involves assigning a unique numerical label to each category in the variable.
from sklearn.preprocessing import LabelEncoder
# create the object of LabelEncoder
lb_encoder = LabelEncoder()
df = pd.DataFrame({
    'language':['nepali','hindi','english','bangali']
})

In [None]:
lb_encoded = lb_encoder.fit_transform(df[['language']])
lb_encoded# it assign the unique value to each value 

  y = column_or_1d(y, warn=True)


array([3, 2, 1, 0])

In [15]:
from sklearn.preprocessing import OrdinalEncoder
orl_encoder = OrdinalEncoder()
df = pd.DataFrame({
    'rank':['first','second','third']
})
orl_encoded = orl_encoder.fit_transform(df[['rank']])
orl_encoded

array([[0.],
       [1.],
       [2.]])

In [19]:
orl_encoder.transform([['first']])



array([[0.]])