### We're going to use scikit-learn to process categorical variables

In [74]:
# run once if scikit-learn is missing from the kernel's environment
# (inside a notebook cell use: %pip install scikit-learn)
# pip install scikit-learn

In [75]:
import pandas as pd

In [76]:
# read the example dataset; it contains binary, ordinal and nominal categories
df = pd.read_csv(filepath_or_buffer="company_categories.csv")

In [78]:
# Area, Size and Stock_Market are the categorical variables.
# Company Name is an identifier column: it is unique for every
# data row, so it can be ignored.
df

Unnamed: 0,Company Name,Sales,Area,Size,Stock_Market
0,Tech Nova,1413453,America,Medium,Yes
1,Global Sphere,115318,Asia,Large,Yes
2,Innovexa,815313,Europe,Micro,No
3,Eco Dynamics,466880,Europe,Medium,No
4,Skyline Solutions,641460,Europe,Small,Yes
5,NextGen Ventures,790976,Asia,Medium,No
6,ApexCorp,831239,America,Medium,Yes
7,Fusion Works,1191840,America,Medium,No
8,Quantum Edge,820746,Europe,Micro,Yes
9,Visionary Labs,863168,America,Large,Yes


### Case 1: Binary categories (a category with EXACTLY two options)

In [79]:
# count the rows per option; binary encoding only applies
# when the column has exactly two options
df['Stock_Market'].value_counts()

Stock_Market
Yes    7
No     5
Name: count, dtype: int64

In [80]:
# import the encoder
from sklearn.preprocessing import LabelEncoder

# list all variables that can be binary-converted in your data
variables = ['Stock_Market']

# keep one fitted encoder per column, so each mapping can be inspected
# (encoder.classes_) or reversed (encoder.inverse_transform) later on
binary_encoders = {}

for column in variables:
    # LabelEncoder numbers ANY amount of options (0, 1, 2, ...), which would
    # silently treat a non-binary column as if it were ordinal
    # => stop early unless there are exactly two options (NaN counts as one)
    options = df[column].unique()
    if len(options) != 2:
        raise ValueError(
            f"'{column}' has {len(options)} distinct values {list(options)}, "
            "binary encoding needs exactly 2"
        )

    # options are numbered in sorted order, e.g. "No" -> 0, "Yes" -> 1
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    binary_encoders[column] = encoder

In [81]:
# show the frame again to check the converted Stock_Market column
df

Unnamed: 0,Company Name,Sales,Area,Size,Stock_Market
0,Tech Nova,1413453,America,Medium,1
1,Global Sphere,115318,Asia,Large,1
2,Innovexa,815313,Europe,Micro,0
3,Eco Dynamics,466880,Europe,Medium,0
4,Skyline Solutions,641460,Europe,Small,1
5,NextGen Ventures,790976,Asia,Medium,0
6,ApexCorp,831239,America,Medium,1
7,Fusion Works,1191840,America,Medium,0
8,Quantum Edge,820746,Europe,Micro,1
9,Visionary Labs,863168,America,Large,1


### Case 2: Ordinal categories

**More than two options, and the options have a natural order / hierarchy (e.g. from smallest to largest: Micro, Small, Medium, Large)**

In [82]:
# count the rows per option to see every Size option
# that needs a numeric value in the mapping
df['Size'].value_counts()

Size
Medium    5
Micro     3
Large     2
Small     2
Name: count, dtype: int64

In [83]:
# map the numeric values to each option (the numbers follow the size order)
category_mapper = {
    "Micro": 0,
    "Small": 1,
    "Medium": 2,
    "Large": 3
}

# options currently present in the column (missing values are left alone)
current_options = set(df['Size'].dropna().unique())

# if this cell has already been run, Size holds the numeric codes and
# mapping again would turn every value into NaN => only convert once
if not current_options <= set(category_mapper.values()):
    # .map() silently produces NaN for any option missing from the mapper
    # (typo, different casing, new category), so fail loudly instead
    unknown_options = current_options - set(category_mapper)
    if unknown_options:
        raise ValueError(
            f"Size has options missing from category_mapper: {sorted(unknown_options, key=str)}"
        )

    # convert each value into numeric version using the mapping above
    df['Size'] = df['Size'].map(category_mapper)

In [84]:
# show the frame again to check the converted Size column
df

Unnamed: 0,Company Name,Sales,Area,Size,Stock_Market
0,Tech Nova,1413453,America,2,1
1,Global Sphere,115318,Asia,3,1
2,Innovexa,815313,Europe,0,0
3,Eco Dynamics,466880,Europe,2,0
4,Skyline Solutions,641460,Europe,1,1
5,NextGen Ventures,790976,Asia,2,0
6,ApexCorp,831239,America,2,1
7,Fusion Works,1191840,America,2,0
8,Quantum Edge,820746,Europe,0,1
9,Visionary Labs,863168,America,3,1


### Case 3: Nominal categories

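**These are a common beginner's trap => don't treat nominal categories as if they were ordinal categories! Nominal options have no natural order, so numbering them 0, 1, 2, ... would invent a ranking that doesn't exist.**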

In [85]:
# count the rows per option; each Area option listed here
# becomes its own binary column in the next step
df['Area'].value_counts()

Area
America    5
Europe     4
Asia       3
Name: count, dtype: int64

In [86]:
# OneHotEncoder turns every option of a nominal category into its own binary (yes/no) column
from sklearn.preprocessing import OneHotEncoder

# nominal categories to convert
variables = ['Area']

# dense output, returned as a pandas DataFrame
encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform='pandas')

# build the binary columns and store them as integers
one_hot_encoded = encoder.fit_transform(df[variables])
one_hot_encoded = one_hot_encoded.astype(int)

# swap the original column(s) for the new binary ones
df = pd.concat([df.drop(columns=variables), one_hot_encoded], axis=1)

In [87]:
# show the frame again: Area has been replaced by one binary column per option
df

Unnamed: 0,Company Name,Sales,Size,Stock_Market,Area_America,Area_Asia,Area_Europe
0,Tech Nova,1413453,2,1,1,0,0
1,Global Sphere,115318,3,1,0,1,0
2,Innovexa,815313,0,0,0,0,1
3,Eco Dynamics,466880,2,0,0,0,1
4,Skyline Solutions,641460,1,1,0,0,1
5,NextGen Ventures,790976,2,0,0,1,0
6,ApexCorp,831239,2,1,1,0,0
7,Fusion Works,1191840,2,0,1,0,0
8,Quantum Edge,820746,0,1,0,0,1
9,Visionary Labs,863168,3,1,1,0,0


**All done! We can now compute all the correlations!**

In [88]:
# correlation matrix of the numeric columns;
# numeric_only=True leaves out text columns such as Company Name
df.corr(numeric_only=True)

Unnamed: 0,Sales,Size,Stock_Market,Area_America,Area_Asia,Area_Europe
Sales,1.0,-0.16028,-0.22233,0.337518,-0.190189,-0.178285
Size,-0.16028,1.0,0.243599,0.405999,0.09245,-0.509525
Stock_Market,-0.22233,0.243599,1.0,0.371429,-0.29277,-0.119523
Area_America,0.337518,0.405999,0.371429,1.0,-0.48795,-0.597614
Area_Asia,-0.190189,0.09245,-0.29277,-0.48795,1.0,-0.408248
Area_Europe,-0.178285,-0.509525,-0.119523,-0.597614,-0.408248,1.0
