In [4]:
# EDA Libs
import pandas as pd
import pingouin as pg
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt

# Machine Learning Libs
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

# Hyperparameter Tuning
import optuna

In [5]:
# Read the company segmentation dataset (CSV, path relative to the notebook) into a DataFrame
df_segment = pd.read_csv(filepath_or_buffer='./datasets/company_segmentation.csv')

In [6]:
# Preview the first 10 rows to get a feel for the columns and their values
df_segment.iloc[:10]

Unnamed: 0,economic_activity,monthly_revenue,number_of_employees,city,age,innovation,customer_segment
0,Commerce,713109.95,12,Rio de Janeiro,6,1,Bronze
1,Commerce,790714.38,9,São Paulo,15,0,Bronze
2,Commerce,1197239.33,17,São Paulo,4,9,Silver
3,Industry,449185.78,15,São Paulo,6,0,Starter
4,Agribusiness,1006373.16,15,São Paulo,15,8,Silver
5,Services,1629562.41,16,Rio de Janeiro,11,4,Silver
6,Services,771179.95,13,Vitória,0,1,Starter
7,Services,707837.61,16,São Paulo,10,6,Silver
8,Commerce,888983.66,17,Belo Horizonte,10,1,Bronze
9,Industry,1098512.64,13,Rio de Janeiro,9,3,Bronze


In [7]:
# Candidate feature (categorical): distinct values of economic_activity
pd.unique(df_segment['economic_activity'])

array(['Commerce', 'Industry', 'Agribusiness', 'Services'], dtype=object)

In [8]:
# Candidate feature (categorical): distinct values of city
pd.unique(df_segment['city'])

array(['Rio de Janeiro', 'São Paulo', 'Vitória', 'Belo Horizonte'],
      dtype=object)

In [9]:
# Target variable (categorical): distinct values of customer_segment
pd.unique(df_segment['customer_segment'])

array(['Bronze', 'Silver', 'Starter', 'Gold'], dtype=object)

In [10]:
# Candidate feature: distinct values of innovation (stored as integers)
pd.unique(df_segment['innovation'])

array([1, 0, 9, 8, 4, 6, 3, 7, 5, 2], dtype=int64)

In [11]:
# Number of rows per customer segment (the target variable);
# value_counts returns the counts sorted from most to least frequent
target_count = df_segment.value_counts(subset='customer_segment')
target_count

customer_segment
Silver     260
Bronze     202
Starter     22
Gold        16
Name: count, dtype: int64

In [14]:
# Customer segments in the display order used by the charts below
customer_segment_list = [
    'Starter',
    'Bronze',
    'Silver',
    'Gold',
]

In [15]:
# Bar chart of the row count per customer segment, one color per segment
px.bar(
    data_frame=target_count,
    color=target_count.index,
    title='Customer Segment Distribution',
    # Display the segments in the order defined by customer_segment_list
    category_orders={'customer_segment': customer_segment_list},
)

In [18]:
# Share of each customer segment as a percentage of all rows in the dataset
percentual_target = target_count.div(len(df_segment)).mul(100)
px.bar(
    data_frame=percentual_target,
    color=percentual_target.index,
    title='Customer Segment Distribution (%)',
    # Display the segments in the order defined by customer_segment_list
    category_orders={'customer_segment': customer_segment_list},
)

In [19]:
# Share of each city as a percentage of all rows in the dataset
percentage_city = df_segment.value_counts(subset='city').div(len(df_segment)).mul(100)
px.bar(
    data_frame=percentage_city,
    color=percentage_city.index,
    title='City Distribution (%)',
)

In [20]:
# Share of each economic activity as a percentage of all rows in the dataset
percentage_economic_activity = (
    df_segment.value_counts(subset='economic_activity')
    .div(len(df_segment))
    .mul(100)
)
px.bar(
    data_frame=percentage_economic_activity,
    color=percentage_economic_activity.index,
    title='Economic Activity Distribution (%)',
)

In [21]:
# Share of each innovation value as a percentage of all rows in the dataset
percentage_innovation = df_segment.value_counts(subset='innovation').div(len(df_segment)).mul(100)
px.bar(
    data_frame=percentage_innovation,
    color=percentage_innovation.index,
    title='Innovation Distribution (%)',
)