In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.mixture import BayesianGaussianMixture
from itertools import product 
import plotly.express as px

# EDA

Attributes

- People

  - ID: Customer's unique identifier
  - Year_Birth: Customer's birth year
  - Education: Customer's education level
  - Marital_Status: Customer's marital status
  - Income: Customer's yearly household income
  - Kidhome: Number of children in customer's household
  - Teenhome: Number of teenagers in customer's household
  - Dt_Customer: Date of customer's enrollment with the company
  - Recency: Number of days since customer's last purchase
  - Complain: 1 if the customer complained in the last 2 years, 0 otherwise

- Products

  - MntWines: Amount spent on wine in last 2 years
  - MntFruits: Amount spent on fruits in last 2 years
  - MntMeatProducts: Amount spent on meat in last 2 years
  - MntFishProducts: Amount spent on fish in last 2 years
  - MntSweetProducts: Amount spent on sweets in last 2 years
  - MntGoldProds: Amount spent on gold in last 2 years

- Promotion

  - NumDealsPurchases: Number of purchases made with a discount
  - AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise
  - AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
  - AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
  - AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise
  - AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise
  - Response: 1 if customer accepted the offer in the last campaign, 0 otherwise

- Place

  - NumWebPurchases: Number of purchases made through the company’s website
  - NumCatalogPurchases: Number of purchases made using a catalogue
  - NumStorePurchases: Number of purchases made directly in stores
  - NumWebVisitsMonth: Number of visits to company’s website in the last month

In [None]:
df_marketing_campaign = pd.read_csv('datasets/marketing_campaign.csv', sep='\t')

In [None]:
if(pd.options.display.max_columns < df_marketing_campaign.shape[1]):
  pd.options.display.max_columns = df_marketing_campaign.shape[1]

In [None]:
df_marketing_campaign.head()

In [None]:
df_marketing_campaign.info()

In [None]:
df_marketing_campaign.isna().sum()

In [None]:
df_marketing_campaign.isna().melt().pipe(
    lambda df:(
        sns.displot(
            data=df,
            y='variable',
            hue='value',
            multiple='fill',
            height=10
        )
    )
)
plt.show()

In [None]:
pd.options.display.max_columns = df_marketing_campaign.shape[1]
df_marketing_campaign.describe().T


## Numeric columns

In [None]:
df_marketing_campaign.hist(bins=30, figsize=(20, 15))
plt.show()

In [None]:
# One panel per numeric column: a count plot for low-cardinality columns
# (<= 10 distinct values) and a boxen plot for everything else.
df_marketing_campaign_num = df_marketing_campaign.select_dtypes(include='number')
n_cols = 3
# Styling of the outlier markers; reused by the boxen plots further down.
red_circle_boxen = dict(color='red', marker='o', edgecolors='white')

# Size the grid from the number of columns that are actually plotted, so the
# last column always has an axis to land on.
n_plots = df_marketing_campaign_num.shape[1]
n_rows = int(np.ceil(n_plots / n_cols))

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(18, n_rows * 6), squeeze=False
)

for i, column in enumerate(df_marketing_campaign_num.columns):
    ax = axes[i // n_cols, i % n_cols]
    if len(df_marketing_campaign_num[column].unique()) <= 10:
        sns.countplot(
            df_marketing_campaign_num[[column]],
            x=column,
            ax=ax,
        )
    else:
        sns.boxenplot(
            df_marketing_campaign_num[[column]],
            ax=ax,
            flier_kws=red_circle_boxen,
        )

# Hide the panels of the last row that received no plot.
for ax in axes.flat[n_plots:]:
    ax.set_visible(False)

---

List of numeric columns to analyze:
1. Year_Birth (we also treat it as a datetime value)
2. Income
3. MntMeatProducts
4. MntSweetProducts
5. NumWebPurchases
6. NumCatalogPurchases
7. NumWebVisitsMonth
8. Z_CostContact
9. Z_Revenue

---

## Object columns

In [None]:
df_marketing_campaign.describe(include='object')

`Dt_Customer` holds dates, so we convert it to a datetime column

In [None]:
df_marketing_campaign['Dt_Customer'] = pd.to_datetime(df_marketing_campaign['Dt_Customer'], format='%d-%m-%Y')


In [None]:
df_marketing_campaign_object = df_marketing_campaign.select_dtypes(include='object')
df_marketing_campaign_object_columns = df_marketing_campaign_object.columns

# Frequency of every combination of the object columns. It is printed
# explicitly because a bare expression that is not the last statement of a
# cell is never displayed.
print(df_marketing_campaign_object.value_counts())

# Per-column frequency table plus a bar chart of the same counts.
for col in df_marketing_campaign_object_columns:
    counts = df_marketing_campaign_object[col].value_counts()
    print(col)
    print('\n')
    print(counts)
    counts.plot(kind='bar')
    plt.show()
    print('\n')


---

To do in object columns:
1. Ordinal encoder in Education column
2. Drop Absurd and YOLO values, group Alone with Single, and apply One Hot Encoder in Marital_Status column

---

## Datetime column

In [None]:
# value_counts() orders by frequency; sort by date so the line runs
# chronologically instead of zig-zagging across the time axis.
df_marketing_campaign['Dt_Customer'].value_counts().sort_index().plot(figsize=(15,5))

## Data Cleaning

In [None]:
df_marketing_campaign.describe(include=['object', 'number'])


### Year_Birth Column

We create a new column with each customer's age at the moment of enrollment with the company to check whether it makes sense. If it does not, we drop those rows

In [None]:
df_marketing_campaign['Year_Birth'].hist()

In [None]:
# Age at enrollment = enrollment year - birth year.
# Dt_Customer was already converted to datetime in the "Object columns"
# section and Year_Birth is a plain numeric year, so neither needs to go
# through pd.to_datetime again.
df_marketing_campaign['age_customer_enrollment'] = (
    df_marketing_campaign['Dt_Customer'].dt.year - df_marketing_campaign['Year_Birth']
)

df_marketing_campaign['age_customer_enrollment'].hist(bins=50)
plt.show()

# Count customers per age above 60 to help choose a cut-off for implausible ages.
older_than_60 = df_marketing_campaign['age_customer_enrollment'] > 60
print(
    df_marketing_campaign.loc[older_than_60, 'age_customer_enrollment']
    .value_counts()
    .sort_index()
)

Drop rows where the age of the customer is more than 73

In [None]:
# Remove customers whose age at enrollment is above 73, then re-check the
# birth-year distribution.
too_old = df_marketing_campaign['age_customer_enrollment'] > 73
index_to_drop = df_marketing_campaign.index[too_old]

df_marketing_campaign = df_marketing_campaign.drop(index=index_to_drop)
df_marketing_campaign['Year_Birth'].hist()

In [None]:
df_marketing_campaign['Decade_Born'] = ((df_marketing_campaign['Year_Birth']-1900)//10)*10

In [None]:
df_marketing_campaign.drop(columns=['age_customer_enrollment'], inplace=True)
df_marketing_campaign.drop(columns=['Year_Birth'], inplace=True)

### Income Column

First, we drop the null values, which we have only found in the Income column; then we analyze the column for outliers and remove them where appropriate

In [None]:
df_marketing_campaign.dropna(inplace=True)

In [None]:
df_marketing_campaign['Income'].describe()

In [None]:
df_marketing_campaign['Income'].sort_values(ascending=False).head(15)

In [None]:
sns.boxenplot(
    df_marketing_campaign['Income'],
    flier_kws=red_circle_boxen
)
plt.show()

In [None]:
# Drop the rows whose Income is 153000 or more (outliers seen in the plot above).
income_outlier = df_marketing_campaign['Income'] >= 153000
index_to_drop = df_marketing_campaign.index[income_outlier]
df_marketing_campaign = df_marketing_campaign.drop(index=index_to_drop)

In [None]:
sns.boxenplot(
    df_marketing_campaign['Income'],
    flier_kws=red_circle_boxen
)
plt.show()

### MntMeatProducts Column

Remove outliers

In [None]:
df_marketing_campaign['MntMeatProducts'].describe()

In [None]:
df_marketing_campaign['MntMeatProducts'].sort_values(ascending=False).head(15)

In [None]:
sns.boxenplot(
    df_marketing_campaign['MntMeatProducts'],
    flier_kws=red_circle_boxen
)
plt.show()

In [None]:
# Drop the rows whose MntMeatProducts is above 1500.
meat_outlier = df_marketing_campaign['MntMeatProducts'] > 1500
index_to_drop = df_marketing_campaign.index[meat_outlier]
df_marketing_campaign = df_marketing_campaign.drop(index=index_to_drop)

In [None]:
sns.boxenplot(
    df_marketing_campaign['MntMeatProducts'],
    flier_kws=red_circle_boxen
)
plt.show()

### MntSweetProducts Column

In [None]:
df_marketing_campaign['MntSweetProducts'].describe()

In [None]:
df_marketing_campaign['MntSweetProducts'].sort_values(ascending=False).head(15)

In [None]:
sns.boxenplot(
    df_marketing_campaign['MntSweetProducts'],
    flier_kws=red_circle_boxen
)
plt.show()

In [None]:
# Drop the rows whose MntSweetProducts is above 200.
sweet_outlier = df_marketing_campaign['MntSweetProducts'] > 200
index_to_drop = df_marketing_campaign.index[sweet_outlier]
df_marketing_campaign = df_marketing_campaign.drop(index=index_to_drop)

In [None]:
sns.boxenplot(
    df_marketing_campaign['MntSweetProducts'],
    flier_kws=red_circle_boxen
)
plt.show()

### NumWebPurchases Column

In [None]:
df_marketing_campaign['NumWebPurchases'].describe()

In [None]:
df_marketing_campaign['NumWebPurchases'].sort_values(ascending=False).head(15)

In [None]:
sns.boxenplot(
    df_marketing_campaign['NumWebPurchases'],
    flier_kws=red_circle_boxen
)
plt.show()

In [None]:
# Drop the rows whose NumWebPurchases is above 20.
web_outlier = df_marketing_campaign['NumWebPurchases'] > 20
index_to_drop = df_marketing_campaign.index[web_outlier]
df_marketing_campaign = df_marketing_campaign.drop(index=index_to_drop)

In [None]:
sns.boxenplot(
    df_marketing_campaign['NumWebPurchases'],
    flier_kws=red_circle_boxen
)
plt.show()

### NumCatalogPurchases Column

In [None]:
df_marketing_campaign['NumCatalogPurchases'].describe()

In [None]:
df_marketing_campaign['NumCatalogPurchases'].sort_values(ascending=False).head(15)

In [None]:
sns.boxenplot(
    df_marketing_campaign['NumCatalogPurchases'],
    flier_kws=red_circle_boxen
)
plt.show()

In [None]:
df_marketing_campaign['NumCatalogPurchases'].value_counts()

We are not going to drop anything here in the NumCatalogPurchases Column

### NumWebVisitsMonth Column

In [None]:
df_marketing_campaign['NumWebVisitsMonth'].describe()

In [None]:
df_marketing_campaign['NumWebVisitsMonth'].sort_values(ascending=False).head(15)

In [None]:
sns.boxenplot(
    df_marketing_campaign['NumWebVisitsMonth'],
    flier_kws=red_circle_boxen
)
plt.show()

We are not going to drop anything here in the NumWebVisitsMonth Column

### Z_CostContact and Z_Revenue Columns

These columns have constant values and do not add anything valuable

In [None]:
df_marketing_campaign.drop(columns=['Z_CostContact', 'Z_Revenue'], inplace=True)

### Education Column

We are going to group 2n Cycle with Master and then use Ordinal Encoder to assign each category to a different integer. The order will be:

0. Basic
1. Graduation
2. Master
3. PhD

In [None]:
df_marketing_campaign['Education'].value_counts()

In [None]:
# Fold '2n Cycle' into 'Master'. The result is assigned back to the column:
# calling replace(..., inplace=True) on df[col] is a chained in-place
# assignment, which does not update the frame under pandas Copy-on-Write and
# would leave '2n Cycle' in place for the OrdinalEncoder below.
df_marketing_campaign['Education'] = df_marketing_campaign['Education'].replace(['2n Cycle'], ['Master'])

In [None]:
categories = [['Basic', 'Graduation', 'Master', 'PhD']]
oe_encoder = OrdinalEncoder(categories= categories)


In [None]:
df_marketing_campaign['Education_int'] = oe_encoder.fit_transform(df_marketing_campaign[['Education']])

In [None]:
df_marketing_campaign[['Education', 'Education_int']].value_counts()

In [None]:
oe_encoder.categories_

### Marital_Status Column

In [None]:
df_marketing_campaign['Marital_Status'].value_counts()

In [None]:
# Remove the rows whose Marital_Status is one of the nonsense values.
invalid_status = df_marketing_campaign['Marital_Status'].isin(['YOLO', 'Absurd'])
index_to_drop = df_marketing_campaign.index[invalid_status]

df_marketing_campaign = df_marketing_campaign.drop(index=index_to_drop)


In [None]:
df_marketing_campaign['Marital_Status'].value_counts()

In [None]:
# Fold 'Alone' into 'Single'. Assigned back instead of using inplace=True on
# the column selection, which is a chained in-place assignment and does not
# update the frame under pandas Copy-on-Write.
df_marketing_campaign['Marital_Status'] = df_marketing_campaign['Marital_Status'].replace(['Alone'], ['Single'])
df_marketing_campaign['Marital_Status'].value_counts()

In [None]:
df_marketing_campaign_before_encoders = df_marketing_campaign.copy()

In [None]:
# categories = [['Single', 'Together', 'Married', 'Divorced', 'Widow']]
# oe_encoder = OrdinalEncoder(categories= categories)
# df_marketing_campaign['Marital_Status_int'] = oe_encoder.fit_transform(df_marketing_campaign[['Marital_Status']])


In [None]:
# Binary flag derived from Marital_Status:
#   1 -> lives with a partner (Married, Together)
#   0 -> does not (Single, Divorced, Widow)
# The values follow the column name, so WithPartner == 1 means "with partner".
mapping = {
    "Married": 1,
    "Together": 1,
    "Single": 0,
    "Divorced": 0,
    "Widow": 0
}
df_marketing_campaign['WithPartner'] = df_marketing_campaign['Marital_Status'].map(mapping)
df_marketing_campaign['WithPartner']

In [None]:
# ohe_encoder = OneHotEncoder(sparse_output=False)
# results_ohe = ohe_encoder.fit_transform(df_marketing_campaign[['Marital_Status']])
# df_ohe = pd.DataFrame(results_ohe, columns=ohe_encoder.get_feature_names_out(), index=df_marketing_campaign.index)
# # df_ohe
# df_marketing_campaign = pd.concat([df_marketing_campaign,df_ohe], axis=1)
# columns_to_check = ohe_encoder.get_feature_names_out().tolist()
# columns_to_check.append('Marital_Status')
# df_marketing_campaign[columns_to_check]

In [None]:
# ohe_encoder = OneHotEncoder(sparse_output=False)
# results_ohe = ohe_encoder.fit_transform(df_marketing_campaign[['Marital_Status']])
# df_ohe = pd.DataFrame(results_ohe, columns=ohe_encoder.get_feature_names_out(), index=df_marketing_campaign.index)
# # df_ohe
# df_marketing_campaign = pd.concat([df_marketing_campaign,df_ohe], axis=1)
# columns_to_check = ohe_encoder.get_feature_names_out().tolist()
# columns_to_check.append('Marital_Status')
# df_marketing_campaign[columns_to_check]

In [None]:
# df_marketing_campaign.drop(columns=['Marital_Status', 'Education'])

### Dt_customer Column

We create a new column that contains the number of days between the earliest enrollment date in the dataset and each customer's enrollment date, and then we drop the Dt_Customer column

In [None]:
# value_counts() orders by frequency; sort by date so the line runs
# chronologically instead of zig-zagging across the time axis.
df_marketing_campaign['Dt_Customer'].value_counts().sort_index().plot(figsize=(15,5))

In [None]:
df_marketing_campaign['Dt_Customer_diff_days'] = (df_marketing_campaign['Dt_Customer'] - df_marketing_campaign['Dt_Customer'].min()).dt.days

In [None]:
df_marketing_campaign[['Dt_Customer','Dt_Customer_diff_days']]

In [None]:
df_marketing_campaign.drop(columns=['Dt_Customer'], inplace=True)

Let's see how the histograms look now after all the changes

In [None]:
df_marketing_campaign.hist(bins=30, figsize=(20, 15))
plt.show()

## Looking for Duplicates

In [None]:
df_marketing_campaign['ID'].value_counts().sort_values()

In [None]:
df_marketing_campaign[df_marketing_campaign.duplicated()]

## New columns (Feature Engineering)

Sum of all the Mnt columns, representing the total amount spent on all products

In [None]:
# Select every 'Mnt*' column except the derived total itself. Without the
# exclusion, re-running this cell would add the previous MntTotalSpent into
# the new sum and inflate the feature.
columns_keep = (
    df_marketing_campaign.columns.str.contains('Mnt')
    & (df_marketing_campaign.columns != 'MntTotalSpent')
)
df_marketing_campaign['MntTotalSpent'] = df_marketing_campaign.loc[:, columns_keep].sum(axis=1)

Sum of all the Purchases columns representing the total number of purchases made through all the different ways

In [None]:
# Select every '*Purchases*' column except the derived total itself. Without
# the exclusion, re-running this cell would add the previous NumTotalPurchases
# into the new sum.
# NOTE(review): the mask also matches NumDealsPurchases, which the attribute
# list describes as purchases made with a discount rather than a purchase
# channel - confirm that counting it in the total is intended.
columns_keep = (
    df_marketing_campaign.columns.str.contains('Purchases')
    & (df_marketing_campaign.columns != 'NumTotalPurchases')
)
df_marketing_campaign['NumTotalPurchases'] = df_marketing_campaign.loc[:, columns_keep].sum(axis=1)

In [None]:
# columns_keep = df_marketing_campaign.columns.str.contains('Num')
# columns_keep = df_marketing_campaign.columns.str.contains('Num') & df_marketing_campaign.columns.str.contains('Purchases')
columns_keep = df_marketing_campaign.columns.str.contains('Purchases')

# df_marketing_campaign.loc[:,columns_keep]
min_value = (df_marketing_campaign['NumTotalPurchases'] - df_marketing_campaign['NumDealsPurchases']).min()
df_marketing_campaign.loc[(df_marketing_campaign['NumTotalPurchases'] - df_marketing_campaign['NumDealsPurchases'] == min_value), columns_keep]

In [None]:
# df_marketing_campaign.loc[(df_marketing_campaign['NumTotalPurchases'] == 0), columns_keep]
df_marketing_campaign.loc[(df_marketing_campaign['NumTotalPurchases'] == 0)]


---

There are customers who have spent on products but never made a purchase. We drop those cases because they don't make sense. In a real-world scenario, though, this finding should be reported so that the origin of these cases can be investigated and fixed where appropriate in the data collection.

---


In [None]:
# Remove the customers whose total number of purchases is zero.
no_purchases = df_marketing_campaign['NumTotalPurchases'] == 0
index_to_drop = df_marketing_campaign.index[no_purchases]
df_marketing_campaign = df_marketing_campaign.drop(index=index_to_drop)

In [None]:
# df_marketing_campaign['RatioTotalMntPurchases'] = df_marketing_campaign['MntTotal'] / df_marketing_campaign['NumTotalPurchases']
# df_marketing_campaign['RatioMntWines'] = df_marketing_campaign['MntWines'] / df_marketing_campaign['MntTotal']
# df_marketing_campaign['RatioMntFruits'] = df_marketing_campaign['MntFruits'] / df_marketing_campaign['MntTotal']
# df_marketing_campaign['RatioMntMeatProducts'] = df_marketing_campaign['MntMeatProducts'] / df_marketing_campaign['MntTotal']
# df_marketing_campaign['RatioMntFishProducts'] = df_marketing_campaign['MntFishProducts'] / df_marketing_campaign['MntTotal']
# df_marketing_campaign['RatioMntSweetProducts'] = df_marketing_campaign['MntSweetProducts'] / df_marketing_campaign['MntTotal']
# df_marketing_campaign['RatioMntGoldProducts'] = df_marketing_campaign['MntGoldProds'] / df_marketing_campaign['MntTotal']

In [None]:
# df_marketing_campaign.drop(columns=['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds'], inplace=True)

Accepted any campaigns

In [None]:
# Flag (1/0) customers that accepted the offer in at least one campaign:
# any column whose name contains 'Accepted' or 'Response' is non-zero.
columns_keep = df_marketing_campaign.columns.str.contains('Accepted|Response')

accepted_any = df_marketing_campaign.loc[:, columns_keep].any(axis=1)
df_marketing_campaign['AcceptedOfferAnyCmp'] = accepted_any.astype(int)

In [None]:
# df_marketing_campaign['AcceptedOfferAnyCmp'].hist(bins=35)

In [None]:
# df_marketing_campaign['RatioNumWebPurchases'] = df_marketing_campaign['NumWebPurchases'] / df_marketing_campaign['NumTotalPurchases']
# df_marketing_campaign['RatioNumCatalogPurchases'] = df_marketing_campaign['NumCatalogPurchases'] / df_marketing_campaign['NumTotalPurchases']
# df_marketing_campaign['RatioNumStorePurchases'] = df_marketing_campaign['NumStorePurchases'] / df_marketing_campaign['NumTotalPurchases']
# df_marketing_campaign['RatioNumDealsPurchases'] = df_marketing_campaign['NumDealsPurchases'] / df_marketing_campaign['NumTotalPurchases']

In [None]:
# df_marketing_campaign.drop(columns=['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumDealsPurchases'], inplace=True)

In [None]:
# df_marketing_campaign['RatioIncomeMnt'] = (df_marketing_campaign['Income'] * 2) / df_marketing_campaign['MntTotal']

Create Parents column

In [None]:
# Flag (1/0) customers with at least one kid or teenager at home: any column
# whose name contains 'home' is non-zero.
columns_keep = df_marketing_campaign.columns.str.contains('home')
has_children = df_marketing_campaign.loc[:, columns_keep].any(axis=1)
df_marketing_campaign['IsParent'] = has_children.astype(int)

Product category with the most and the least spending

In [None]:
# For every customer, name the product category with the highest and the
# lowest spend. When several categories tie, the label is 'MoreThanOne'.
columns_keep = [
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds'
]
spend = df_marketing_campaign[columns_keep]

# Boolean frame marking the cell(s) equal to the row maximum.
is_max = spend.eq(spend.max(axis=1), axis=0)
tied_max = is_max.sum(axis=1) > 1
df_marketing_campaign['ProductMostSpent'] = spend.idxmax(axis=1).mask(tied_max, 'MoreThanOne')

# Same idea for the row minimum.
is_min = spend.eq(spend.min(axis=1), axis=0)
tied_min = is_min.sum(axis=1) > 1
df_marketing_campaign['ProductLeastSpent'] = spend.idxmin(axis=1).mask(tied_min, 'MoreThanOne')

df_marketing_campaign[[*columns_keep, 'ProductLeastSpent']]

In [None]:
# Order for Ordinal Encoder
print(df_marketing_campaign[columns_keep].sum().sort_values().index)

In [None]:
# Category order handed to the encoder: 'MoreThanOne' first, followed by the
# Mnt columns sorted by their total spend in ascending order.
categories = [['MoreThanOne', *df_marketing_campaign[columns_keep].sum().sort_values().index]]
oe_encoder = OrdinalEncoder(categories= categories)
# The same encoder (and therefore the same category order) is fitted on each
# of the two label columns in turn.
df_marketing_campaign['ProductMostSpent_int'] = oe_encoder.fit_transform(df_marketing_campaign[['ProductMostSpent']])
df_marketing_campaign['ProductLeastSpent_int'] = oe_encoder.fit_transform(df_marketing_campaign[['ProductLeastSpent']])


In [None]:
df_marketing_campaign[[
    'MntWines', 
    'MntFruits', 
    'MntMeatProducts', 
    'MntFishProducts', 
    'MntSweetProducts', 
    'MntGoldProds',
    'ProductMostSpent',
    'ProductMostSpent_int',
    'ProductLeastSpent',
    'ProductLeastSpent_int'
]].head(10)

In [None]:
df_save_columns_Mnt = df_marketing_campaign[columns_keep].copy()

df_marketing_campaign.drop(
    columns = [
        'MntWines', 
        'MntFruits', 
        'MntMeatProducts', 
        'MntFishProducts', 
        'MntSweetProducts', 
        'MntGoldProds'
    ],
    inplace = True
)

### Correlation Matrix and Displots

In [None]:
df_marketing_campaign_num = df_marketing_campaign.drop(columns=['ID']).select_dtypes(exclude='object')

In [None]:
# CREATE THE MATRIX
matrix = df_marketing_campaign_num.corr()

# CREATE CMAP
cmap = sns.diverging_palette(250, 15, s=75, l=40,
                          n=9, center="light", as_cmap=True)
# CREATE A MASK
mask = np.triu(np.ones_like(matrix, dtype=bool))

# MAKE FIGSIZE BIGGER
fig, ax = plt.subplots(figsize=(16,12))

# PLOT THE MATRIX
_ = sns.heatmap(matrix, mask=mask, center=0, annot=True,
          fmt='.2f', square=True, cmap=cmap, ax=ax, annot_kws={"fontsize":5})

#### More than 0.5

In [None]:
# Keep only the strictly-upper triangle of the correlation matrix (each pair
# once, no diagonal), then only the coefficients above 0.5, rounded to two
# decimals. stack() discards the NaN cells, leaving {(row, col): corr}.
upper_triangle = np.triu(np.ones(matrix.shape, dtype=bool), k=1)
matrix_2 = matrix.where(upper_triangle)
matrix_2 = matrix_2[matrix_2 > 0.5].round(2)
dict_corr_greater_05 = dict(matrix_2.stack().items())

In [None]:
# One scatter plot per pair of columns with correlation above 0.5; the legend
# shows the correlation coefficient of the pair.
n_cols = 3
n_pairs = len(dict_corr_greater_05)

if n_pairs == 0:
    # plt.subplots() cannot build a grid with zero rows.
    print('No pair of columns has a correlation above 0.5')
else:
    n_rows = int(np.ceil(n_pairs / n_cols))
    # squeeze=False keeps `axes` two-dimensional even with a single row, so
    # the [row, col] indexing below works for any number of pairs.
    fig, axes = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(17, n_rows * 6), squeeze=False
    )

    for i, ((x, y), value) in enumerate(dict_corr_greater_05.items()):
        ax = axes[i // n_cols, i % n_cols]
        sns.scatterplot(
            data=df_marketing_campaign[[x, y]],
            x=x,
            y=y,
            ax=ax
        )
        ax.legend([value])

    # Hide the panels of the last row that received no plot.
    for ax in axes.flat[n_pairs:]:
        ax.set_visible(False)

#### Less than - 0.5

In [None]:
# Keep only the strictly-upper triangle of the correlation matrix (each pair
# once, no diagonal), then only the coefficients below -0.5, rounded to two
# decimals. stack() discards the NaN cells, leaving {(row, col): corr}.
upper_triangle = np.triu(np.ones(matrix.shape, dtype=bool), k=1)
matrix_2 = matrix.where(upper_triangle)
matrix_2 = matrix_2[matrix_2 < -0.5].round(2)
dict_corr_less_minus05 = dict(matrix_2.stack().items())

In [None]:
# One scatter plot per pair of columns with correlation below -0.5; the legend
# shows the correlation coefficient of the pair.
n_cols = 3
n_pairs = len(dict_corr_less_minus05)

if n_pairs == 0:
    # plt.subplots() cannot build a grid with zero rows.
    print('No pair of columns has a correlation below -0.5')
else:
    n_rows = int(np.ceil(n_pairs / n_cols))
    # squeeze=False keeps `axes` two-dimensional even with a single row, so
    # the [row, col] indexing below works for any number of pairs.
    fig, axes = plt.subplots(
        nrows=n_rows, ncols=n_cols, figsize=(17, n_rows * 6), squeeze=False
    )

    for i, ((x, y), value) in enumerate(dict_corr_less_minus05.items()):
        ax = axes[i // n_cols, i % n_cols]
        sns.scatterplot(
            data=df_marketing_campaign[[x, y]],
            x=x,
            y=y,
            ax=ax
        )
        ax.legend([value])

    # Hide the panels of the last row that received no plot.
    for ax in axes.flat[n_pairs:]:
        ax.set_visible(False)
       

In [None]:
standard_scaler = StandardScaler()

array_marketing_campaign_num_scaled = standard_scaler.fit_transform(df_marketing_campaign_num)

In [None]:
df_marketing_campaign_num_scaled =  pd.DataFrame(array_marketing_campaign_num_scaled, columns=df_marketing_campaign_num.columns)
df_marketing_campaign_num_scaled.head()

In [None]:
df_marketing_campaign_num_scaled.describe().T

## Different datasets

In [None]:
# list_df_num_scaled = []
# for _ in range(0,500):
#     random_number_columns = np.random.randint(20, len(df_marketing_campaign_num_scaled.columns))
#     df_marketing_campaign_num_scaled.sample(random_number_columns, axis=1)
#     list_df_num_scaled.append(df_marketing_campaign_num_scaled.sample(random_number_columns, axis=1))

# PCA

In [None]:
pca = PCA()
pca.fit(df_marketing_campaign_num_scaled)

In [None]:
PC_values = np.arange(pca.n_components_) + 1
plt.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

In [None]:
plt.plot(np.cumsum(np.round(pca.explained_variance_ratio_, 4)*100))


In [None]:
# Project the scaled data onto the principal components and keep the first
# 2, 3, 4 and 5 components as separate frames for the clustering experiments.
array_marketing_campaign_num_scaled_pca = pca.transform(df_marketing_campaign_num_scaled)
pca_scores = pd.DataFrame(array_marketing_campaign_num_scaled_pca)
df_marketing_campaign_num_scaled_pca2 = pca_scores.iloc[:, :2]
df_marketing_campaign_num_scaled_pca3 = pca_scores.iloc[:, :3]
df_marketing_campaign_num_scaled_pca4 = pca_scores.iloc[:, :4]
df_marketing_campaign_num_scaled_pca5 = pca_scores.iloc[:, :5]

In [None]:
# Scatter plot of the first two principal components ...
sns.scatterplot(
    df_marketing_campaign_num_scaled_pca2,
    x=0,
    y=1,
    alpha=0.4,
    s=40
)

# ... and a bivariate histogram of the same two components, drawn on its own
# figure because displot is figure-level.
sns.displot(
    df_marketing_campaign_num_scaled_pca2,
    x=0,
    y=1
)
# Render the figures. The previous plt.plot() with no arguments drew nothing
# and only echoed an empty list as the cell output.
plt.show()

In [None]:
fig_3d = px.scatter_3d(
    df_marketing_campaign_num_scaled_pca3, x=0, y=1, z=2
)

fig_3d.show()

# Clustering

## K-Means

### With PCA (2, 3, 4, 5 components)

In [None]:
# Sweep K for K-Means on the first 2, 3, 4 and 5 principal components,
# recording inertia (elbow method) and silhouette score for every K.
k = range(2, 10)

pca_datasets = {
    2: df_marketing_campaign_num_scaled_pca2,
    3: df_marketing_campaign_num_scaled_pca3,
    4: df_marketing_campaign_num_scaled_pca4,
    5: df_marketing_campaign_num_scaled_pca5,
}
inertia_by_pca = {n_components: [] for n_components in pca_datasets}
silhouette_by_pca = {n_components: [] for n_components in pca_datasets}

for n_clusters in k:
    for n_components, data in pca_datasets.items():
        # A fixed random_state makes the centroid initialisation, and with it
        # the curves plotted below, reproducible across kernel restarts.
        model = KMeans(n_clusters=n_clusters, n_init='auto', random_state=42)
        labels = model.fit_predict(data)
        inertia_by_pca[n_components].append(model.inertia_)
        silhouette_by_pca[n_components].append(silhouette_score(data, labels))

# Names consumed by the plotting cells below.
sum_of_squared_distances_pca2 = inertia_by_pca[2]
sum_of_squared_distances_pca3 = inertia_by_pca[3]
sum_of_squared_distances_pca4 = inertia_by_pca[4]
sum_of_squared_distances_pca5 = inertia_by_pca[5]
silhouette_scores_pca2 = silhouette_by_pca[2]
silhouette_scores_pca3 = silhouette_by_pca[3]
silhouette_scores_pca4 = silhouette_by_pca[4]
silhouette_scores_pca5 = silhouette_by_pca[5]

## TEST RANDOM COLUMNS

In [None]:
# scores_pca2_cluster3 = []
# scores_pca2_cluster4 = []
# scores_pca3_cluster3 = []
# scores_pca3_cluster4 = []
# for _, df_num_scaled in enumerate(list_df_num_scaled):
#     # if _ > 3:
#     #     break
#     pca = PCA()
#     pca.fit(df_num_scaled)

#     array_marketing_campaign_num_scaled_pca = pca.transform(df_num_scaled)
#     df_num_scaled_pca2 = pd.DataFrame(array_marketing_campaign_num_scaled_pca).iloc[:,0:2]
#     df_num_scaled_pca3 = pd.DataFrame(array_marketing_campaign_num_scaled_pca).iloc[:,0:3]
    
#     scores_pca2_cluster3.append(silhouette_score(df_num_scaled_pca2, KMeans(n_clusters=3, n_init='auto').fit_predict(df_num_scaled_pca2)))
#     scores_pca2_cluster4.append(silhouette_score(df_num_scaled_pca2, KMeans(n_clusters=4, n_init='auto').fit_predict(df_num_scaled_pca2)))
#     scores_pca3_cluster3.append(silhouette_score(df_num_scaled_pca3, KMeans(n_clusters=3, n_init='auto').fit_predict(df_num_scaled_pca3)))
#     scores_pca3_cluster4.append(silhouette_score(df_num_scaled_pca3, KMeans(n_clusters=4, n_init='auto').fit_predict(df_num_scaled_pca3)))

In [None]:
# max_val = max(scores_pca2_cluster3)
# df_index = scores_pca2_cluster3.index(max_val)
# print(f'max_val: {max_val}, df_index: {df_index}')

In [None]:
# max_val = max(scores_pca2_cluster4)
# df_index = scores_pca2_cluster4.index(max_val)
# print(f'max_val: {max_val}, df_index: {df_index}')

In [None]:
# max_val = max(scores_pca3_cluster3)
# df_index = scores_pca3_cluster3.index(max_val)
# print(f'max_val: {max_val}, df_index: {df_index}')

In [None]:
# max_val = max(scores_pca3_cluster4)
# df_index = scores_pca3_cluster4.index(max_val)
# print(f'max_val: {max_val}, df_index: {df_index}')

In [None]:
# list_df_num_scaled[94].columns

In [None]:
# list_df_num_scaled[94]['kmeans_pca2'] =  KMeans(n_clusters=3, n_init='auto').fit_predict(list_df_num_scaled[94])

# df_marketing_campaign['kmeans_pca2'] = list_df_num_scaled[94]['kmeans_pca2'] 

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12,5))
# plt.figure(figsize=(8,8))
axes[0].plot(k, sum_of_squared_distances_pca2, 'bx-')
axes[0].set_title('2 PCA')
axes[0].set_xlabel('K')
axes[0].set_ylabel('Inertia')

axes[1].plot(k, silhouette_scores_pca2, 'rx-')
axes[1].set_title('2 PCA')
axes[1].set_xlabel('K')
axes[1].set_ylabel('Silhouette Scores')

plt.show()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12,5))
# plt.figure(figsize=(8,8))
axes[0].plot(k, sum_of_squared_distances_pca3, 'bx-')
axes[0].set_title('3 PCA')
axes[0].set_xlabel('K')
axes[0].set_ylabel('Inertia')

axes[1].plot(k, silhouette_scores_pca3, 'rx-')
axes[1].set_title('3 PCA')
axes[1].set_xlabel('K')
axes[1].set_ylabel('Silhouette Scores')

plt.show()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12,5))
# plt.figure(figsize=(8,8))
axes[0].plot(k, sum_of_squared_distances_pca4, 'bx-')
axes[0].set_title('4 PCA')
axes[0].set_xlabel('K')
axes[0].set_ylabel('Inertia')

axes[1].plot(k, silhouette_scores_pca4, 'rx-')
axes[1].set_title('4 PCA')
axes[1].set_xlabel('K')
axes[1].set_ylabel('Silhouette Scores')

plt.show()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12,5))
# plt.figure(figsize=(8,8))
axes[0].plot(k, sum_of_squared_distances_pca5, 'bx-')
axes[0].set_title('5 PCA')
axes[0].set_xlabel('K')
axes[0].set_ylabel('Inertia')

axes[1].plot(k, silhouette_scores_pca5, 'rx-')
axes[1].set_title('5 PCA')
axes[1].set_xlabel('K')
axes[1].set_ylabel('Silhouette Scores')

plt.show()

In [None]:
# Final 4-cluster K-Means on the first 2 principal components.
# random_state is fixed so the labels stored in the frame below, and the
# printed silhouette score, are the same on every run.
kmeans_model_pca2 = KMeans(n_clusters=4, n_init='auto', random_state=42)
y_pca2 = kmeans_model_pca2.fit_predict(df_marketing_campaign_num_scaled_pca2)
print(silhouette_score(df_marketing_campaign_num_scaled_pca2, y_pca2))

In [None]:
# 4-cluster K-Means on the first 3 principal components.
# random_state is fixed so the printed silhouette score is reproducible.
kmeans_model_pca3 = KMeans(n_clusters=4, n_init='auto', random_state=42)
y_pca3 = kmeans_model_pca3.fit_predict(df_marketing_campaign_num_scaled_pca3)
print(silhouette_score(df_marketing_campaign_num_scaled_pca3, y_pca3))

In [None]:
# 4-cluster K-Means on the first 4 principal components.
# random_state is fixed so the printed silhouette score is reproducible.
kmeans_model_pca4 = KMeans(n_clusters=4, n_init='auto', random_state=42)
y_pca4 = kmeans_model_pca4.fit_predict(df_marketing_campaign_num_scaled_pca4)
print(silhouette_score(df_marketing_campaign_num_scaled_pca4, y_pca4))

In [None]:
# 4-cluster K-Means on the first 5 principal components.
# random_state is fixed so the printed silhouette score is reproducible.
kmeans_model_pca5 = KMeans(n_clusters=4, n_init='auto', random_state=42)
y_pca5 = kmeans_model_pca5.fit_predict(df_marketing_campaign_num_scaled_pca5)
print(silhouette_score(df_marketing_campaign_num_scaled_pca5, y_pca5))

In [None]:
df_marketing_campaign['kmeans_pca2'] = y_pca2

In [None]:
print('kmeans_pca2')
print('\n')
print(df_marketing_campaign['kmeans_pca2'].value_counts())
df_marketing_campaign['kmeans_pca2'].value_counts().plot(kind='bar')
plt.show()
print('\n')

In [None]:
fig_2d = px.scatter(
    df_marketing_campaign_num_scaled_pca2, x=0, y=1,
    color=kmeans_model_pca2.labels_, labels={'color': 'class'}
)

fig_2d.show()

In [None]:
sns.scatterplot(
    df_marketing_campaign_num_scaled_pca2, 
    x=0, 
    y=1,
    hue=kmeans_model_pca2.labels_,
    alpha=0.4,
    s=40
)

### Without PCA

In [None]:
# Sweep K over 2..14 on the scaled (non-PCA) features, recording inertia and
# silhouette score for the elbow / silhouette plots in the next cell.
sum_of_squared_distances = []
silhouette_scores = []
k = range(2, 15)
for n_k in k:  # a real name instead of `_`, which IPython reserves for the last output
    # Seeded so the curves (and the K chosen from them) are reproducible.
    kmeans_model = KMeans(n_clusters=n_k, n_init='auto', random_state=42)
    y = kmeans_model.fit_predict(df_marketing_campaign_num_scaled)
    sum_of_squared_distances.append(kmeans_model.inertia_)
    silhouette_scores.append(silhouette_score(df_marketing_campaign_num_scaled, y))
  

In [None]:
# Inertia and silhouette curves over K for the scaled features without PCA.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
inertia_ax, silhouette_ax = axes

inertia_ax.plot(k, sum_of_squared_distances, 'bx-')
inertia_ax.set(title='Without PCA', xlabel='K', ylabel='Inertia')

silhouette_ax.plot(k, silhouette_scores, 'rx-')
silhouette_ax.set(title='Without PCA', xlabel='K', ylabel='Silhouette Scores')

plt.show()

In [None]:
# Final K-Means fit (K=5) on the scaled features without PCA.
# Seeded so the labels stored in the dataframe are the same on every run.
kmeans_model = KMeans(n_clusters=5, n_init='auto', random_state=42)
y = kmeans_model.fit_predict(df_marketing_campaign_num_scaled)
print(silhouette_score(df_marketing_campaign_num_scaled, y))

In [None]:
df_marketing_campaign['kmeans'] = y

In [None]:
print('kmeans')
print('\n')
print(df_marketing_campaign['kmeans'].value_counts())
df_marketing_campaign['kmeans'].value_counts().plot(kind='bar')
plt.show()
print('\n')

## Agglomerative Clustering

### 2 PCA

In [None]:
# Ward-linkage dendrogram of the 2-component PCA projection.
ward_linkage_pca2 = linkage(df_marketing_campaign_num_scaled_pca2, method='ward')

fig = plt.figure(figsize=(10, 10))
dendrogram_plot = dendrogram(ward_linkage_pca2)
plt.title('Dendrogram')
plt.xlabel('Clusters')
plt.ylabel('Euclidean Distance')
plt.show()

In [None]:
# Ward agglomerative clustering of the 2-component PCA projection for 2..9 clusters.
# Models are built in one pass; every model/label keeps its own notebook-level
# name so that any cell referring to them individually keeps working.
ward_models_pca2 = [
    AgglomerativeClustering(n_clusters=n_groups, metric='euclidean', linkage='ward')
    for n_groups in range(2, 10)
]
(
    agglomerative_model_c2_pca2,
    agglomerative_model_c3_pca2,
    agglomerative_model_c4_pca2,
    agglomerative_model_c5_pca2,
    agglomerative_model_c6_pca2,
    agglomerative_model_c7_pca2,
    agglomerative_model_c8_pca2,
    agglomerative_model_c9_pca2,
) = ward_models_pca2

ward_labels_pca2 = [
    model.fit_predict(df_marketing_campaign_num_scaled_pca2)
    for model in ward_models_pca2
]
(
    y_c2_pca2,
    y_c3_pca2,
    y_c4_pca2,
    y_c5_pca2,
    y_c6_pca2,
    y_c7_pca2,
    y_c8_pca2,
    y_c9_pca2,
) = ward_labels_pca2

# Silhouette score of each candidate partition.
for n_groups, candidate_labels in zip(range(2, 10), ward_labels_pca2):
    print(f'{n_groups} clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca2,candidate_labels)}')

# The 3-cluster partition is the one stored for this projection.
df_marketing_campaign['ac_pca2'] = y_c3_pca2

In [None]:
# df_marketing_campaign_num_scaled_pca2['kmeans_pca2'] = df_marketing_campaign['kmeans_pca2']
fig_2d = px.scatter(
    df_marketing_campaign_num_scaled_pca2, x=0, y=1,
    color=agglomerative_model_c3_pca2.labels_, labels={'color': 'class'}
)

fig_2d.show()

In [None]:
sns.scatterplot(
    df_marketing_campaign_num_scaled_pca2, 
    x=0, 
    y=1,
    hue=agglomerative_model_c3_pca2.labels_,
    alpha=0.4,
    s=40
)

In [None]:
# Cluster sizes of the stored agglomerative (2 PCA) labels: table plus bar chart.
ac_pca2_counts = df_marketing_campaign['ac_pca2'].value_counts()

print('ac_pca2')
print('\n')
print(ac_pca2_counts)
ac_pca2_counts.plot(kind='bar')
plt.show()
print('\n')

### 3 PCA

In [None]:
# Ward-linkage dendrogram of the 3-component PCA projection.
ward_linkage_pca3 = linkage(df_marketing_campaign_num_scaled_pca3, method='ward')

fig = plt.figure(figsize=(10, 10))
dendrogram_plot = dendrogram(ward_linkage_pca3)
plt.title('Dendrogram')
plt.xlabel('Clusters')
plt.ylabel('Euclidean Distance')
plt.show()

In [None]:
# Ward agglomerative clustering of the 3-component PCA projection for 2..9 clusters.
# Every model/label keeps its own notebook-level name so that any cell referring
# to them individually keeps working.
ward_models_pca3 = [
    AgglomerativeClustering(n_clusters=n_groups, metric='euclidean', linkage='ward')
    for n_groups in range(2, 10)
]
(
    agglomerative_model_c2_pca3,
    agglomerative_model_c3_pca3,
    agglomerative_model_c4_pca3,
    agglomerative_model_c5_pca3,
    agglomerative_model_c6_pca3,
    agglomerative_model_c7_pca3,
    agglomerative_model_c8_pca3,
    agglomerative_model_c9_pca3,
) = ward_models_pca3

ward_labels_pca3 = [
    model.fit_predict(df_marketing_campaign_num_scaled_pca3)
    for model in ward_models_pca3
]
(
    y_c2_pca3,
    y_c3_pca3,
    y_c4_pca3,
    y_c5_pca3,
    y_c6_pca3,
    y_c7_pca3,
    y_c8_pca3,
    y_c9_pca3,
) = ward_labels_pca3

# Silhouette score of each candidate partition.
for n_groups, candidate_labels in zip(range(2, 10), ward_labels_pca3):
    print(f'{n_groups} clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca3,candidate_labels)}')

# The 4-cluster partition is the one stored for this projection.
df_marketing_campaign['ac_pca3'] = y_c4_pca3

In [None]:
# Cluster sizes of the stored agglomerative (3 PCA) labels: table plus bar chart.
ac_pca3_counts = df_marketing_campaign['ac_pca3'].value_counts()

print('ac_pca3')
print('\n')
print(ac_pca3_counts)
ac_pca3_counts.plot(kind='bar')
plt.show()
print('\n')

### 4 PCA

In [None]:
# Ward-linkage dendrogram of the 4-component PCA projection.
ward_linkage_pca4 = linkage(df_marketing_campaign_num_scaled_pca4, method='ward')

fig = plt.figure(figsize=(10, 10))
dendrogram_plot = dendrogram(ward_linkage_pca4)
plt.title('Dendrogram')
plt.xlabel('Clusters')
plt.ylabel('Euclidean Distance')
plt.show()

In [None]:
# Ward agglomerative clustering of the 4-component PCA projection for 2..9 clusters.
# Every model/label keeps its own notebook-level name so that any cell referring
# to them individually keeps working.
ward_models_pca4 = [
    AgglomerativeClustering(n_clusters=n_groups, metric='euclidean', linkage='ward')
    for n_groups in range(2, 10)
]
(
    agglomerative_model_c2_pca4,
    agglomerative_model_c3_pca4,
    agglomerative_model_c4_pca4,
    agglomerative_model_c5_pca4,
    agglomerative_model_c6_pca4,
    agglomerative_model_c7_pca4,
    agglomerative_model_c8_pca4,
    agglomerative_model_c9_pca4,
) = ward_models_pca4

ward_labels_pca4 = [
    model.fit_predict(df_marketing_campaign_num_scaled_pca4)
    for model in ward_models_pca4
]
(
    y_c2_pca4,
    y_c3_pca4,
    y_c4_pca4,
    y_c5_pca4,
    y_c6_pca4,
    y_c7_pca4,
    y_c8_pca4,
    y_c9_pca4,
) = ward_labels_pca4

# Silhouette score of each candidate partition.
for n_groups, candidate_labels in zip(range(2, 10), ward_labels_pca4):
    print(f'{n_groups} clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca4,candidate_labels)}')

# The 4-cluster partition is the one stored for this projection.
df_marketing_campaign['ac_pca4'] = y_c4_pca4

In [None]:
# Cluster sizes of the stored agglomerative (4 PCA) labels: table plus bar chart.
ac_pca4_counts = df_marketing_campaign['ac_pca4'].value_counts()

print('ac_pca4')
print('\n')
print(ac_pca4_counts)
ac_pca4_counts.plot(kind='bar')
plt.show()
print('\n')

### 5 PCA

In [None]:
# Ward-linkage dendrogram of the 5-component PCA projection.
ward_linkage_pca5 = linkage(df_marketing_campaign_num_scaled_pca5, method='ward')

fig = plt.figure(figsize=(10, 10))
dendrogram_plot = dendrogram(ward_linkage_pca5)
plt.title('Dendrogram')
plt.xlabel('Clusters')
plt.ylabel('Euclidean Distance')
plt.show()

In [None]:
# Ward agglomerative clustering of the 5-component PCA projection for 2..9 clusters.
# Every model/label keeps its own notebook-level name so that any cell referring
# to them individually keeps working.
ward_models_pca5 = [
    AgglomerativeClustering(n_clusters=n_groups, metric='euclidean', linkage='ward')
    for n_groups in range(2, 10)
]
(
    agglomerative_model_c2_pca5,
    agglomerative_model_c3_pca5,
    agglomerative_model_c4_pca5,
    agglomerative_model_c5_pca5,
    agglomerative_model_c6_pca5,
    agglomerative_model_c7_pca5,
    agglomerative_model_c8_pca5,
    agglomerative_model_c9_pca5,
) = ward_models_pca5

ward_labels_pca5 = [
    model.fit_predict(df_marketing_campaign_num_scaled_pca5)
    for model in ward_models_pca5
]
(
    y_c2_pca5,
    y_c3_pca5,
    y_c4_pca5,
    y_c5_pca5,
    y_c6_pca5,
    y_c7_pca5,
    y_c8_pca5,
    y_c9_pca5,
) = ward_labels_pca5

# Silhouette score of each candidate partition.
for n_groups, candidate_labels in zip(range(2, 10), ward_labels_pca5):
    print(f'{n_groups} clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca5,candidate_labels)}')

# The 4-cluster partition is the one stored for this projection.
df_marketing_campaign['ac_pca5'] = y_c4_pca5

In [None]:
# Cluster sizes of the stored agglomerative (5 PCA) labels: table plus bar chart.
ac_pca5_counts = df_marketing_campaign['ac_pca5'].value_counts()

print('ac_pca5')
print('\n')
print(ac_pca5_counts)
ac_pca5_counts.plot(kind='bar')
plt.show()
print('\n')

### Without PCA

In [None]:
# Ward-linkage dendrogram of the scaled features without PCA.
ward_linkage_no_pca = linkage(df_marketing_campaign_num_scaled, method='ward')

fig = plt.figure(figsize=(10, 10))
dendrogram_plot = dendrogram(ward_linkage_no_pca)
plt.title('Dendrogram')
plt.xlabel('Clusters')
plt.ylabel('Euclidean Distance')
plt.show()

In [None]:
# Ward agglomerative clustering of the scaled features (no PCA) for 2..9 clusters.
# Every model/label keeps its own notebook-level name so that any cell referring
# to them individually keeps working.
ward_models_no_pca = [
    AgglomerativeClustering(n_clusters=n_groups, metric='euclidean', linkage='ward')
    for n_groups in range(2, 10)
]
(
    agglomerative_model_c2,
    agglomerative_model_c3,
    agglomerative_model_c4,
    agglomerative_model_c5,
    agglomerative_model_c6,
    agglomerative_model_c7,
    agglomerative_model_c8,
    agglomerative_model_c9,
) = ward_models_no_pca

ward_labels_no_pca = [
    model.fit_predict(df_marketing_campaign_num_scaled)
    for model in ward_models_no_pca
]
(
    y_c2,
    y_c3,
    y_c4,
    y_c5,
    y_c6,
    y_c7,
    y_c8,
    y_c9,
) = ward_labels_no_pca

# Silhouette score of each candidate partition.
for n_groups, candidate_labels in zip(range(2, 10), ward_labels_no_pca):
    print(f'{n_groups} clusters: {silhouette_score(df_marketing_campaign_num_scaled,candidate_labels)}')

# The 4-cluster partition is the one stored for the non-PCA features.
df_marketing_campaign['ac'] = y_c4

In [None]:
# Cluster sizes of the stored agglomerative (no PCA) labels: table plus bar chart.
ac_counts = df_marketing_campaign['ac'].value_counts()

print('ac')
print('\n')
print(ac_counts)
ac_counts.plot(kind='bar')
plt.show()
print('\n')

## DBSCAN

### 2 PCA

In [None]:
nn_model = NearestNeighbors(n_neighbors=2)
nn_fit = nn_model.fit(df_marketing_campaign_num_scaled_pca2)
distances, indices = nn_fit.kneighbors(df_marketing_campaign_num_scaled_pca2)

In [None]:
distances = np.sort(distances, axis=0)
distances[:,1]
plt.figure(figsize=(10,10))
plt.plot(distances)
plt.show()

In [None]:
eps_values = np.arange(0.1, 1, 0.1)
min_samples = np.arange(2,10)

In [None]:
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for p in dbscan_params:
  y_pred = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(df_marketing_campaign_num_scaled_pca2)
  sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled_pca2,y_pred))
  n_clusters.append(len(np.unique(y_pred)))

In [None]:
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples'])
df_param_tunning['sil_scores'] = sil_scores
df_param_tunning['n_clusters'] = n_clusters

In [None]:
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_scores', columns='eps', index='min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='eps', index='min_samples')

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
dbscan_model = DBSCAN(eps=0.2, min_samples=7)
y_dbscan_pca2 = dbscan_model.fit_predict(df_marketing_campaign_num_scaled_pca2)

print(silhouette_score(df_marketing_campaign_num_scaled_pca2, y_dbscan_pca2))
df_marketing_campaign['DBSCAN_pca2'] = y_dbscan_pca2

In [None]:
print('DBSCAN_pca2')
print('\n')
print(df_marketing_campaign['DBSCAN_pca2'].value_counts())
df_marketing_campaign['DBSCAN_pca2'].value_counts().plot(kind='bar')
plt.show()
print('\n')

In [None]:
fig_2d = px.scatter(
    df_marketing_campaign_num_scaled_pca2, x=0, y=1,
    color=y_dbscan_pca2, labels={'color': 'class'}
)

fig_2d.show()

In [None]:
sns.scatterplot(
    df_marketing_campaign_num_scaled_pca2, 
    x=0, 
    y=1,
    hue=y_dbscan_pca2,
    alpha=0.4,
    s=40
)

### 3 PCA

In [None]:
nn_model = NearestNeighbors(n_neighbors=2)
nn_fit = nn_model.fit(df_marketing_campaign_num_scaled_pca3)
distances, indices = nn_fit.kneighbors(df_marketing_campaign_num_scaled_pca3)

In [None]:
distances = np.sort(distances, axis=0)
distances[:,1]
plt.figure(figsize=(10,10))
plt.plot(distances)
plt.show()

In [None]:
eps_values = np.arange(0.3, 1.3, 0.1)
min_samples = np.arange(2,10)

In [None]:
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for p in dbscan_params:
  y_pred = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(df_marketing_campaign_num_scaled_pca3)
  sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled_pca3,y_pred))
  n_clusters.append(len(np.unique(y_pred)))

In [None]:
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples'])
df_param_tunning['sil_scores'] = sil_scores
df_param_tunning['n_clusters'] = n_clusters

In [None]:
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_scores', columns='eps', index='min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='eps', index='min_samples')

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
dbscan_model = DBSCAN(eps=1.2, min_samples=4)
y_dbscan_pca3 = dbscan_model.fit_predict(df_marketing_campaign_num_scaled_pca3)

print(silhouette_score(df_marketing_campaign_num_scaled_pca3, y_dbscan_pca3))
df_marketing_campaign['DBSCAN_pca3'] = y_dbscan_pca3

In [None]:
print('DBSCAN_pca3')
print('\n')
print(df_marketing_campaign['DBSCAN_pca3'].value_counts())
df_marketing_campaign['DBSCAN_pca3'].value_counts().plot(kind='bar')
plt.show()
print('\n')

### 4 PCA

In [None]:
nn_model = NearestNeighbors(n_neighbors=2)
nn_fit = nn_model.fit(df_marketing_campaign_num_scaled_pca4)
distances, indices = nn_fit.kneighbors(df_marketing_campaign_num_scaled_pca4)

In [None]:
distances = np.sort(distances, axis=0)
distances[:,1]
plt.figure(figsize=(10,10))
plt.plot(distances)
plt.show()

In [None]:
eps_values = np.arange(0.5, 1.5, 0.1)
min_samples = np.arange(2,10)

In [None]:
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for p in dbscan_params:
  y_pred = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(df_marketing_campaign_num_scaled_pca4)
  sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled_pca4,y_pred))
  n_clusters.append(len(np.unique(y_pred)))

In [None]:
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples'])
df_param_tunning['sil_scores'] = sil_scores
df_param_tunning['n_clusters'] = n_clusters

In [None]:
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_scores', columns='eps', index='min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='eps', index='min_samples')

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
dbscan_model = DBSCAN(eps=1.4, min_samples=6)
y_dbscan_pca4 = dbscan_model.fit_predict(df_marketing_campaign_num_scaled_pca4)

print(silhouette_score(df_marketing_campaign_num_scaled_pca4, y_dbscan_pca4))
df_marketing_campaign['DBSCAN_pca4'] = y_dbscan_pca4

In [None]:
print('DBSCAN_pca4')
print('\n')
print(df_marketing_campaign['DBSCAN_pca4'].value_counts())
df_marketing_campaign['DBSCAN_pca4'].value_counts().plot(kind='bar')
plt.show()
print('\n')

### 5 PCA

In [None]:
nn_model = NearestNeighbors(n_neighbors=2)
nn_fit = nn_model.fit(df_marketing_campaign_num_scaled_pca5)
distances, indices = nn_fit.kneighbors(df_marketing_campaign_num_scaled_pca5)

In [None]:
distances = np.sort(distances, axis=0)
distances[:,1]
plt.figure(figsize=(10,10))
plt.plot(distances)
plt.show()

In [None]:
eps_values = np.arange(1, 2, 0.1)
min_samples = np.arange(2,10)

In [None]:
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for p in dbscan_params:
  y_pred = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(df_marketing_campaign_num_scaled_pca5)
  sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled_pca5,y_pred))
  n_clusters.append(len(np.unique(y_pred)))

In [None]:
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples'])
df_param_tunning['sil_scores'] = sil_scores
df_param_tunning['n_clusters'] = n_clusters

In [None]:
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_scores', columns='eps', index='min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='eps', index='min_samples')

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
dbscan_model = DBSCAN(eps=1.9, min_samples=3)
y_dbscan_pca5 = dbscan_model.fit_predict(df_marketing_campaign_num_scaled_pca5)

print(silhouette_score(df_marketing_campaign_num_scaled_pca5, y_dbscan_pca5))
df_marketing_campaign['DBSCAN_pca5'] = y_dbscan_pca5

In [None]:
print('DBSCAN_pca5')
print('\n')
print(df_marketing_campaign['DBSCAN_pca5'].value_counts())
df_marketing_campaign['DBSCAN_pca5'].value_counts().plot(kind='bar')
plt.show()
print('\n')

### Without PCA

In [None]:
nn_model = NearestNeighbors(n_neighbors=2)
nn_fit = nn_model.fit(df_marketing_campaign_num_scaled)
distances, indices = nn_fit.kneighbors(df_marketing_campaign_num_scaled)

In [None]:
distances = np.sort(distances, axis=0)
distances[:,1]
plt.figure(figsize=(10,10))
plt.plot(distances)
plt.show()

In [None]:
eps_values = np.arange(4, 6, 0.1)
min_samples = np.arange(2,10)

In [None]:
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for p in dbscan_params:
  y_pred = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(df_marketing_campaign_num_scaled)
  sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled,y_pred))
  n_clusters.append(len(np.unique(y_pred)))

In [None]:
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples'])
df_param_tunning['sil_scores'] = sil_scores
df_param_tunning['n_clusters'] = n_clusters

In [None]:
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_scores', columns='eps', index='min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='eps', index='min_samples')

In [None]:
fig, ax = plt.subplots(figsize=(26,6))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

In [None]:
dbscan_model = DBSCAN(eps=5.9, min_samples=6)
y_dbscan = dbscan_model.fit_predict(df_marketing_campaign_num_scaled)

print(silhouette_score(df_marketing_campaign_num_scaled, y_dbscan))
df_marketing_campaign['DBSCAN'] = y_dbscan

In [None]:
print('DBSCAN')
print('\n')
print(df_marketing_campaign['DBSCAN'].value_counts())
df_marketing_campaign['DBSCAN'].value_counts().plot(kind='bar')
plt.show()
print('\n')

## Evaluation

In [None]:
# Columns of the full dataframe, including the cluster-label columns added above.
df_marketing_campaign.columns

In [None]:
# Columns of df_marketing_campaign_num, the frame the correlation matrices below are built from.
df_marketing_campaign_num.columns

### K-Means

In [None]:
# Number of customers assigned to each K-Means (2 PCA) cluster.
df_marketing_campaign['kmeans_pca2'].value_counts()

In [None]:
# Features to profile per cluster; the cluster-label column is appended last.
kmeans_profile_features = [
    'Income', 'Kidhome', 'Teenhome', 'Recency', 'NumDealsPurchases',
    'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response', 'Decade_Born',
    'Education_int', 'WithPartner', 'Dt_Customer_diff_days',
    'MntTotalSpent', 'NumTotalPurchases', 'AcceptedOfferAnyCmp', 'IsParent',
    'ProductMostSpent_int', 'ProductLeastSpent_int',
]
df_aux_displot = df_marketing_campaign[kmeans_profile_features + ['kmeans_pca2']].copy()

In [None]:
# One panel per profiled feature, three panels per row.
n_cols = 3
n_rows = int(np.ceil((df_aux_displot.shape[1] - 1) / n_cols))
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, n_rows * 6))

for i, column in enumerate(df_aux_displot.columns):
    # The cluster label is the grouping variable, not a feature to profile.
    if column == 'kmeans_pca2':
        continue

    panel = axes[i // n_cols, i % n_cols]
    feature_with_cluster = df_marketing_campaign[[column, 'kmeans_pca2']]

    if len(df_marketing_campaign[column].unique()) > 10:
        # Many distinct values: compare per-cluster distributions with boxen plots.
        sns.boxenplot(
            feature_with_cluster,
            x='kmeans_pca2',
            y=column,
            hue='kmeans_pca2',
            palette='rocket',
            ax=panel,
        )
        continue

    # Few distinct values: dodged histogram, normalised within each cluster
    # (common_norm=False). Decade_Born gets a wider bin than the other features.
    var_binwidth = 4 if column == 'Decade_Born' else 0.4
    sns.histplot(
        feature_with_cluster,
        x=column,
        hue='kmeans_pca2',
        multiple='dodge',
        stat='probability',
        common_norm=False,
        binwidth=var_binwidth,
        palette='rocket',
        ax=panel,
    )

In [None]:
# Interactive scatter of the two PCA components, coloured by K-Means label.
fig_2d = px.scatter(
    df_marketing_campaign_num_scaled_pca2,
    x=0,
    y=1,
    color=kmeans_model_pca2.labels_,
    labels={'color': 'class'},
)
fig_2d.show()

#### Correlations more than 0.5

In [None]:

# Feature pairs whose correlation exceeds 0.5, each drawn as a scatter plot
# coloured by K-Means (2 PCA) cluster.
matrix = df_marketing_campaign_num.corr()
strong_corr = np.round(matrix[(matrix > 0.5)].values, 2)
# Blank the diagonal and lower triangle: drops self-correlations and keeps each pair once.
strong_corr[np.tril_indices(strong_corr.shape[0], 0)] = np.nan
matrix[:] = strong_corr
# dropna() makes the NaN removal explicit rather than relying on stack()'s default.
dict_corr_greater_05 = dict(matrix.stack().dropna().items())

n_cols = 3
# At least one row, so plt.subplots does not fail when no pair passes the threshold.
n_rows = max(1, int(np.ceil(len(dict_corr_greater_05) / n_cols)))
# squeeze=False keeps `axes` two-dimensional even with a single row, so the
# [row, col] indexing below works for any number of pairs.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(17, n_rows * 6), squeeze=False)

# Loop names deliberately avoid `x`/`y` so the notebook-level label array `y` is not overwritten.
for i, (feature_x, feature_y) in enumerate(dict_corr_greater_05):
    sns.scatterplot(
        data=df_marketing_campaign[[feature_x, feature_y, 'kmeans_pca2']],
        x=feature_x,
        y=feature_y,
        hue='kmeans_pca2',
        palette='rocket',
        ax=axes[i // n_cols, i % n_cols],
    )

# Hide panels left over when the pair count is not a multiple of n_cols.
for unused_ax in axes.flat[len(dict_corr_greater_05):]:
    unused_ax.set_visible(False)

#### Correlations less than - 0.5

In [None]:

# Feature pairs whose correlation is below -0.5, each drawn as a scatter plot
# coloured by K-Means (2 PCA) cluster.
matrix = df_marketing_campaign_num.corr()
strong_neg_corr = np.round(matrix[(matrix < -0.5)].values, 2)
# Blank the diagonal and lower triangle so each pair is kept once.
strong_neg_corr[np.tril_indices(strong_neg_corr.shape[0], 0)] = np.nan
matrix[:] = strong_neg_corr
# dropna() makes the NaN removal explicit rather than relying on stack()'s default.
dict_corr_less_minus05 = dict(matrix.stack().dropna().items())

n_cols = 3
# At least one row, so plt.subplots does not fail when no pair passes the threshold.
n_rows = max(1, int(np.ceil(len(dict_corr_less_minus05) / n_cols)))
# squeeze=False keeps `axes` two-dimensional even with a single row, so the
# [row, col] indexing below works for any number of pairs.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(17, n_rows * 6), squeeze=False)

# Loop names deliberately avoid `x`/`y` so the notebook-level label array `y` is not overwritten.
for i, (feature_x, feature_y) in enumerate(dict_corr_less_minus05):
    sns.scatterplot(
        data=df_marketing_campaign[[feature_x, feature_y, 'kmeans_pca2']],
        x=feature_x,
        y=feature_y,
        hue='kmeans_pca2',
        palette='rocket',
        ax=axes[i // n_cols, i % n_cols],
    )

# Hide panels left over when the pair count is not a multiple of n_cols.
for unused_ax in axes.flat[len(dict_corr_less_minus05):]:
    unused_ax.set_visible(False)

### Agglomerative Clustering

In [None]:
# Number of customers assigned to each agglomerative (2 PCA) cluster.
df_marketing_campaign['ac_pca2'].value_counts()

In [None]:
# Features to profile per cluster; the cluster-label column is appended last.
ac_profile_features = [
    'Income', 'Kidhome', 'Teenhome', 'Recency', 'NumDealsPurchases',
    'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
    'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response', 'Decade_Born',
    'Education_int', 'WithPartner', 'Dt_Customer_diff_days',
    'MntTotalSpent', 'NumTotalPurchases', 'AcceptedOfferAnyCmp', 'IsParent',
    'ProductMostSpent_int', 'ProductLeastSpent_int',
]
df_aux_displot = df_marketing_campaign[ac_profile_features + ['ac_pca2']].copy()

In [None]:
# One panel per profiled feature, three panels per row.
n_cols = 3
n_rows = int(np.ceil((df_aux_displot.shape[1] - 1) / n_cols))
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, n_rows * 6))

for i, column in enumerate(df_aux_displot.columns):
    # The cluster label is the grouping variable, not a feature to profile.
    if column == 'ac_pca2':
        continue

    panel = axes[i // n_cols, i % n_cols]
    feature_with_cluster = df_marketing_campaign[[column, 'ac_pca2']]

    if len(df_marketing_campaign[column].unique()) > 10:
        # Many distinct values: compare per-cluster distributions with boxen plots.
        sns.boxenplot(
            feature_with_cluster,
            x='ac_pca2',
            y=column,
            hue='ac_pca2',
            palette='rocket',
            ax=panel,
        )
        continue

    # Few distinct values: dodged histogram, normalised within each cluster
    # (common_norm=False). Decade_Born gets a wider bin than the other features.
    var_binwidth = 4 if column == 'Decade_Born' else 0.4
    sns.histplot(
        feature_with_cluster,
        x=column,
        hue='ac_pca2',
        multiple='dodge',
        stat='probability',
        common_norm=False,
        binwidth=var_binwidth,
        palette='rocket',
        ax=panel,
    )

In [None]:
# Interactive scatter of the two PCA components, coloured by the 3-cluster
# agglomerative labels.
fig_2d = px.scatter(
    df_marketing_campaign_num_scaled_pca2,
    x=0,
    y=1,
    color=agglomerative_model_c3_pca2.labels_,
    labels={'color': 'class'},
)
fig_2d.show()

#### Correlations more than 0.5

In [None]:

# Feature pairs whose correlation exceeds 0.5, each drawn as a scatter plot
# coloured by agglomerative (2 PCA) cluster.
matrix = df_marketing_campaign_num.corr()
strong_corr = np.round(matrix[(matrix > 0.5)].values, 2)
# Blank the diagonal and lower triangle: drops self-correlations and keeps each pair once.
strong_corr[np.tril_indices(strong_corr.shape[0], 0)] = np.nan
matrix[:] = strong_corr
# dropna() makes the NaN removal explicit rather than relying on stack()'s default.
dict_corr_greater_05 = dict(matrix.stack().dropna().items())

n_cols = 3
# At least one row, so plt.subplots does not fail when no pair passes the threshold.
n_rows = max(1, int(np.ceil(len(dict_corr_greater_05) / n_cols)))
# squeeze=False keeps `axes` two-dimensional even with a single row, so the
# [row, col] indexing below works for any number of pairs.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(17, n_rows * 6), squeeze=False)

# Loop names deliberately avoid `x`/`y` so the notebook-level label array `y` is not overwritten.
for i, (feature_x, feature_y) in enumerate(dict_corr_greater_05):
    sns.scatterplot(
        data=df_marketing_campaign[[feature_x, feature_y, 'ac_pca2']],
        x=feature_x,
        y=feature_y,
        hue='ac_pca2',
        palette='rocket',
        ax=axes[i // n_cols, i % n_cols],
    )

# Hide panels left over when the pair count is not a multiple of n_cols.
for unused_ax in axes.flat[len(dict_corr_greater_05):]:
    unused_ax.set_visible(False)

#### Correlations less than - 0.5

In [None]:

# Feature pairs whose correlation is below -0.5, each drawn as a scatter plot
# coloured by agglomerative (2 PCA) cluster.
matrix = df_marketing_campaign_num.corr()
strong_neg_corr = np.round(matrix[(matrix < -0.5)].values, 2)
# Blank the diagonal and lower triangle so each pair is kept once.
strong_neg_corr[np.tril_indices(strong_neg_corr.shape[0], 0)] = np.nan
matrix[:] = strong_neg_corr
# dropna() makes the NaN removal explicit rather than relying on stack()'s default.
dict_corr_less_minus05 = dict(matrix.stack().dropna().items())

n_cols = 3
# At least one row, so plt.subplots does not fail when no pair passes the threshold.
n_rows = max(1, int(np.ceil(len(dict_corr_less_minus05) / n_cols)))
# squeeze=False keeps `axes` two-dimensional even with a single row, so the
# [row, col] indexing below works for any number of pairs.
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(17, n_rows * 6), squeeze=False)

# Loop names deliberately avoid `x`/`y` so the notebook-level label array `y` is not overwritten.
for i, (feature_x, feature_y) in enumerate(dict_corr_less_minus05):
    sns.scatterplot(
        data=df_marketing_campaign[[feature_x, feature_y, 'ac_pca2']],
        x=feature_x,
        y=feature_y,
        hue='ac_pca2',
        palette='rocket',
        ax=axes[i // n_cols, i % n_cols],
    )

# Hide panels left over when the pair count is not a multiple of n_cols.
for unused_ax in axes.flat[len(dict_corr_less_minus05):]:
    unused_ax.set_visible(False)