In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.neighbors import NearestNeighbors
from itertools import product

# EDA

Attributes

- People

  - ID: Customer's unique identifier
  - Year_Birth: Customer's birth year
  - Education: Customer's education level
  - Marital_Status: Customer's marital status
  - Income: Customer's yearly household income
  - Kidhome: Number of children in customer's household
  - Teenhome: Number of teenagers in customer's household
  - Dt_Customer: Date of customer's enrollment with the company
  - Recency: Number of days since customer's last purchase
  - Complain: 1 if the customer complained in the last 2 years, 0 otherwise

- Products

  - MntWines: Amount spent on wine in last 2 years
  - MntFruits: Amount spent on fruits in last 2 years
  - MntMeatProducts: Amount spent on meat in last 2 years
  - MntFishProducts: Amount spent on fish in last 2 years
  - MntSweetProducts: Amount spent on sweets in last 2 years
  - MntGoldProds: Amount spent on gold in last 2 years

- Promotion

  - NumDealsPurchases: Number of purchases made with a discount
  - AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise
  - AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
  - AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
  - AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise
  - AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise
  - Response: 1 if customer accepted the offer in the last campaign, 0 otherwise

- Place

  - NumWebPurchases: Number of purchases made through the company’s website
  - NumCatalogPurchases: Number of purchases made using a catalogue
  - NumStorePurchases: Number of purchases made directly in stores
  - NumWebVisitsMonth: Number of visits to company’s website in the last month

In [None]:
# Load the raw campaign data; the file is tab-separated despite its .csv extension.
df_marketing_campaign = pd.read_csv('marketing_campaign.csv', sep='\t')

In [None]:
# Raise the pandas column display limit to the frame's width when it is too
# small to show every column; a limit that is already large enough is kept.
n_columns = df_marketing_campaign.shape[1]
pd.options.display.max_columns = max(pd.options.display.max_columns, n_columns)

In [None]:
# Preview the first rows of the raw data.
df_marketing_campaign.head()

In [None]:
# Column dtypes and non-null counts.
df_marketing_campaign.info()

In [None]:
# Show exactly as many columns as the frame has, then summarise the numeric ones.
pd.set_option('display.max_columns', df_marketing_campaign.shape[1])
df_marketing_campaign.describe()


## Numeric columns

In [None]:
# One 30-bin histogram per numeric column for a first look at the distributions.
histogram_axes = df_marketing_campaign.hist(figsize=(20, 15), bins=30)
plt.show()

In [None]:
# One box plot per numeric column, side by side, with outliers drawn as red dots.
red_circle = dict(markerfacecolor='red', marker='o', markeredgecolor='white')
df_marketing_campaign_num = df_marketing_campaign.select_dtypes(include='number')
# squeeze=False keeps `axs` a 2-D array even if only one numeric column exists.
fig, axs = plt.subplots(1, len(df_marketing_campaign_num.columns), figsize=(85, 5), squeeze=False)
for ax, column in zip(axs.flat, df_marketing_campaign_num.columns):
  # Drop missing values per column: with NaN present the box statistics
  # become NaN and the panel is drawn empty.
  ax.boxplot(df_marketing_campaign_num[column].dropna(), flierprops=red_circle)
  ax.set_title(column, fontsize=20, fontweight='bold')
  ax.tick_params(axis='y', labelsize=14)

plt.tight_layout()
plt.show()



---


List of numeric columns to observe:
1. Year_Birth (we also treat it as a datetime value)
2. Income
3. MntMeatProducts
4. MntSweetProducts
5. NumWebPurchases
6. NumCatalogPurchases
7. NumWebVisitsMonth
8. Z_CostContact
9. Z_Revenue

---

## Object columns

In [None]:
# Summary (count, unique, top, freq) of the object-dtype columns.
df_marketing_campaign.describe(include='object')

`Dt_Customer` holds dates stored as text, so we convert it to a datetime column.

In [None]:
# Parse the enrollment date, given as day-month-year, into a datetime column.
df_marketing_campaign['Dt_Customer'] = pd.to_datetime(df_marketing_campaign['Dt_Customer'], format='%d-%m-%Y')


In [None]:
# Frequency table and bar chart for every categorical (object-dtype) column.
df_marketing_campaign_object = df_marketing_campaign.select_dtypes(include='object')
df_marketing_campaign_object_columns = df_marketing_campaign_object.columns

for col in df_marketing_campaign_object_columns:
  # Count once and reuse the result for both the printout and the chart.
  category_counts = df_marketing_campaign_object[col].value_counts()
  print(col)
  print('\n')
  print(category_counts)
  fig, ax = plt.subplots()
  category_counts.plot(kind='bar', ax=ax, title=col)
  plt.show()
  print('\n')


In [None]:
# Number of customers enrolled on each date, drawn as a line.
enrollments_per_day = df_marketing_campaign['Dt_Customer'].value_counts()
enrollments_per_day.plot(figsize=(15,5))

## Z_CostContact and Z_Revenue are constant. We drop them

In [None]:
# Remove the two constant columns. Reassigning with errors='ignore' (instead
# of an in-place drop) keeps this cell safe to re-run: a second execution
# no longer raises KeyError for columns that are already gone.
df_marketing_campaign = df_marketing_campaign.drop(columns=['Z_CostContact', 'Z_Revenue'], errors='ignore')

In [None]:
# Remove the column display limit altogether, then summarise every column
# regardless of dtype.
pd.options.display.max_columns = None
df_marketing_campaign.describe(include='all')


In [None]:
# Distribution of birth years; used to spot implausible values.
df_marketing_campaign['Year_Birth'].hist()

In [None]:
# Age at enrollment = enrollment year - birth year.
# Year_Birth is a numeric year column, so it is subtracted directly instead of
# being round-tripped through pd.to_datetime(..., format='%Y').
enrollment_year = pd.to_datetime(df_marketing_campaign['Dt_Customer'], format='%d-%m-%Y').dt.year
df_marketing_campaign['age_customer_enrollment'] = enrollment_year - df_marketing_campaign['Year_Birth']

df_marketing_campaign['age_customer_enrollment'].hist(bins=50)
plt.show()

# Tabulate the ages above 60 to help choose an outlier cut-off.
is_older_than_60 = df_marketing_campaign['age_customer_enrollment'] > 60
print(df_marketing_campaign.loc[is_older_than_60, 'age_customer_enrollment'].value_counts().sort_index())

## Drop rows where the age of the customer is more than 73

In [None]:
# Remove customers whose enrollment age exceeds 73, then re-check the
# birth-year distribution.
is_too_old = df_marketing_campaign['age_customer_enrollment'] > 73
index_to_drop = df_marketing_campaign.loc[is_too_old].index

df_marketing_campaign.drop(index=index_to_drop, inplace=True)
df_marketing_campaign['Year_Birth'].hist()

In [None]:
# Re-draw the per-column histograms after the age filter.
histogram_axes = df_marketing_campaign.hist(figsize=(20, 15), bins=30)
plt.show()

## Dealing with NULL values

In [None]:
# Number of missing values in each column.
df_marketing_campaign.isna().sum()

In [None]:
# Number of rows with a missing Income. The previous `.count()` on the
# NaN-only selection always returned 0, because count() ignores nulls.
df_marketing_campaign['Income'].isna().sum()

In [None]:
# Show the rows whose Income is missing.
df_marketing_campaign[df_marketing_campaign['Income'].isna()]

In [None]:
# Discard every row that has a missing value in any column (modifies the frame in place).
df_marketing_campaign.dropna(axis=0, how='any', inplace=True)

In [None]:
# Summary statistics after removing rows with missing values.
df_marketing_campaign.describe()

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
df_marketing_campaign.loc[df_marketing_campaign.duplicated(keep='first')]

In [None]:
# One full-size box plot per numeric column.
# include='number' (rather than exclude='object') keeps the datetime column
# Dt_Customer out of the loop.
int_cols = df_marketing_campaign.select_dtypes(include='number').columns

# A named loop variable is used instead of `_`, which IPython reserves for the
# previous cell output.
for col in int_cols:
  fig, ax = plt.subplots(figsize=(10,10))
  sns.boxplot(data=df_marketing_campaign, y=col, ax=ax)
  ax.set_title(col)
  plt.show()
  plt.close(fig)  # release the figure so the loop does not accumulate open figures

In [None]:
# Pairwise correlations of the numeric columns, drawn as a lower-triangle heatmap.
# Restricting to numeric dtypes is required: since pandas 2.0 DataFrame.corr()
# no longer silently skips string columns and raises instead.
matrix = df_marketing_campaign.select_dtypes(include='number').corr()

# Diverging palette centred on a light colour so that 0 correlation is neutral.
cmap = sns.diverging_palette(250, 15, s=75, l=40,
                          n=9, center="light", as_cmap=True)
# Mask the upper triangle (diagonal included); it mirrors the lower triangle.
mask = np.triu(np.ones_like(matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(16,12))

sns.heatmap(matrix, mask=mask, center=0, annot=True,
          fmt='.2f', square=True, cmap=cmap, ax=ax)
plt.show()

In [None]:
# Distribution of yearly household income.
df_marketing_campaign['Income'].hist()

In [None]:
# Feature matrix for clustering: every numeric column except the identifier.
# include='number' is used because exclude='object' also lets the datetime
# column Dt_Customer through, which is not a numeric feature for the scaler.
# NOTE(review): if this cell is re-run after the clustering sections, the
# stored cluster-label columns are numeric too and would be picked up here.
df_marketing_campaign_num = df_marketing_campaign.drop(columns=['ID']).select_dtypes(include='number')

In [None]:
# Standardise the features to zero mean and unit variance: learn the
# statistics first, then apply them to the same data.
standard_scaler = StandardScaler()
standard_scaler.fit(df_marketing_campaign_num)

array_marketing_campaign_num_scaled = standard_scaler.transform(df_marketing_campaign_num)

In [None]:
# Wrap the scaled array in a DataFrame carrying the original feature names.
# The new frame gets a fresh 0..n-1 index.
df_marketing_campaign_num_scaled = pd.DataFrame(
  data=array_marketing_campaign_num_scaled,
  columns=df_marketing_campaign_num.columns,
)
df_marketing_campaign_num_scaled.head()

In [None]:
# Sanity check of the scaling: summary statistics of the standardised features.
df_marketing_campaign_num_scaled.describe()

## With PCA

In [None]:
# Fit a PCA that keeps all components (no n_components limit) on the scaled features.
pca = PCA().fit(df_marketing_campaign_num_scaled)
pca

In [None]:
# Scree plot: share of variance explained by each principal component (1-based).
PC_values = np.arange(1, pca.n_components_ + 1)
fig, ax = plt.subplots()
ax.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Explained')
plt.show()

In [None]:
# Cumulative explained variance (%) against the number of components kept.
# The x-axis now starts at 1 component (the implicit index started at 0), and
# the ratios are summed unrounded so rounding error does not accumulate.
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)
plt.plot(component_counts, cumulative_variance_pct, 'o-')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative variance explained (%)')
plt.show()


In [None]:
# Project the scaled features onto the principal components and keep the
# leading 10 and 15 of them.
# The end of an iloc slice is exclusive, so `:10` / `:15` give exactly 10 / 15
# columns; the former `0:11` / `0:16` kept 11 / 16, contradicting the names.
array_marketing_campaign_num_scaled_pca = pca.transform(df_marketing_campaign_num_scaled)
df_marketing_campaign_num_scaled_pca = pd.DataFrame(array_marketing_campaign_num_scaled_pca)
df_marketing_campaign_num_scaled_pca10 = df_marketing_campaign_num_scaled_pca.iloc[:, :10]
df_marketing_campaign_num_scaled_pca15 = df_marketing_campaign_num_scaled_pca.iloc[:, :15]

In [None]:
# Sweep K for KMeans on both PCA feature sets, recording inertia (elbow
# method) and silhouette score for each K.
sum_of_squared_distances_pca10 = []
silhouette_scores_pca10 = []
sum_of_squared_distances_pca15 = []
silhouette_scores_pca15 = []
k = range(2,15)
for n_k in k:
  # random_state makes the sweep reproducible across runs; n_init is set
  # explicitly because its default changed between scikit-learn releases.
  kmeans_model_pca10 = KMeans(n_clusters=n_k, n_init=10, random_state=42)
  kmeans_model_pca15 = KMeans(n_clusters=n_k, n_init=10, random_state=42)
  y_pca10 = kmeans_model_pca10.fit_predict(df_marketing_campaign_num_scaled_pca10)
  y_pca15 = kmeans_model_pca15.fit_predict(df_marketing_campaign_num_scaled_pca15)
  sum_of_squared_distances_pca10.append(kmeans_model_pca10.inertia_)
  sum_of_squared_distances_pca15.append(kmeans_model_pca15.inertia_)
  silhouette_scores_pca10.append(silhouette_score(df_marketing_campaign_num_scaled_pca10, y_pca10))
  silhouette_scores_pca15.append(silhouette_score(df_marketing_campaign_num_scaled_pca15, y_pca15))

In [None]:
# Elbow plot for the PCA-10 features. The swept K values are passed as x so
# the axis shows the real cluster counts rather than list positions 0..12.
plt.figure(figsize=(8,8))
plt.plot(k, sum_of_squared_distances_pca10, 'bx-')
plt.title('KMeans inertia (PCA 10)')
plt.xlabel('K')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Silhouette score per K for the PCA-10 features. The swept K values are
# passed as x so the axis shows the real cluster counts, not list positions.
plt.figure(figsize=(8,8))
plt.plot(k, silhouette_scores_pca10, 'rx-')
plt.title('KMeans silhouette scores (PCA 10)')
plt.xlabel('K')
plt.ylabel('Silhouette Scores')
plt.show()

In [None]:
# Elbow plot for the PCA-15 features. The swept K values are passed as x so
# the axis shows the real cluster counts rather than list positions 0..12.
plt.figure(figsize=(8,8))
plt.plot(k, sum_of_squared_distances_pca15, 'bx-')
plt.title('KMeans inertia (PCA 15)')
plt.xlabel('K')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Silhouette score per K for the PCA-15 features. The swept K values are
# passed as x so the axis shows the real cluster counts, not list positions.
plt.figure(figsize=(8,8))
plt.plot(k, silhouette_scores_pca15, 'rx-')
plt.title('KMeans silhouette scores (PCA 15)')
plt.xlabel('K')
plt.ylabel('Silhouette Scores')
plt.show()

In [None]:
# Final 3-cluster KMeans on the PCA-10 features.
# random_state (with an explicit n_init) makes the stored labels reproducible.
kmeans_model_pca10 = KMeans(n_clusters=3, n_init=10, random_state=42)
y_pca10 = kmeans_model_pca10.fit_predict(df_marketing_campaign_num_scaled_pca10)
print(silhouette_score(df_marketing_campaign_num_scaled_pca10, y_pca10))

In [None]:
# Final 3-cluster KMeans on the PCA-15 features.
# random_state (with an explicit n_init) makes the stored labels reproducible.
kmeans_model_pca15 = KMeans(n_clusters=3, n_init=10, random_state=42)
y_pca15 = kmeans_model_pca15.fit_predict(df_marketing_campaign_num_scaled_pca15)
print(silhouette_score(df_marketing_campaign_num_scaled_pca15, y_pca15))

In [None]:
# Attach both KMeans labelings to the customer table, one column per feature set.
for label_column, cluster_labels in (('kmeans_pca10', y_pca10), ('kmeans_pca15', y_pca15)):
  df_marketing_campaign[label_column] = cluster_labels

## Without PCA

In [None]:
# Sweep K for KMeans on the full scaled feature set, recording inertia and
# silhouette score for each K.
sum_of_squared_distances = []
silhouette_scores = []
k = range(2,15)
for n_k in k:
  # random_state makes the sweep reproducible across runs; n_init is set
  # explicitly because its default changed between scikit-learn releases.
  kmeans_model = KMeans(n_clusters=n_k, n_init=10, random_state=42)
  y = kmeans_model.fit_predict(df_marketing_campaign_num_scaled)
  sum_of_squared_distances.append(kmeans_model.inertia_)
  silhouette_scores.append(silhouette_score(df_marketing_campaign_num_scaled, y))
  

In [None]:
# Elbow plot for the full feature set. The swept K values are passed as x so
# the axis shows the real cluster counts rather than list positions 0..12.
plt.figure(figsize=(8,8))
plt.plot(k, sum_of_squared_distances, 'bx-')
plt.title('KMeans inertia (all features)')
plt.xlabel('K')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Silhouette score per K for the full feature set. The swept K values are
# passed as x so the axis shows the real cluster counts, not list positions.
plt.figure(figsize=(8,8))
plt.plot(k, silhouette_scores, 'rx-')
plt.title('KMeans silhouette scores (all features)')
plt.xlabel('K')
plt.ylabel('Silhouette Scores')
plt.show()

In [None]:
# Final 2-cluster KMeans on the full scaled feature set.
# random_state (with an explicit n_init) makes the stored labels reproducible.
kmeans_model = KMeans(n_clusters=2, n_init=10, random_state=42)
y = kmeans_model.fit_predict(df_marketing_campaign_num_scaled)
print(silhouette_score(df_marketing_campaign_num_scaled, y))

In [None]:
# Store the KMeans labels computed on the full scaled feature set.
df_marketing_campaign['kmeans'] = y

# Agglomerative Clustering

## PCA 10

In [None]:
# Ward-linkage dendrogram of the PCA-10 features.
fig = plt.figure(figsize=(10,10))
ward_linkage_pca10 = linkage(df_marketing_campaign_num_scaled_pca10, method='ward')
dendrogram_plot = dendrogram(ward_linkage_pca10)
plt.ylabel('Euclidean Distance')
plt.xlabel('Clusters')
plt.title('Dendrogram')
plt.show()

In [None]:
# Ward-linkage agglomerative clustering on the PCA-10 features for 2, 3 and 7
# clusters; the 2-cluster labels are stored on the customer table.
# The `affinity` argument is omitted: it was deprecated in scikit-learn 1.2
# and removed in 1.4 (TypeError), and Euclidean distance, the only metric Ward
# linkage supports, is already the default.
agglomerative_model_c2_pca10 = AgglomerativeClustering(n_clusters=2, linkage='ward')
agglomerative_model_c3_pca10 = AgglomerativeClustering(n_clusters=3, linkage='ward')
agglomerative_model_c7_pca10 = AgglomerativeClustering(n_clusters=7, linkage='ward')

y_c2_pca10 = agglomerative_model_c2_pca10.fit_predict(df_marketing_campaign_num_scaled_pca10)
y_c3_pca10 = agglomerative_model_c3_pca10.fit_predict(df_marketing_campaign_num_scaled_pca10)
y_c7_pca10 = agglomerative_model_c7_pca10.fit_predict(df_marketing_campaign_num_scaled_pca10)

print(f'2 clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca10,y_c2_pca10)}')
print(f'3 clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca10,y_c3_pca10)}')
print(f'7 clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca10,y_c7_pca10)}')

df_marketing_campaign['ac_pca10'] = y_c2_pca10

## PCA 15

In [None]:
# Ward-linkage dendrogram of the PCA-15 features.
fig = plt.figure(figsize=(10,10))
ward_linkage_pca15 = linkage(df_marketing_campaign_num_scaled_pca15, method='ward')
dendrogram_plot = dendrogram(ward_linkage_pca15)
plt.ylabel('Euclidean Distance')
plt.xlabel('Clusters')
plt.title('Dendrogram')
plt.show()

In [None]:
# Ward-linkage agglomerative clustering (2 clusters) on the PCA-15 features.
# The `affinity` argument is omitted: it was deprecated in scikit-learn 1.2
# and removed in 1.4 (TypeError), and Euclidean distance, the only metric Ward
# linkage supports, is already the default.
agglomerative_model_c2_pca15 = AgglomerativeClustering(n_clusters=2, linkage='ward')

y_c2_pca15 = agglomerative_model_c2_pca15.fit_predict(df_marketing_campaign_num_scaled_pca15)

print(f'2 clusters: {silhouette_score(df_marketing_campaign_num_scaled_pca15,y_c2_pca15)}')

df_marketing_campaign['ac_pca15'] = y_c2_pca15

## Without PCA

In [None]:
# Ward-linkage dendrogram of the full scaled feature set.
fig = plt.figure(figsize=(10,10))
ward_linkage_full = linkage(df_marketing_campaign_num_scaled, method='ward')
dendrogram_plot = dendrogram(ward_linkage_full)
plt.ylabel('Euclidean Distance')
plt.xlabel('Clusters')
plt.title('Dendrogram')
plt.show()

In [None]:
# Ward-linkage agglomerative clustering on the full scaled feature set for 2,
# 3 and 7 clusters; the 2-cluster labels are stored on the customer table.
# The `affinity` argument is omitted: it was deprecated in scikit-learn 1.2
# and removed in 1.4 (TypeError), and Euclidean distance, the only metric Ward
# linkage supports, is already the default.
agglomerative_model_c2 = AgglomerativeClustering(n_clusters=2, linkage='ward')
agglomerative_model_c3 = AgglomerativeClustering(n_clusters=3, linkage='ward')
agglomerative_model_c7 = AgglomerativeClustering(n_clusters=7, linkage='ward')

y_c2 = agglomerative_model_c2.fit_predict(df_marketing_campaign_num_scaled)
y_c3 = agglomerative_model_c3.fit_predict(df_marketing_campaign_num_scaled)
y_c7 = agglomerative_model_c7.fit_predict(df_marketing_campaign_num_scaled)

print(f'2 clusters: {silhouette_score(df_marketing_campaign_num_scaled,y_c2)}')
print(f'3 clusters: {silhouette_score(df_marketing_campaign_num_scaled,y_c3)}')
print(f'7 clusters: {silhouette_score(df_marketing_campaign_num_scaled,y_c7)}')

df_marketing_campaign['ac'] = y_c2


# DBSCAN

## PCA 10

In [None]:
# Distance from every point to its 2 nearest neighbours in the PCA-10 space.
# The queried points are the fitted points themselves.
nn_model = NearestNeighbors(n_neighbors=2)
nn_model.fit(df_marketing_campaign_num_scaled_pca10)
nn_fit = nn_model  # fit() returns the estimator itself
distances, indices = nn_model.kneighbors(df_marketing_campaign_num_scaled_pca10)

In [None]:
# k-distance curve used to pick DBSCAN's eps (look for the knee).
# Only column 1 is drawn: the query points are the fitted points, so column 0
# is each point's zero distance to itself and only adds a flat line.
distances = np.sort(distances, axis=0)
plt.figure(figsize=(10,10))
plt.plot(distances[:,1])
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to nearest neighbour')
plt.show()

In [None]:
# DBSCAN search grid for the PCA-10 features.
# np.arange with a 0.1 float step yields inexact values (e.g. 3.6000000000000005)
# that end up as pivot and heatmap labels, so eps is rounded to one decimal.
eps_values = np.round(np.arange(3.5, 5, 0.1), 1)
min_samples = np.arange(2,10)

In [None]:
# Grid search over (eps, min_samples) for DBSCAN on the PCA-10 features.
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for eps, min_pts in dbscan_params:
  y_pred = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(df_marketing_campaign_num_scaled_pca10)
  distinct_labels = set(y_pred)
  # silhouette_score raises ValueError unless at least 2 distinct labels
  # exist; record NaN for such settings instead of aborting the whole sweep.
  if len(distinct_labels) > 1:
    sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled_pca10,y_pred))
  else:
    sil_scores.append(np.nan)
  # DBSCAN labels noise points -1; noise is not a cluster, so it is not counted.
  n_clusters.append(len(distinct_labels - {-1}))

In [None]:
# One row per (eps, min_samples) pair together with its sweep results.
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples']).assign(
  sil_scores=sil_scores,
  n_clusters=n_clusters,
)

In [None]:
# Reshape the sweep results into min_samples x eps grids for the heatmaps.
pivot_1 = df_param_tunning.pivot_table(values='sil_scores', index='min_samples', columns='eps')
pivot_2 = df_param_tunning.pivot_table(values='n_clusters', index='min_samples', columns='eps')

In [None]:
# Silhouette score for each (min_samples, eps) combination.
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(
  pivot_1,
  ax=ax,
  cmap='coolwarm',
  annot=True,
  annot_kws={'size':10},
)
plt.show()

In [None]:
# Value of n_clusters for each (min_samples, eps) combination.
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(
  pivot_2,
  ax=ax,
  cmap='coolwarm',
  annot=True,
  annot_kws={'size':10},
)
plt.show()

In [None]:
# Final DBSCAN on the PCA-10 features with the parameters chosen from the grid.
dbscan_model = DBSCAN(eps=4.4, min_samples=4)
y_dbscan_pca10 = dbscan_model.fit_predict(df_marketing_campaign_num_scaled_pca10)

# silhouette_score raises ValueError when only one label is produced; guard
# it so the labels are still stored on the customer table in that case.
if len(np.unique(y_dbscan_pca10)) > 1:
  print(silhouette_score(df_marketing_campaign_num_scaled_pca10, y_dbscan_pca10))
else:
  print('Silhouette score undefined: DBSCAN produced a single label')
df_marketing_campaign['DBSCAN_pca10'] = y_dbscan_pca10

## PCA 15

In [None]:
# Distance from every point to its 2 nearest neighbours in the PCA-15 space.
# The queried points are the fitted points themselves.
nn_model = NearestNeighbors(n_neighbors=2)
nn_model.fit(df_marketing_campaign_num_scaled_pca15)
nn_fit = nn_model  # fit() returns the estimator itself
distances, indices = nn_model.kneighbors(df_marketing_campaign_num_scaled_pca15)

In [None]:
# k-distance curve used to pick DBSCAN's eps (look for the knee).
# Only column 1 is drawn: the query points are the fitted points, so column 0
# is each point's zero distance to itself and only adds a flat line.
distances = np.sort(distances, axis=0)
plt.figure(figsize=(10,10))
plt.plot(distances[:,1])
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to nearest neighbour')
plt.show()

In [None]:
# DBSCAN search grid for the PCA-15 features.
# np.arange with a 0.1 float step yields inexact values that end up as pivot
# and heatmap labels, so eps is rounded to one decimal.
eps_values = np.round(np.arange(4.5, 7, 0.1), 1)
min_samples = np.arange(2,10)

In [None]:
# Grid search over (eps, min_samples) for DBSCAN on the PCA-15 features.
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for eps, min_pts in dbscan_params:
  y_pred = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(df_marketing_campaign_num_scaled_pca15)
  distinct_labels = set(y_pred)
  # silhouette_score raises ValueError unless at least 2 distinct labels
  # exist; record NaN for such settings instead of aborting the whole sweep.
  if len(distinct_labels) > 1:
    sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled_pca15,y_pred))
  else:
    sil_scores.append(np.nan)
  # DBSCAN labels noise points -1; noise is not a cluster, so it is not counted.
  n_clusters.append(len(distinct_labels - {-1}))

In [None]:
# One row per (eps, min_samples) pair together with its sweep results.
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples']).assign(
  sil_scores=sil_scores,
  n_clusters=n_clusters,
)

In [None]:
# Reshape the sweep results into min_samples x eps grids for the heatmaps.
pivot_1 = df_param_tunning.pivot_table(values='sil_scores', index='min_samples', columns='eps')
pivot_2 = df_param_tunning.pivot_table(values='n_clusters', index='min_samples', columns='eps')

In [None]:
# Silhouette score (3 decimals) for each (min_samples, eps) combination.
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(
  pivot_1,
  ax=ax,
  cmap='coolwarm',
  annot=True,
  fmt='.3f',
  annot_kws={'size':10},
)
plt.show()

In [None]:
# Value of n_clusters for each (min_samples, eps) combination.
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(
  pivot_2,
  ax=ax,
  cmap='coolwarm',
  annot=True,
  annot_kws={'size':10},
)
plt.show()

In [None]:
# Final DBSCAN on the PCA-15 features with the parameters chosen from the grid.
dbscan_model = DBSCAN(eps=6.9, min_samples=5)
y_dbscan_pca15 = dbscan_model.fit_predict(df_marketing_campaign_num_scaled_pca15)

# silhouette_score raises ValueError when only one label is produced; guard
# it so the labels are still stored on the customer table in that case.
if len(np.unique(y_dbscan_pca15)) > 1:
  print(silhouette_score(df_marketing_campaign_num_scaled_pca15, y_dbscan_pca15))
else:
  print('Silhouette score undefined: DBSCAN produced a single label')
df_marketing_campaign['DBSCAN_pca15'] = y_dbscan_pca15

## Without PCA

In [None]:
# Distance from every point to its 2 nearest neighbours in the full scaled
# feature space. The queried points are the fitted points themselves.
nn_model = NearestNeighbors(n_neighbors=2)
nn_model.fit(df_marketing_campaign_num_scaled)
nn_fit = nn_model  # fit() returns the estimator itself
distances, indices = nn_model.kneighbors(df_marketing_campaign_num_scaled)

In [None]:
# k-distance curve used to pick DBSCAN's eps (look for the knee).
# Only column 1 is drawn: the query points are the fitted points, so column 0
# is each point's zero distance to itself and only adds a flat line.
distances = np.sort(distances, axis=0)
plt.figure(figsize=(10,10))
plt.plot(distances[:,1])
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to nearest neighbour')
plt.show()

In [None]:
# DBSCAN search grid for the full scaled feature set.
# np.arange with a 0.1 float step yields inexact values that end up as pivot
# and heatmap labels, so eps is rounded to one decimal.
eps_values = np.round(np.arange(2.5, 6, 0.1), 1)
min_samples = np.arange(2,10)

In [None]:
# Grid search over (eps, min_samples) for DBSCAN on the full scaled feature set.
dbscan_params = list(product(eps_values,min_samples))

sil_scores = []
n_clusters = []
for eps, min_pts in dbscan_params:
  y_pred = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(df_marketing_campaign_num_scaled)
  distinct_labels = set(y_pred)
  # silhouette_score raises ValueError unless at least 2 distinct labels
  # exist; record NaN for such settings instead of aborting the whole sweep.
  if len(distinct_labels) > 1:
    sil_scores.append(silhouette_score(df_marketing_campaign_num_scaled,y_pred))
  else:
    sil_scores.append(np.nan)
  # DBSCAN labels noise points -1; noise is not a cluster, so it is not counted.
  n_clusters.append(len(distinct_labels - {-1}))

In [None]:
# One row per (eps, min_samples) pair together with its sweep results.
df_param_tunning = pd.DataFrame(dbscan_params, columns=['eps','min_samples']).assign(
  sil_scores=sil_scores,
  n_clusters=n_clusters,
)

In [None]:
# Reshape the sweep results into min_samples x eps grids for the heatmaps.
pivot_1 = df_param_tunning.pivot_table(values='sil_scores', index='min_samples', columns='eps')
pivot_2 = df_param_tunning.pivot_table(values='n_clusters', index='min_samples', columns='eps')

In [None]:
# Silhouette score for each (min_samples, eps) combination (wide figure for the larger grid).
fig, ax = plt.subplots(figsize=(26,6))
sns.heatmap(
  pivot_1,
  ax=ax,
  cmap='coolwarm',
  annot=True,
  annot_kws={'size':10},
)
plt.show()

In [None]:
# Value of n_clusters for each (min_samples, eps) combination.
fig, ax = plt.subplots(figsize=(18,6))
sns.heatmap(
  pivot_2,
  ax=ax,
  cmap='coolwarm',
  annot=True,
  annot_kws={'size':10},
)
plt.show()

In [None]:
# Final DBSCAN on the full scaled feature set.
# NOTE(review): eps=8.6 lies outside the 2.5-6 range explored in the grid
# search for this feature set; confirm the value is intentional.
dbscan_model = DBSCAN(eps=8.6, min_samples=5)
y_dbscan = dbscan_model.fit_predict(df_marketing_campaign_num_scaled)

# silhouette_score raises ValueError when only one label is produced; guard
# it so the labels are still stored on the customer table in that case.
if len(np.unique(y_dbscan)) > 1:
  print(silhouette_score(df_marketing_campaign_num_scaled, y_dbscan))
else:
  print('Silhouette score undefined: DBSCAN produced a single label')
df_marketing_campaign['DBSCAN'] = y_dbscan

In [None]:
# Columns of the customer table, including the stored cluster-label columns.
df_marketing_campaign.columns

In [None]:
# Feature columns that were fed to the scaler.
df_marketing_campaign_num.columns

In [None]:
# Pairwise scatter plots of the listed features, coloured by the DBSCAN
# (PCA 10) label. The hue column must be part of the plotted frame.
pairplot_columns = [
  'Year_Birth', 'Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines',
  'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
  'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
  'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
  'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
  'AcceptedCmp2', 'Complain', 'Response', 'DBSCAN_pca10',
]
sns.pairplot(data=df_marketing_campaign[pairplot_columns], hue='DBSCAN_pca10', palette='coolwarm')