In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from matplotlib import colors
import seaborn as sns

In [None]:
# Read the tab-separated campaign export into a DataFrame.
# NOTE(review): the leading backslash in the path looks environment-specific — confirm it resolves where this runs.
data = pd.read_csv(r"\marketing_campaign.csv", sep="\t")

# Show the loaded frame as the cell output.
data

In [None]:
# Print the number of missing values found in each column of the raw frame.
for col, nulls in data.isnull().sum().items():
    print(f"{col} -> {nulls}")

In [None]:
# Work on a copy so `data` stays untouched, and discard every row that has a missing value.
copy_data = data.copy().dropna()

In [None]:
# Re-run the per-column missing-value count on the cleaned frame.
for col, nulls in copy_data.isnull().sum().items():
    print(f"{col} -> {nulls}")

In [None]:
# Compare row counts of the raw and the null-free frames.
rows_before, rows_after = len(data), len(copy_data)
print(f"lenght of data before removing nulls: {rows_before}")
print(f"lenght of data after removing nulls: {rows_after}")

In [None]:
# Print how many distinct values each column of the cleaned frame holds.
for col, uniques in copy_data.nunique().items():
    print(f"{col} -> {uniques}")

In [None]:
# List the distinct education labels before they are regrouped.
copy_data.loc[:, 'Education'].unique()

In [None]:
# Regroup education labels; 'Master' and '2n Cycle' both become 'Masters'.
# Any label missing from this table would be mapped to NaN by `.map`.
education_groups = {
    'Graduation': 'Graduate',
    'PhD': 'PhD',
    'Master': 'Masters',
    'Basic': 'High_School',
    '2n Cycle': 'Masters',
}
copy_data['Education'] = copy_data['Education'].map(education_groups)

In [None]:
# List the distinct marital-status labels before they are regrouped.
copy_data.loc[:, 'Marital_Status'].unique()

In [None]:
# Collapse marital status into two groups: 'Together' and 'Married' become
# 'Married'; every other listed label becomes 'Single'.
marital_groups = {
    'Single': 'Single',
    'Together': 'Married',
    'Married': 'Married',
    'Divorced': 'Single',
    'Widow': 'Single',
    'Alone': 'Single',
    'Absurd': 'Single',
    'YOLO': 'Single',
}
copy_data['Marital_Status'] = copy_data['Marital_Status'].map(marital_groups)

In [None]:
# Print the column dtypes and non-null counts of the cleaned frame.
copy_data.info()

In [None]:
# Parse the day-month-year strings in Dt_Customer into datetimes.
enrollment_raw = copy_data['Dt_Customer']
copy_data['Dt_Customer'] = pd.to_datetime(enrollment_raw, format="%d-%m-%Y")

In [None]:
# Reduce every enrollment timestamp to a plain calendar date.
dates = [stamp.date() for stamp in copy_data['Dt_Customer']]

# Report the range covered by the enrollment dates.
print(f"the oldest recorded customer enrolled: {min(dates)}")
print(f"the newest recorded customer enrolled: {max(dates)}")


In [None]:
# Tenure in days, measured back from the most recent enrollment in the data.
# The reference date is computed once instead of on every iteration, and the
# subtraction is ordered so that tenure is non-negative (the newest customer
# gets 0, older customers get larger values).
latest_enrollment = max(dates)
customer_for = [(latest_enrollment - enrolled).days for enrolled in dates]

copy_data['Customer_For'] = customer_for
copy_data['Customer_For'] = pd.to_numeric(copy_data['Customer_For'], errors="coerce")

In [None]:
# Total spend per customer: sum of the six Mnt* product columns. The values are
# read from the cleaned frame itself rather than from the raw `data` frame, so
# the result does not depend on implicit index alignment between two frames.
spend_columns = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]
copy_data['Spent'] = copy_data[spend_columns].sum(axis=1)

# Children of any age living in the household.
copy_data['Total_kids'] = copy_data['Kidhome'] + copy_data['Teenhome']

# Household size: 1 or 2 adults (from the regrouped marital status) plus the kids.
household_adults = copy_data['Marital_Status'].map({'Single': 1, 'Married': 2})
copy_data['family_size'] = household_adults + copy_data['Total_kids']

# 1 when the household has at least one child, otherwise 0.
copy_data['is_parent'] = np.where(copy_data['Total_kids'] > 0, 1, 0)

# Shorten the product-spend column names.
copy_data = copy_data.rename(
    columns={
        "MntWines": "Wines",
        "MntFruits": "Fruits",
        "MntMeatProducts": "Meat",
        "MntFishProducts": "Fish",
        "MntSweetProducts": "Sweets",
        "MntGoldProds": "Gold",
    }
)

In [None]:
# Age is measured against a fixed reference year of 2021.
reference_year = 2021
Age = reference_year - copy_data['Year_Birth']
copy_data['Age'] = Age

In [None]:
# Remove the listed columns from the working frame.
to_drop = ["Dt_Customer", "Z_CostContact", "Z_Revenue", "Year_Birth", "ID"]

copy_data = copy_data.drop(columns=to_drop)

In [None]:
# Show the working frame after feature engineering and column removal.
copy_data

In [None]:
# Summary statistics for the numeric columns of the working frame.
copy_data.describe()

In [None]:
# Pairwise relationships between selected features, coloured by parenthood.
to_plot = ["Income", "Recency", "Customer_For", "Age", "Spent", "is_parent"]
sns.set(rc={"axes.facecolor": "#FFF9ED", "figure.facecolor": "#FFF9ED"})

# `pairplot` is a figure-level function and builds its own figure, so no
# `plt.figure()` call precedes it (that would only emit an extra empty figure).
sns.pairplot(copy_data[to_plot], hue="is_parent", palette=["#682F2F", "#F3AB60"])

plt.show()

In [None]:
# Limit extreme values in the listed columns by clipping each one to its own
# 1st and 98th percentiles.
# NOTE(review): the bounds are asymmetric (1 / 98) — confirm this is intended.
to_remove_outs = ['Income', 'Age']

for column in to_remove_outs:
    lower_bound = np.percentile(copy_data[column], 1)
    upper_bound = np.percentile(copy_data[column], 98)
    # The clip must run inside the loop; otherwise only the last column is clipped.
    copy_data[column] = copy_data[column].clip(lower_bound, upper_bound)

In [None]:
# Correlation heatmap over the numeric columns only.
fig, ax = plt.subplots(figsize=(25, 10))

numeric_part = copy_data.select_dtypes(include=np.number)
corr_mat = numeric_part.corr()
sns.heatmap(corr_mat, cmap='RdYlGn', annot=True, center=0, ax=ax)

plt.show()

In [None]:
# Names of the non-numeric columns, which are one-hot encoded later.
non_numeric_part = copy_data.select_dtypes(exclude=np.number)
cat_columns = non_numeric_part.columns
print(cat_columns)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the non-numeric columns.
encoder = OneHotEncoder(handle_unknown='ignore')
encoded_data = encoder.fit_transform(copy_data[cat_columns])

# Reuse copy_data's index: copy_data keeps its original row labels after
# dropna(), so a fresh 0..n-1 index here would make the axis=1 concat below
# pair encoded rows with the wrong customers and leave NaN-filled rows behind.
one_hot_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(cat_columns),
    index=copy_data.index,
)

# Replace the original categorical columns with their encoded counterparts.
dropped_data = copy_data.drop(columns=['Education', 'Marital_Status'])
final_one_hot = pd.concat([dropped_data, one_hot_df], axis=1)
final_one_hot = final_one_hot.dropna()

In [None]:
# Show the frame that combines the numeric columns with the one-hot columns.
final_one_hot

In [None]:
# Names of the generated one-hot columns.
one_hot_df.columns.tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns excluded from standardisation: the campaign/complaint/response flags
# and the one-hot indicator columns.
cols_not_to_use = [
    'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2',
    'Complain', 'Response',
    'Education_Graduate',
    'Education_High_School',
    'Education_Masters',
    'Education_PhD',
    'Marital_Status_Married',
    'Marital_Status_Single',
]

scaling_data = final_one_hot.drop(columns=cols_not_to_use)
scaler = StandardScaler()

# fit_transform returns a bare array; carry over both the column names and the
# row index so later label-based joins still line up with the source rows.
scaled_data = pd.DataFrame(
    scaler.fit_transform(scaling_data),
    columns=scaling_data.columns,
    index=scaling_data.index,
)

scaled_data

In [None]:
# Re-attach the one-hot columns to the scaled features. scaled_data was built
# row-for-row from final_one_hot, so the indicator columns are taken from
# final_one_hot and joined by position; joining by label against a frame with a
# different index would mispair rows and leave NaNs for dropna() to discard.
one_hot_part = final_one_hot[one_hot_df.columns].reset_index(drop=True)
final_scaled_encoded_Data = pd.concat(
    [scaled_data.reset_index(drop=True), one_hot_part],
    axis=1,
)

final_scaled_encoded_Data = final_scaled_encoded_Data.dropna()

In [None]:
# Names of the columns that feed the PCA step.
final_scaled_encoded_Data.columns.tolist()

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA to inspect how the variance spreads over the components.
pca = PCA()
pca.fit(final_scaled_encoded_Data)

explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

# Cumulative explained variance against the number of components kept.
component_counts = range(1, len(explained_variance) + 1)
plt.figure(figsize=(8, 6))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--', color='b')
plt.title('Cumulative Explained Variance by PCA Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

# Smallest component count whose cumulative variance reaches 95%.
n_components_95 = next(
    count
    for count, cum_var in enumerate(cumulative_variance, start=1)
    if cum_var >= 0.95
)
print(f'Number of components explaining 95% variance: {n_components_95}')


In [None]:
# Project the scaled and encoded features onto three principal components.
component_names = ['col1', 'col2', 'col3']
pca = PCA(n_components=len(component_names))
reduced_data = pca.fit_transform(final_scaled_encoded_Data)

pca_df = pd.DataFrame(reduced_data, columns=component_names)

# Per-component summary statistics, one row per component.
pca_df.describe().transpose()


In [None]:
# Keep the three component series around; later plotting cells reuse x, y and z.
x, y, z = (pca_df[name] for name in ('col1', 'col2', 'col3'))

# Unlabelled 3D view of the reduced data.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(x, y, z, c='maroon', marker='o', alpha=0.5)
ax.set_title("A 3D Projection Of Data In The Reduced Dimension")

plt.show()

In [None]:
from sklearn.cluster import KMeans

# Record the KMeans inertia for k = 1..10 to look for an elbow.
k_range = range(1, 11)
inertia = []
for k in k_range:
    clusters = KMeans(n_clusters=k, random_state=42, n_init=10).fit(pca_df)
    inertia.append(clusters.inertia_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_range, inertia, marker='o', linestyle='--')
ax.set_xlabel("clusters k")
ax.set_ylabel("inertia")

plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Four-cluster agglomerative clustering on the reduced data.
ac = AgglomerativeClustering(n_clusters=4)
labels = ac.fit(pca_df).labels_

# Store the labels on both the reduced frame and the working frame; the
# assignment is positional, so both frames must have one row per label.
pca_df['clusters'] = labels
copy_data['clusters'] = labels

In [None]:
# 3D scatter of the reduced data coloured by agglomerative cluster label.
# `cmap` is reused by a later plotting cell.
cmap = colors.ListedColormap(
    ["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"]
)
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=pca_df['clusters'], cmap=cmap, marker='o')

plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Build the Ward linkage from the three PCA components only. By this point
# pca_df also carries the 'clusters' label column; including it would let the
# previously assigned labels influence the distances.
linked = linkage(pca_df[['col1', 'col2', 'col3']], method='ward')

plt.figure(figsize=(10, 6))
dendrogram(linked)
plt.title("Dendrogram for Agglomerative Clustering")
plt.xlabel("Data Points")
plt.ylabel("Distance")
plt.show()

In [None]:
# Four-cluster KMeans fitted on the three PCA components only. pca_df already
# holds the agglomerative 'clusters' column at this point, and feeding that
# label in as a feature would bias the KMeans result towards it.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
kmean_cluster = kmeans.fit_predict(pca_df[['col1', 'col2', 'col3']])

# Store the labels on both the reduced frame and the working frame (positional).
pca_df['Kmeans_Cluster'] = kmean_cluster
copy_data['kmeans_Clusters'] = kmean_cluster

In [None]:
# 3D scatter of the reduced data coloured by KMeans cluster label.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
kmeans_colours = pca_df['Kmeans_Cluster']
ax.scatter(x, y, z, c=kmeans_colours, cmap=cmap, marker='o')

plt.show()

In [None]:
# Four-colour palette shared by the cluster plots in the following cells.
pal = ["#682F2F", "#B9C0C9", "#9F8A78", "#F3AB60"]

# Number of customers assigned to each KMeans cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x=copy_data['kmeans_Clusters'], palette=pal, ax=ax)
ax.set_xlabel("cluster")
ax.set_ylabel("count")
ax.set_title("cluster dirtribution count")

plt.show()

In [None]:
# Show the working frame, which now includes the cluster label columns.
copy_data

In [None]:
# Income against total spend, coloured by KMeans cluster.
ax = sns.scatterplot(data=copy_data, x='Spent', y='Income', hue='kmeans_Clusters', palette=pal)
ax.set_xlabel("spent")
ax.set_ylabel("income")
ax.set_ylim(0, 160000)
ax.set_title("income vs spent")

plt.show()

In [None]:
# Distribution of total spend for non-parents vs parents, split by KMeans cluster.
sns.boxenplot(x='is_parent', y='Spent', hue='kmeans_Clusters', data=copy_data, palette=pal)
plt.xlabel("parent")
plt.ylabel("spend")
# The y axis shows Spent, so the title names spend rather than income.
plt.title("spend vs family")

plt.show()

In [None]:
# Distribution of income for non-parents vs parents, split by KMeans cluster.
ax = sns.boxenplot(data=copy_data, x='is_parent', y='Income', hue='kmeans_Clusters', palette=pal)
ax.set_xlabel("parent")
ax.set_ylabel("income")
ax.set_title("income vs family")

plt.show()

In [None]:
# Number of the five campaigns (AcceptedCmp1..5) each customer accepted.
campaign_columns = [f"AcceptedCmp{number}" for number in range(1, 6)]
copy_data['total_promos'] = copy_data[campaign_columns].sum(axis=1)

# Accepted-campaign counts per KMeans cluster.
sns.countplot(data=copy_data, x='total_promos', hue='kmeans_Clusters', palette=pal)

In [None]:
# Distribution of NumDealsPurchases within each KMeans cluster.
sns.boxenplot(data=copy_data, x='kmeans_Clusters', y="NumDealsPurchases", palette=pal)

In [None]:
# One joint plot per listed column against total spend, coloured by KMeans cluster.
cols = ["NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases", "NumWebVisitsMonth"]
for channel in cols:
    # Index with the loop variable; a leftover global from an earlier cell
    # would draw the same column on every iteration.
    sns.jointplot(
        x=copy_data[channel],
        y=copy_data['Spent'],
        hue=copy_data['kmeans_Clusters'],
        palette=pal,
    )

In [None]:
# Column names currently present on the working frame.
copy_data.columns.tolist()

In [None]:
# KDE joint plot of each listed feature against total spend, coloured by KMeans cluster.
to_check = [
    'Kidhome',
    'Teenhome',
    'Customer_For',
    'Total_kids',
    'family_size',
    'is_parent',
    'Age',
]

for feature in to_check:
    sns.jointplot(
        x=copy_data[feature],
        y=copy_data['Spent'],
        hue=copy_data['kmeans_Clusters'],
        kind='kde',
        palette=pal,
    )