# Customer personality analysis

In [172]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from datetime import datetime
import warnings
import sys
import os

In [173]:
warnings.filterwarnings('ignore')

In [174]:
pd.set_option('display.max_columns',None)
data = pd.read_csv("marketing_campaign.csv", sep='\t')
data.dropna(inplace=True)
display(data)
print(f'The size of Dataset is {data.shape}')

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,13-06-2013,46,709,43,182,42,118,247,2,9,3,4,5,0,0,0,0,0,0,3,11,0
2236,4001,1946,PhD,Together,64014.0,2,1,10-06-2014,56,406,0,30,0,0,8,7,8,2,5,7,0,0,0,1,0,0,3,11,0
2237,7270,1981,Graduation,Divorced,56981.0,0,0,25-01-2014,91,908,48,217,32,12,24,1,2,3,13,6,0,1,0,0,0,0,3,11,0
2238,8235,1956,Master,Together,69245.0,0,1,24-01-2014,8,428,30,214,80,30,61,2,6,5,10,3,0,0,0,0,0,0,3,11,0


The size of Dataset is (2216, 29)


## Features

### People
 - ID: Customer's unique identifier
 - Year_Birth: Customer's birth year
 - Education: Customer's education level
 - Marital_Status: Customer's marital status
 - Income: Customer's yearly household income
 - Kidhome: Number of children in customer's household
 - Teenhome: Number of teenagers in customer's household
 - Dt_Customer: Date of customer's enrollment with the company
 - Recency: Number of days since customer's last purchase
 - Complain: 1 if the customer complained in the last 2 years, 0 otherwise

### Products
 - MntWines: Amount spent on wine in last 2 years
 - MntFruits: Amount spent on fruits in last 2 years
 - MntMeatProducts: Amount spent on meat in last 2 years
 - MntFishProducts: Amount spent on fish in last 2 years
 - MntSweetProducts: Amount spent on sweets in last 2 years
 - MntGoldProds: Amount spent on gold in last 2 years

### Promotion
 - NumDealsPurchases: Number of purchases made with a discount
 - AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise
 - AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise
 - AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise
 - AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise
 - AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise
 - Response: 1 if customer accepted the offer in the last campaign, 0 otherwise

### Place
 - NumWebPurchases: Number of purchases made through the company’s website
 - NumCatalogPurchases: Number of purchases made using a catalogue
 - NumStorePurchases: Number of purchases made directly in stores
 - NumWebVisitsMonth: Number of visits to company’s website in the last month

In [175]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2216 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2216 non-null   int64  
 1   Year_Birth           2216 non-null   int64  
 2   Education            2216 non-null   object 
 3   Marital_Status       2216 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2216 non-null   int64  
 6   Teenhome             2216 non-null   int64  
 7   Dt_Customer          2216 non-null   object 
 8   Recency              2216 non-null   int64  
 9   MntWines             2216 non-null   int64  
 10  MntFruits            2216 non-null   int64  
 11  MntMeatProducts      2216 non-null   int64  
 12  MntFishProducts      2216 non-null   int64  
 13  MntSweetProducts     2216 non-null   int64  
 14  MntGoldProds         2216 non-null   int64  
 15  NumDealsPurchases    2216 non-null   i

## Feature Engineering

In [176]:
def reduce_memory(df: pd.DataFrame):

    start_time = df.memory_usage().sum()/1024**2
    print(f'Memory Usage(Before): {start_time} MB')

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != 'object':
            col_min = df[col].min()
            col_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if np.iinfo(np.int8).min > col_min  and np.iinfo(np.int8) < col_max:
                    df[col] = df[col].astype(np.int8)
                elif np.iinfo(np.int16).min > col_min and np.iinfo(np.int16) < col_max:
                    df[col] = df[col].astype(np.int16)
                elif np.iinfo(np.int32).min > col_min and np.iinfo(np.int32) < col_max:
                    df[col] = df[col].astype(np.int32)
                elif np.iinfo(np.int64).min > col_min and np.iinfo(np.int64) < col_max:
                    df[col] = df[col].astype(np.int64)
            else:
                if np.finfo(np.float32).min > col_min and np.finfo(np.float32).min < col_max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_time = df.memory_usage().sum()/1024**2
    print(f'Memory Usage(After): {end_time} MB')
    print(f'Decreased by {100*(start_time-end_time)/start_time}')

    return df

In [177]:
data = reduce_memory(data)

Memory Usage(Before): 0.5072021484375 MB
Memory Usage(After): 0.48632049560546875 MB
Decreased by 4.117027677496991


In [178]:
data['Age'] = datetime.now().year - data['Year_Birth']

In [179]:
def years_rel():
    res_year = pd.to_datetime(data['Dt_Customer'], format='%d-%m-%Y').apply(lambda x: x.year)
    cur_year = datetime.now().year
    return cur_year - res_year

data['Years_since_enrollment'] = years_rel()

In [180]:
data["Education"] = data["Education"].replace({"Basic": 0, "Graduation": 1, "2n Cycle": 2, "Master": 2, "PhD": 3})

In [181]:
data['Marital_Status'] = data["Marital_Status"].replace({"Married":"Partner", "Together":"Partner", "Absurd":"Alone",
                                                         "Widow":"Alone", "YOLO":"Alone", "Divorced":"Alone", "Single":"Alone"})

In [182]:
step_2 = data['Marital_Status'].replace({'Alone': 1, 'Partner': 2})

In [183]:
data["Family_Size"] = step_2 + data['Kidhome'] + data['Teenhome']

In [184]:
data = pd.concat([data, pd.get_dummies(data['Marital_Status'])], axis=1)

In [185]:
mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
data['Sum_Mnt'] = data[mnt_cols].sum(axis=1)

In [186]:
accepted_cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
                     'Response']
data['Num_Accepted_Cmp'] = data[accepted_cmp_cols].sum(axis=1)

In [187]:
total_purchases = ['NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
data['Num_Total_Purchases'] = data[total_purchases].sum(axis=1)

In [188]:
def remove_outliers(df: pd.DataFrame, col: str) -> pd.Series:
    q3,q1 = np.nanpercentile(df[col],[73,25])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df[((df[col] > lower_bound) & (df[col] < upper_bound))]
    return df

In [189]:
data = remove_outliers(data,'Age')
data = remove_outliers(data, 'Income')

In [190]:
data.drop(['Year_Birth', 'ID', 'Z_CostContact', 'Z_Revenue', 'Dt_Customer', 'Marital_Status'], axis=1, inplace=True)

## EDA

In [191]:
to_corr = ['Age', 'Income', 'Kidhome', 'Teenhome', 'Recency', 'Complain',
          'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
          'MntGoldProds', 'NumDealsPurchases', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
          'AcceptedCmp4', 'AcceptedCmp5', 'Response', 'NumWebPurchases', 'NumCatalogPurchases',
          'NumStorePurchases', 'NumWebVisitsMonth', 'Years_Since_Registration', 'Family_Size',
           'Sum_Mnt', 'Num_Accepted_Cmp', 'Num_Total_Purchases', 'Alone', 'Partner']

cmap = sns.diverging_palette(220, 10, as_cmap=True)
matrix = np.triu(data[to_corr].corr())
plt.figure(figsize=(25, 14))
plt.title('Correlation matrix', fontsize=18)
sns.heatmap(data[to_corr].corr(), annot=True,
            fmt='.1f', vmin=-0.4, center=0, cmap=cmap, mask=matrix)

plt.show()

NameError: name 'sns' is not defined

### Data Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

def scaling_func(df: pd.DataFrame) -> None:
    mms = MinMaxScaler()
    return pd.DataFrame(data=mms.fit_transform(df), columns=df.columns)

data_scaled = scaling_func(data)
data_scaled.index = data.index

### Dimentionality Reduction

In [None]:
from sklearn.decomposition import PCA

def dim_reduction(data: pd.DataFrame) -> pd.DataFrame:
    n_components = 8
    pca = PCA(n_components=n_components, random_state=42)
    data = pd.DataFrame(pca.fit_transform(data), columns =([f'PC{i}' for i in range(1, n_components + 1)]))
    return data

data_transformed = dim_reduction(data_scaled)

data_transformed.index =  data_scaled.index

## Cluster Analysis

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

best_model = AgglomerativeClustering(n_clusters=3)
labels = pd.DataFrame(best_model.fit_predict(data_transformed), columns=['Clusters'], index=data.index)
data = pd.concat([data, labels], axis=1)

In [None]:
data

## Conclusion

As a result of cluster analysis, we received three groups of buyers (clusters):
### 0 cluster:
 - Middle income people (average income equals 50000)
 - Average age is 52 years
 - Have an education (Graduation, 2n Cycle, Master, PhD)
 - People without family, people with families with and without children
 - Quite often buy wines, but they also often buy meat
 - Most often make purchases on the web
 - The average number of purchases is 13
### 1 cluster:
 - High income people (average income equals 70000)
 - Average age is 55 years
 - Have an education (Graduation, 2n Cycle, Master, PhD)
 - Have a family with children (Teenhome)
 - Quite often buy wines, but they also often buy meat
 - Most often make purchases in the stores themselves
 - Most often make purchases (compared to other clusters)
### 2 cluster:
 - Average income equals 38000
 - Average age is 49 years
 - Have an education (Graduation, 2n Cycle, Master, PhD)
 - People with families with and without children
 - A low number of purchases and, accordingly, spend little money on purchases