In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
source_folder='/content/drive/My Drive/transactions_dataset.csv'


In [None]:

# The transactions export is semicolon-delimited; read_csv already returns a DataFrame.
df = pd.read_csv(source_folder, sep=';')


In [None]:
df.head()

# Data Preparation

In [None]:
# Parse both date columns to datetime so they can be compared and subtracted later.
for date_col in ('date_order', 'date_invoice'):
    df[date_col] = pd.to_datetime(df[date_col])

# Time span covered by the orders in the dataset.
order_dates = df['date_order']
max_date = order_dates.max()
min_date = order_dates.min()

print("Maximum Date:", max_date)
print("Minimum Date:", min_date)

In [None]:
df.info()

In [None]:
# Identifiers are labels, not quantities: store them as strings.
id_columns = ['client_id', 'product_id', 'branch_id']
df[id_columns] = df[id_columns].astype(str)

In [None]:
df = df.sort_values(by='date_order')

In [None]:
df.tail(30)

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Share of missing values per column, expressed as a real percentage (0-100)
# so that the numbers match the "(%)" column label.
pd.DataFrame({"Missing values (%)": (df.isnull().mean() * 100).round(2)})

# RFM

In [None]:
# Monetary: total net sales per client, with client_id kept as a regular column.
rfm_m = df.groupby('client_id', as_index=False)['sales_net'].sum()
rfm_m.head()

In [None]:
# Frequency: number of rows with a non-null order date per client.
rfm_f = (
    df.groupby('client_id')['date_order']
    .count()
    .rename('Frequency')
    .reset_index()
)
rfm_f.head()

In [None]:
rfm = pd.merge(rfm_m, rfm_f, on='client_id', how='inner')
rfm.head()

In [None]:
# Reference date for recency: the most recent order in the dataset.
# Series.max() is vectorised and skips NaT, whereas the builtin max() iterates
# element by element in Python and gives order-dependent results when NaT is present.
max_date = df['date_order'].max()
max_date

In [None]:
df['Diff'] = max_date - df['date_order']
df.head()

In [None]:
rfm_p = df.groupby('client_id')['Diff'].min()
rfm_p = rfm_p.reset_index()
rfm_p.head()

In [None]:
rfm_p['Diff'] = rfm_p['Diff'].dt.days
rfm_p.head()

In [None]:
# Attach recency to the monetary/frequency table and give the columns their RFM names.
rfm = (
    rfm.merge(rfm_p, on='client_id', how='inner')
    .rename(columns={'sales_net': 'Amount', 'Diff': 'Recency'})
)
rfm.head()

### EDA RFM

In [None]:
attributes = ['Amount','Frequency','Recency']
plt.rcParams['figure.figsize'] = [10,8]
sns.boxplot(data = rfm[attributes], orient="v", palette="Set2" ,whis=1.5,saturation=1, width=0.7)
plt.title("Outliers Variable Distribution", fontsize = 14, fontweight = 'bold')
plt.ylabel("Range", fontweight = 'bold')
plt.xlabel("Attributes", fontweight = 'bold')

It can be seen that the Amount variable has huge outliers. This could lead us to think that these customers are B2B, in contrast with the B2C ones, who purchase lower amounts.

Order Size: B2B transactions typically involve larger order sizes compared to B2C. This is because businesses often purchase in bulk for operational needs, whereas individual consumers typically buy in smaller quantities for personal use.

Transaction Frequency: B2B clients may have more consistent ordering patterns, with contracts or agreements in place that lead to regular large purchases. B2C consumers might make purchases more sporadically and with much smaller amounts.


When building a churn model or conducting any kind of analysis, it is wise to segment the customer base into B2B and B2C. This allows for more accurate modeling and analysis, since behavior and purchasing patterns differ significantly between these two groups.

In [None]:
# Rescaling the attributes

from sklearn.preprocessing import StandardScaler

# K-means is distance based, so the three attributes are standardised
# before clustering.
rfm_df = rfm[['Amount', 'Frequency', 'Recency']]

scaler = StandardScaler()
scaler.fit(rfm_df)
rfm_df_scaled = scaler.transform(rfm_df)
rfm_df_scaled.shape

In [None]:
# Wrap the scaled array back into a labelled DataFrame.
rfm_df_scaled = pd.DataFrame(rfm_df_scaled, columns=['Amount', 'Frequency', 'Recency'])
rfm_df_scaled.head()


## Model K means for client segmentation

In [None]:

from sklearn.cluster import KMeans

In [None]:

# First exploratory fit with 4 clusters.
# random_state makes the centroid initialisation (and therefore the labels)
# reproducible across runs; n_init is set explicitly because its default
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, max_iter=50, n_init=10, random_state=42)
kmeans.fit(rfm_df_scaled)

In [None]:

# Elbow method: within-cluster sum of squared distances (inertia) for several k.
ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 10, 20, 30]
for num_clusters in range_n_clusters:
    # Fixed seed and explicit n_init so the curve is reproducible.
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, n_init=10, random_state=42)
    kmeans.fit(rfm_df_scaled)

    ssd.append(kmeans.inertia_)

# Plot against the real number of clusters: the candidate values are not evenly
# spaced, so plotting against the list position would distort the elbow.
plt.plot(range_n_clusters, ssd, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Sum of squared distances (inertia)')
plt.title('Elbow curve')
plt.show()

In [None]:
# Final model with k=2, to see whether it can differentiate between B2B and B2C.
# The seed is fixed because the cluster ids (0/1) are interpreted further down;
# without it the numbering could flip between runs.
kmeans = KMeans(n_clusters=2, max_iter=50, n_init=10, random_state=42)
kmeans.fit(rfm_df_scaled)

In [None]:
rfm['client_labels'] = kmeans.labels_
rfm.head()

In [None]:
rfm['client_labels'].value_counts()

In [None]:
# Box plot to visualize Cluster Id vs Amount

sns.boxplot(x='client_labels', y='Amount', data=rfm)

In [None]:
#Box plot to visualize Cluster Id vs Frequency

sns.boxplot(x='client_labels', y='Frequency', data=rfm)

In [None]:
# Box plot to visualize Cluster Id vs Recency

sns.boxplot(x='client_labels', y='Recency', data=rfm)

Based on the box plots, and assuming that Cluster 0 represents B2C customers and Cluster 1 represents B2B customers, the findings align with typical business patterns in which B2B customers make larger and more frequent purchases.

## RFM Scores Model

In [None]:
def get_rfm_scores(dataframe) -> pd.core.frame.DataFrame:
    """Return a copy of ``dataframe`` with quintile-based RFM score columns.

    Parameters:
    - dataframe: DataFrame with ``Recency``, ``Frequency`` and ``Amount`` columns.

    Returns:
    - DataFrame: the input columns plus ``recency_score`` (5 = lowest recency
      quintile), ``frequency_score`` and ``monetary_score`` (5 = highest
      quintile) and ``RFM_SCORE``, the recency score followed by the frequency
      score as a two-character string. The input frame is not modified.
    """
    low_to_high = [1, 2, 3, 4, 5]
    high_to_low = [5, 4, 3, 2, 1]

    # Frequency is binned on its rank; method="first" gives tied values
    # distinct ranks, so the five quantile edges are always unique.
    frequency_rank = dataframe["Frequency"].rank(method="first")

    scored = dataframe.assign(
        recency_score=pd.qcut(dataframe["Recency"], 5, labels=high_to_low),
        frequency_score=pd.qcut(frequency_rank, 5, labels=low_to_high),
        monetary_score=pd.qcut(dataframe["Amount"], 5, labels=low_to_high),
    )

    recency_digit = scored["recency_score"].astype(str)
    frequency_digit = scored["frequency_score"].astype(str)
    scored["RFM_SCORE"] = recency_digit + frequency_digit

    return scored


rfm = get_rfm_scores(rfm)


In [None]:
# Maps the two-character RFM_SCORE (recency digit followed by frequency digit)
# to a named customer segment. Keys are regular expressions: the first
# character class matches the recency score, the second the frequency score.
seg_map = {r'[1-2][1-2]': 'hibernating',
           r'[1-2][3-4]': 'at_Risk',
           r'[1-2]5': 'cant_loose',
           r'3[1-2]': 'about_to_sleep',
           r'33': 'need_attention',
           r'[3-4][4-5]': 'loyal_customers',
           r'41': 'promising',
           r'51': 'new_customers',
           r'[4-5][2-3]': 'potential_loyalists',
           r'5[4-5]': 'champions'}

# regex=True makes replace() treat the dictionary keys as patterns rather than literal values.
rfm['segment'] = rfm['RFM_SCORE'].replace(seg_map, regex = True)

rfm.head(20)

In [None]:
pip install squarify


### CUSTOMER SEGMENTATION MAP GRAPH

In [None]:
import squarify

# Number of customers per segment, largest first.
segments = rfm["segment"].value_counts().sort_values(ascending=False)

segment_colors = [
    "#AFB6B5",
    "#F0819A",
    "#926717",
    "#F0F081",
    "#81D5F0",
    "#C78BE5",
    "#748E80",
    "#FAAF3A",
    "#7B8FE4",
    "#86E8C0",
]

fig = plt.gcf()
ax = fig.add_subplot()
fig.set_size_inches(26, 10)
squarify.plot(
    sizes=segments.values,
    # Labels must come from the same Series as the sizes. Taking them from
    # seg_map.values() pairs each tile with the wrong name, because the sizes
    # are ordered by count while seg_map is ordered by definition.
    label=list(segments.index),
    # Use only as many colours as there are segments actually present.
    color=segment_colors[: len(segments)],
    ax=ax,
    pad=False,
    bar_kwargs={"alpha": 1},
    text_kwargs={"fontsize": 15},
)
plt.title("Customer Segmentation Map", fontsize=20)
plt.xlabel("Frequency", fontsize=18)
plt.ylabel("Recency", fontsize=18)
plt.show()

In [None]:
# Summary statistics of each RFM attribute per segment.
# The aggregations are passed as a list (not a set) so that the order of the
# resulting columns is deterministic.
rfm[['Recency','Amount','Frequency','segment']]\
.groupby('segment')\
.agg(['mean','std','max','min'])

In [None]:
plt.figure(figsize = (10, 5))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
sns.histplot(rfm.Recency, kde=True, stat="density")
# Red line: mean recency. Black line: median recency.
plt.axvline(rfm.Recency.mean(), c = 'red')
plt.axvline(rfm.Recency.median(), c = 'black')
plt.title('Distribution of Recency',fontweight='bold',fontsize=20)
plt.xlabel('Recency',fontsize=15,color='black')
plt.ylabel('value',fontsize=15,color='black')
plt.show()
print('Mean of recency: ', rfm.Recency.mean())
print('Median of recency: ', rfm.Recency.median())
print('Skewness of recency: ', rfm.Recency.skew())

The churn rate, also known as the rate of attrition or customer churn, is the rate at which customers stop doing business with an entity.
We used the recency column to frame the target variable. If a customer's recency is above the average recency, we consider that customer churned; all other customers are considered not churned.
We used the mean of recency as the threshold because recency is approximately normally (symmetrically) distributed.
We will then add the target variable to the main dataframe before fitting the classification algorithm.

Apart from this, we also take the client segments into account.

In [None]:
plt.figure(figsize = (18, 8))
ax = sns.countplot(data = rfm, x = 'segment', hue='segment')

# Write each bar's share of all customers just above the bar.
total = len(rfm.segment)
for bar in ax.patches:
    share_label = f'{100 * bar.get_height()/total:.1f}%'
    label_x = bar.get_x() + bar.get_width() / 2 - 0.17
    label_y = bar.get_y() + bar.get_height() * 1.005
    ax.annotate(share_label, (label_x, label_y), size = 14)

plt.title('Number of Customers by Segments', size = 16)
plt.xlabel('Segment', size = 14)
plt.ylabel('Count', size = 14)
plt.xticks(size = 10)
plt.yticks(size = 10)
plt.show()

In [None]:
plt.figure(figsize=(18, 8))
sns.scatterplot(
    data=rfm, x="Recency", y="Frequency", hue="segment", s=60
)
plt.title("Recency & Frequency by Segments", size=16)
plt.xlabel("Recency", size=12)
plt.ylabel("Frequency", size=12)
plt.xticks(size=10)
plt.yticks(size=10)
plt.legend(loc="best", fontsize=12, title="Segments", title_fontsize=14)
plt.show()

### RFM SEGMENT ANALYSIS

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 8))
fig.suptitle("RFM Segment Analysis", size=14)
feature_list = ["Recency", "Amount", "Frequency"]

# Upper y-limits that crop the extreme outliers of Amount and Frequency;
# Recency keeps its automatic range.
y_caps = {"Amount": 230000, "Frequency": 1200}

for panel, feature in zip(axes, feature_list):
    sns.boxplot(ax=panel, data=rfm, x="segment", y=feature, hue='segment')
    panel.set_xticklabels(panel.get_xticklabels(), rotation=60)
    if feature in y_caps:
        panel.set_ylim([0, y_caps[feature]])

plt.tight_layout()
plt.show()

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(16, 12))
fig.suptitle('RFM Segment Analysis', size = 14)
feature_list = ['Recency', 'Amount', 'Frequency']

# Upper x-limits that crop the long right tails of Amount and Frequency;
# Recency keeps its automatic range.
x_caps = {'Amount': 60000, 'Frequency': 400}

for panel, feature in zip(axes, feature_list):
    sns.histplot(ax = panel, data = rfm, hue = 'segment', x = feature)
    if feature in x_caps:
        panel.set_xlim([0, x_caps[feature]])

plt.tight_layout()
plt.show()

# Cohort Analysis

In [None]:
# In Cohort Analysis, we track groups of users over time who find some common patterns or behaviors.
import matplotlib.colors as mcolors
from operator import attrgetter
def CohortAnalysis(dataframe):
    """Plot a monthly cohort retention heatmap.

    Each client is assigned to the cohort of the month of their first order.
    For every cohort, the share of its clients that ordered again N months
    later is drawn as a heatmap, next to a column with the cohort sizes.

    Parameters:
    - dataframe: DataFrame with a ``client_id`` column and a datetime
      ``date_order`` column.

    Returns:
    - None; the figure is drawn with matplotlib/seaborn.
    """
    # One row per (client, order date) is enough: clients are counted with
    # nunique below, so duplicate rows only add work.
    orders = dataframe[["client_id", "date_order"]].drop_duplicates()
    orders["order_month"] = orders["date_order"].dt.to_period("M")
    first_order_date = orders.groupby("client_id")["date_order"].transform("min")
    orders["cohort"] = first_order_date.dt.to_period("M")

    # Distinct active clients per (cohort, order month).
    cohort_data = (
        orders.groupby(["cohort", "order_month"])
        .agg(n_customers=("client_id", "nunique"))
        .reset_index(drop=False)
    )
    # Whole months elapsed between the cohort month and the order month.
    month_offset = cohort_data.order_month - cohort_data.cohort
    cohort_data["period_number"] = month_offset.apply(attrgetter("n"))

    cohort_pivot = cohort_data.pivot_table(
        index="cohort", columns="period_number", values="n_customers"
    )
    # The first column (period 0) holds the number of clients in each cohort.
    cohort_size = cohort_pivot.iloc[:, 0]
    retention_matrix = cohort_pivot.divide(cohort_size, axis=0)

    with sns.axes_style("white"):
        fig, (size_ax, retention_ax) = plt.subplots(
            1, 2, figsize=(12, 8), sharey=True, gridspec_kw={"width_ratios": [1, 11]}
        )

        sns.heatmap(
            retention_matrix,
            mask=retention_matrix.isnull(),
            annot=True,
            cbar=False,
            fmt=".0%",
            cmap="inferno",
            ax=retention_ax,
        )
        retention_ax.set_title("Monthly Cohorts: User Retention", fontsize=14)
        retention_ax.set(xlabel="# of periods", ylabel="")

        # Cohort sizes are drawn as an all-white heatmap so only the numbers show.
        white_cmap = mcolors.ListedColormap(["white"])
        size_frame = pd.DataFrame(cohort_size).rename(columns={0: "cohort_size"})
        sns.heatmap(
            size_frame,
            annot=True,
            cbar=False,
            fmt="g",
            cmap=white_cmap,
            ax=size_ax,
        )
        fig.tight_layout()

CohortAnalysis(df)

# Customer LTV

In [None]:
pip install lifetimes


In [None]:
from plotly.tools import FigureFactory as ff
from lifetimes import GammaGammaFitter
from lifetimes.plotting import plot_history_alive
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes import BetaGeoFitter


In [None]:
# End of the observation window: the latest invoice date in the data.
analysis_date = df["date_invoice"].max()

# Per-client inputs for the BG/NBD and Gamma-Gamma models:
# - recency: days between the client's first and last invoice
# - T: client "age", i.e. days between the first invoice and the end of the
#   observation window. It must not be computed like recency: with
#   T == recency every client would look as if they had purchased on the last
#   observed day, and the model could not tell active from inactive clients.
# - frequency: number of distinct invoice dates
# - monetary: total net sales (converted to an average per purchase below)
# NOTE(review): lifetimes defines frequency as the number of *repeat* purchases
# (distinct purchase periods minus one); this cell uses the total number of
# distinct invoice dates. Confirm which definition is intended.
cltv_df = df.groupby("client_id").agg(
    recency=pd.NamedAgg(column='date_invoice', aggfunc=lambda date_invoice: (date_invoice.max() - date_invoice.min()).days),
    T=pd.NamedAgg(column='date_invoice', aggfunc=lambda date_invoice: (analysis_date - date_invoice.min()).days),
    frequency=pd.NamedAgg(column='date_invoice', aggfunc='nunique'),
    monetary=pd.NamedAgg(column='sales_net', aggfunc='sum')
)

# Monetary: average earning per purchase (total sales / frequency).
cltv_df["monetary"]  = cltv_df["monetary"] / cltv_df["frequency"]

# Keep only clients with frequency > 1. The explicit copy avoids assigning to
# a view of the unfiltered frame in the lines below (SettingWithCopyWarning).
cltv_df = cltv_df[cltv_df["frequency"] > 1].copy()

# Recency and T are in days; convert them to weeks.
cltv_df["recency"] = cltv_df["recency"] / 7

cltv_df["T"] = cltv_df["T"] / 7

In [None]:
cltv_df.head()

### BetaGeoFitter, BG-NBD Model.

In [None]:
bgf = BetaGeoFitter(penalizer_coef= 0.001)

# Preparation of the Model

bgf.fit(cltv_df["frequency"],
        cltv_df["recency"],
        cltv_df["T"])

Top expected purchasers over the next 3 months

In [None]:
cltv_df["expected_purc_3_month"] = bgf.predict(12,
                                                cltv_df["frequency"],
                                                cltv_df["recency"],
                                                cltv_df["T"])

cltv_df.sort_values("expected_purc_3_month", ascending = False)

### Gamma Gamma Model

In [None]:
# Filter out customers with non-positive monetary value, which are errors or returns to be able to execute the GammaGammaFitter
cltv_df = cltv_df[cltv_df["monetary"] > 0]

In [None]:
ggf = GammaGammaFitter(penalizer_coef= 0.01)

ggf.fit(cltv_df["frequency"], cltv_df["monetary"])

# It means, calculate the expected average profit.

cltv_df["expected_average_profit"] = ggf.conditional_expected_average_profit(cltv_df["frequency"], cltv_df["monetary"])

CLTV Calculation with BG/NBD & Gamma Gamma Model

In [None]:
# Combine the fitted BG/NBD model (expected purchases) with the fitted
# Gamma-Gamma model (expected value per purchase) into a CLV per client.
cltv = ggf.customer_lifetime_value(bgf,
                                   cltv_df["frequency"],
                                   cltv_df["recency"],
                                   cltv_df["T"],
                                   cltv_df["monetary"],
                                   time = 3, # 3 months
                                   freq = "W", # time unit of T (weeks: T was divided by 7 above)
                                   discount_rate= 0.01)

# cltv is indexed by client_id; turn the index into a column for the merge.
cltv = cltv.reset_index()

# Attach the CLV to the per-client model inputs and show the ten highest values.
cltv_final = cltv_df.merge(cltv, on = "client_id", how = "left")
cltv_final.sort_values(by = "clv", ascending = False).head(10)

In [None]:
# Lets add the segmentation to the dataset according to clv values of the customers.

# Split clients into CLV quartiles: "D" is the lowest quarter, "A" the highest.
cltv_final["segment"] = pd.qcut(cltv_final["clv"], 4, labels = ["D", "C", "B", "A"])

# The aggregations are passed as a list (not a set) so that the order of the
# resulting columns is deterministic.
cltv_final.groupby("segment").agg(["count", "sum", "mean"])

In [None]:
cltv_final_sort=cltv_final.sort_values(by = "clv", ascending = False).head(10)
cltv_final_sort

In [None]:
# Bar plot of the total (estimator=sum) expected purchases in 3 months, per CLV segment
plt.figure(figsize=(10, 6))
sns.barplot(x='segment', y='expected_purc_3_month', data=cltv_final, estimator=sum)
plt.title('Total Expected Purchases in 3 Months by Segment')
plt.show()

# AVERAGE PURCHASE FREQUENCY

In [None]:

# Order rows per client chronologically so that shift(1) below returns the
# client's previous row.
df = df.sort_values(['client_id', 'date_order'])


# Days elapsed between each row and the same client's previous row
# (NaN for a client's first row).
# NOTE(review): the difference is taken between consecutive rows. If the data
# holds several rows per order date (e.g. one per product), those rows add
# 0-day gaps and pull the averages down. Confirm whether rows should first be
# de-duplicated per (client_id, date_order).
df['PrevPurchaseDate'] = df.groupby('client_id')['date_order'].shift(1)
df['DaysBetweenPurchases'] = (df['date_order'] - df['PrevPurchaseDate']).dt.days


# Mean gap in days per client.
avg_purchase_frequency = df.groupby('client_id')['DaysBetweenPurchases'].mean().reset_index()


print(avg_purchase_frequency.describe())


import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the per-client average gap between purchases.
plt.figure(figsize=(10, 6))
sns.histplot(avg_purchase_frequency['DaysBetweenPurchases'], bins=30, kde=True)
plt.title('Distribution of Average Purchase Frequency (Days)')
plt.xlabel('Average Days Between Purchases')
plt.ylabel('Number of Customers')
plt.show()


In [None]:
# Despite the "recency" in their names, both values are statistics of the
# per-client average number of days between purchases, not of the RFM Recency.
median_recency = avg_purchase_frequency['DaysBetweenPurchases'].median()
percentile_75th_recency = avg_purchase_frequency['DaysBetweenPurchases'].quantile(0.75)
percentile_75th_recency

High variability: the standard deviation is high, meaning there is a wide spread in purchase frequency among customers, along with huge outliers.

We'll use the 75th percentile.


# Churn Column

# Split between B2C and B2B Customers

In [None]:

# Total spend at or above this amount marks a client as B2B.
amount_threshold = 10000

# Classify clients with a vectorised comparison instead of a row-wise apply:
# the labels are the same, computed in one pass over the column.
rfm['Client_Type'] = np.where(rfm['Amount'] >= amount_threshold, 'B2B', 'B2C')

rfm


In [None]:
# Size of each group.
counts=rfm.groupby("Client_Type").size()
print(counts)
print(counts[0]/len(rfm))
print(counts[1]/len(rfm))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


sns.kdeplot(data=rfm, x="Amount")
plt.xlabel('Amount')
plt.ylabel('Density')
plt.title('Distribution of Amount')
plt.xlim([0, 500000])
plt.show()


In [None]:


sns.histplot(data=rfm, x="Amount")
plt.xlabel('Amount')
plt.ylabel('Count')
plt.title('Histogram of Amount')
plt.xlim([0, 500000])
plt.show()


In [None]:
rfm.head(20)

In [None]:
rfm[((rfm["Client_Type"]=="B2B"))].head(30) # Frequency is fairly high, which is why k-means classified these clients as B2C rather than B2B; around 100 clients are classified differently

In [None]:
# Splitting the DataFrame based on 'client_labels'
rfm_B2C = rfm[rfm['Client_Type'] == "B2C"].copy()
rfm_B2B = rfm[rfm['Client_Type'] == "B2B"].copy()


# CHURN METHOD

In [None]:
def define_churn2(row, threshold_churn):
    """
    Flag a customer as churned when their recency exceeds a threshold.

    Parameters:
    - row: A row from the RFM DataFrame; only its 'Recency' value is read.
    - threshold_churn: Recency value above which the customer counts as churned.

    Returns:
    - int: 1 if the customer is considered churned, 0 otherwise.
    """
    # Strictly greater: a recency equal to the threshold is not churned.
    if row['Recency'] > threshold_churn:
        return 1
    return 0

### CHURN B2B 3 MONTHS + PERCENTILE

In [None]:
# B2B churn threshold: 75th percentile of the average days between purchases
# plus 90 days (3 months).
threshold_churn = percentile_75th_recency + 90

# Flag each B2B client as churned (1) or not (0).
rfm_B2B['Churn'] = rfm_B2B.apply(define_churn2, args=(threshold_churn,), axis=1)

# Churn rate: the mean of a 0/1 flag is the share of churned clients.
# This yields a single number, and also works when no client is churned,
# where value_counts()[1] would raise a KeyError.
rfm_B2B["Churn"].mean()

### CHURN B2C 12MONTHS + percentile

In [None]:
# B2C churn threshold: 75th percentile of the average days between purchases
# plus 365 days (12 months).
threshold_churn2 = percentile_75th_recency + 365
rfm_B2C['Churn'] = rfm_B2C.apply(define_churn2, args=(threshold_churn2,), axis=1)

# Churn rate: the mean of a 0/1 flag is the share of churned clients.
# This yields a single number, and also works when no client is churned,
# where value_counts()[1] would raise a KeyError.
rfm_B2C["Churn"].mean()

In [None]:
# 1 if the customer is considered churned, 0 otherwise

In [None]:
rfm_B2B["Churn"].value_counts(), rfm_B2C["Churn"].value_counts()

## New columns from the transaction dataset to create the churn prediction model

### Channel data

In [None]:
df = pd.get_dummies(df, columns=["order_channel"])

In [None]:

# Dummy columns created by get_dummies for order_channel. They are collected
# from the frame itself instead of a hard-coded list, so a channel that is
# missing from the data (or a new one) does not cause a KeyError.
order_channel_columns = [col for col in df.columns if str(col).startswith('order_channel_')]

# Number of rows per client for each order channel.
channel_counts_by_client = df.groupby('client_id')[order_channel_columns].sum()

channel_counts_by_client


In [None]:
B2C_df = rfm_B2C.merge(channel_counts_by_client, on='client_id', how='left')

In [None]:
B2B_df = rfm_B2B.merge(channel_counts_by_client, on='client_id', how='left')

In [None]:
B2C_df

In [None]:
# Intermediate scoring columns that are removed from both modelling tables.
score_columns = ["recency_score", "frequency_score", "monetary_score", "RFM_SCORE", "Client_Type"]
B2C_df = B2C_df.drop(columns=score_columns)
B2B_df = B2B_df.drop(columns=score_columns)

In [None]:
B2C_df

### Branch data

In [None]:
# Number of distinct branches each client has ordered from.
branch_counts = (
    df.groupby('client_id')['branch_id']
    .nunique()
    .reset_index(name='branch_count')
)

B2B_df = B2B_df.merge(branch_counts, on='client_id', how='left')
B2C_df = B2C_df.merge(branch_counts, on='client_id', how='left')

In [None]:
B2C_df

In [None]:
# Rows per (client, branch), ordered so that each client's most used branch
# comes first.
branch_usage = (
    df.groupby(['client_id', 'branch_id'])
    .size()
    .reset_index(name='usage_count')
    .sort_values(['client_id', 'usage_count'], ascending=[True, False])
)

# Keep the first (most used) branch per client. The steps are chained so that
# every one returns a new frame; the previous in-place rename on a column
# slice could trigger SettingWithCopyWarning.
most_used_branches = (
    branch_usage
    .drop_duplicates(subset='client_id', keep='first')
    .loc[:, ['client_id', 'branch_id']]
    .rename(columns={'branch_id': 'most_used_branch'})
)

B2B_df = B2B_df.merge(most_used_branches, on='client_id', how='left')
B2C_df = B2C_df.merge(most_used_branches, on='client_id', how='left')

In [None]:
B2C_df

In [None]:
B2B_df

## Check null values

In [None]:
# True for every column that contains at least one missing value.
# Indexing the frame with its own isna() mask keeps only the NaN cells, so
# .any() on that result was False for every column, whatever the data.
B2B_df.isna().any()


In [None]:
B2C_df.isnull().sum()

In [None]:
B2C_df.info()

In [None]:
B2C_df['client_id'] = B2C_df['client_id'].astype(str)
B2C_df['most_used_branch'] = B2C_df['most_used_branch'].astype(str)

In [None]:
B2C_df= B2C_df.drop(["Recency"],axis=1)
B2B_df= B2B_df.drop(["Recency"],axis=1)

# Save the datasets in csv

In [None]:
# Save the DataFrame to a CSV file in Google Drive
B2B_df.to_csv('/content/drive/My Drive/B2B_df.csv', index=False)


In [None]:
B2C_df.to_csv('/content/drive/My Drive/B2C_df.csv', index=False)
