<a href="https://colab.research.google.com/github/xquynhtrinh/STA_141C_Final_Project/blob/main/Customer_Segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Customer Segmentation Goals

- Evaluate which level each customer falls into (e.g., Loyalist, High Risk, Lost, etc.)
- Examine the Recency, Frequency, and Monetary (RFM) values for Champions

In [70]:
!pip install plotly



In [71]:
!pip install nbformat --upgrade



In [72]:
# libraries
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors

In [73]:
# Mount Google Drive at /content/drive; the CSV files this notebook reads and
# writes live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [74]:
# Load the cleaned per-customer RFM table from the mounted Drive folder
rfm_csv_path = "/content/drive/MyDrive/Colab Notebooks/STA 141C/rfm_cleaned_data.csv"
data = pd.read_csv(rfm_csv_path)
data.head()

Unnamed: 0,Customer ID,Recency,Tenure,Frequency,Monetary,Variety,AOV,Next_90_Days_Purchase
0,12346.0,235,635,12,77556.46,27,6463.038333,0
1,12347.0,39,313,6,4114.18,107,685.696667,1
2,12348.0,158,347,4,1709.4,25,427.35,1
3,12349.0,317,498,3,2671.14,90,890.38,1
4,12350.0,219,219,1,334.4,17,334.4,0


In [75]:
# Remove the columns that are not used for RFM scoring (Next_90_Days_Purchase is kept)
unused_cols = ['Tenure', 'Variety', 'AOV']
data = data.drop(columns=unused_cols)
data.head()

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Next_90_Days_Purchase
0,12346.0,235,12,77556.46,0
1,12347.0,39,6,4114.18,1
2,12348.0,158,4,1709.4,1
3,12349.0,317,3,2671.14,1
4,12350.0,219,1,334.4,0


## Define Quantiles

In [76]:
# Quartile cut-offs (25th/50th/75th percentiles) for the three RFM inputs only.
# Restricting the columns keeps this cell re-runnable after string columns such
# as 'RFM_Segment' have been added to `data`: pandas >= 2.0 no longer skips
# non-numeric columns in DataFrame.quantile by default and raises instead.
quantiles = data[['Recency', 'Frequency', 'Monetary']].quantile(q=[.25, .5, .75])

# Assign rfm scores
def r_score(x, p, d):
  """Score a raw value from 1 to 4 according to the quartile band it falls in.

  x: the raw value to score.
  p: name used to look up the cut-offs in ``d``. 'Recency' is scored in
     reverse (smaller value -> higher score); any other name scores upward.
  d: lookup such that ``d[p][q]`` is the cut-off for q in (.25, .50, .75).

  Returns an int in 1..4.
  """
  cutoffs = d[p]
  # Position of the first cut-off that x does not exceed; 3 when x is above
  # all of them (or compares False against every one, e.g. NaN).
  band = next((i for i, q in enumerate((.25, .50, .75)) if x <= cutoffs[q]), 3)
  return 4 - band if p == 'Recency' else band + 1

# Turn each raw RFM column into its 1-4 quartile score column
for score_col, source_col in (('R', 'Recency'), ('F', 'Frequency'), ('M', 'Monetary')):
  data[score_col] = data[source_col].apply(r_score, args=(source_col, quantiles))

data.head()

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Next_90_Days_Purchase,R,F,M
0,12346.0,235,12,77556.46,0,2,4,4
1,12347.0,39,6,4114.18,1,4,3,4
2,12348.0,158,4,1709.4,1,3,3,3
3,12349.0,317,3,2671.14,1,2,2,4
4,12350.0,219,1,334.4,0,2,1,2


In [77]:
# Build a three-character code from the R, F, M scores (e.g. '434') and a
# total score that adds the three together
score_cols = ['R', 'F', 'M']
data['RFM_Segment'] = data['R'].map(str) + data['F'].map(str) + data['M'].map(str)
data['RFM_Score'] = data[score_cols].sum(axis=1)
data.head()

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Next_90_Days_Purchase,R,F,M,RFM_Segment,RFM_Score
0,12346.0,235,12,77556.46,0,2,4,4,244,10
1,12347.0,39,6,4114.18,1,4,3,4,434,11
2,12348.0,158,4,1709.4,1,3,3,3,333,9
3,12349.0,317,3,2671.14,1,2,2,4,224,8
4,12350.0,219,1,334.4,0,2,1,2,212,5


## Segment Labels

In [78]:
segment_labels = ['Low-Value', 'Mid-Value', 'High-Value']

def assign_segment(score):
  """Return the value tier for a total RFM score.

  Scores below 5 get the first label, scores below 9 the second, and every
  other score the third.
  """
  tier = 0 if score < 5 else 1 if score < 9 else 2
  return segment_labels[tier]

# Label every customer with the value tier that matches their total score
data['RFM_Segment_Label'] = data['RFM_Score'].map(assign_segment)

data.head()

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Next_90_Days_Purchase,R,F,M,RFM_Segment,RFM_Score,RFM_Segment_Label
0,12346.0,235,12,77556.46,0,2,4,4,244,10,High-Value
1,12347.0,39,6,4114.18,1,4,3,4,434,11,High-Value
2,12348.0,158,4,1709.4,1,3,3,3,333,9,High-Value
3,12349.0,317,3,2671.14,1,2,2,4,224,8,Mid-Value
4,12350.0,219,1,334.4,0,2,1,2,212,5,Mid-Value


## Count Each Segment & Create Bar Chart

In [79]:
# Number of customers per value tier, ordered alphabetically by tier label
segment_count = (
    data['RFM_Segment_Label']
    .value_counts()
    .rename_axis('RFM_Segment')
    .reset_index(name='Count')
    .sort_values('RFM_Segment')
)

# Bar chart: one bar per tier, coloured by tier, legend hidden
axis_labels = {'RFM_Segment': 'RFM Segment', 'Count': 'Number of Customers'}
fig = px.bar(
    segment_count,
    x='RFM_Segment',
    y='Count',
    color='RFM_Segment',
    labels=axis_labels,
    title='Count of Customers in Each RFM Segment',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_layout(showlegend=False)
fig.show()

In [88]:
# Name each customer by total RFM score. Every rule is a half-open range
# [low, high) on RFM_Score; rows matching no rule keep the empty string.
data['RFM_Customer_Segments'] = ''
rfm_total = data['RFM_Score']
segment_rules = [
    (9, float('inf'), 'Champions'),
    (6, 9, 'Potential Loyal'),
    (5, 6, 'At Risk'),
    (4, 5, "Can't Lose"),
    (3, 4, 'Lost'),
]
for low, high, label in segment_rules:
  in_range = (rfm_total >= low) & (rfm_total < high)
  data.loc[in_range, 'RFM_Customer_Segments'] = label

# Customers per named segment, ordered alphabetically by segment name
segment_count = data['RFM_Customer_Segments'].value_counts().sort_index()

# Customer count for each (value tier, named segment) pair, largest first
segment_product_counts = (
    data.groupby(['RFM_Segment_Label', 'RFM_Customer_Segments'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [89]:
# Treemap: value tiers on the outer level, named segments nested inside,
# box area given by the customer count
tier_then_segment = ['RFM_Segment_Label', 'RFM_Customer_Segments']
fig_treemap_segment_product = px.treemap(
    segment_product_counts,
    path=tier_then_segment,
    values='Count',
    color='RFM_Segment_Label',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title='RFM Customer Segments by Value',
)
fig_treemap_segment_product.show()

## Box Plot RFM for Champions

In [90]:
# Rows belonging to the 'Champions' segment
vip_segment = data[data['RFM_Customer_Segments'] == 'Champions']

# One box per raw RFM column, all on a shared y-axis
fig = go.Figure()
for metric in ('Recency', 'Frequency', 'Monetary'):
  fig.add_trace(go.Box(y=vip_segment[metric], name=metric))

fig.update_layout(
    showlegend=False,
    title='RFM Values Distribution for Champions',
    yaxis_title='Value',
)
fig.show()

In [91]:
# Pairwise correlation between the R, F and M quartile scores, computed on the
# Champions rows only (vip_segment)
correlation_matrix = vip_segment[['R', 'F', 'M']].corr()

fig_heatmap = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    # Pin the colour range to the full correlation range so the neutral
    # midpoint of the diverging scale corresponds to zero correlation instead
    # of to the midpoint of whatever values happen to be present.
    zmin=-1,
    zmax=1,
    colorbar=dict(title='Correlation')
))

# The title names what is actually plotted: scores, for Champions only
fig_heatmap.update_layout(title='Correlation Heatmap for RFM Scores (Champions)')
fig_heatmap.show()

In [92]:
pastel_colors = plotly.colors.qualitative.Pastel

# Count customers per named segment here instead of reading `segment_count`:
# that name is bound to a DataFrame by the tier bar-chart cell and to a Series
# by the customer-segment cell, so its meaning depends on which ran last.
customer_segment_counts = data['RFM_Customer_Segments'].value_counts().sort_index()

# Give Champions its own colour; the other bars take pastel colours by position
# (wrapping around if there are ever more segments than palette entries)
vip_color = 'rgb(158, 202, 225)'
bar_colors = [
    vip_color if segment == 'Champions' else pastel_colors[i % len(pastel_colors)]
    for i, segment in enumerate(customer_segment_counts.index)
]

fig = go.Figure(data=[go.Bar(
    x=customer_segment_counts.index,
    y=customer_segment_counts.values,
    marker=dict(color=bar_colors,
                line=dict(color='rgb(8,48,107)', width=1.5)),
    opacity=0.6
)])

fig.update_layout(title='Comparison of RFM Segments',
                  xaxis_title='RFM Segment',
                  yaxis_title='Number of Customers',
                  showlegend=False
)

fig.show()

In [93]:
# Mean R, F and M score within each named customer segment
segment_scores = data.groupby('RFM_Customer_Segments')[['R','F','M']].mean().reset_index()

# (score column, legend name, bar colour) for each of the three bar series
score_bars = [
    ('R', 'Recency Score', 'rgb(158,202,225)'),
    ('F', 'Frequency Score', 'rgb(94,158,217)'),
    ('M', 'Monetary Score', 'rgb(32,102,148)'),
]

fig = go.Figure()
for score_col, legend_name, bar_color in score_bars:
  fig.add_trace(go.Bar(x=segment_scores['RFM_Customer_Segments'],
                       y=segment_scores[score_col],
                       name=legend_name,
                       marker_color=bar_color))

# Side-by-side bars per segment, with the legend identifying R / F / M
fig.update_layout(
    title='Comparison of RFM Segments based on R, F, M Scores',
    xaxis_title='RFM Segment',
    yaxis_title='Score',
    barmode='group',
    showlegend=True,
)

fig.show()

## Adjust the table for predictive modeling use later


In [98]:
# Strip every intermediate scoring column so only the raw inputs, the target
# and the named segment remain. The quartile scores R/F/M are dropped together
# with the other helper columns in one step, so a fresh top-to-bottom run
# reproduces the table recorded below this cell (which has no R, F, M columns).
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
helper_cols = ['R', 'F', 'M', 'RFM_Segment', 'RFM_Score', 'RFM_Segment_Label']
data = data.drop(columns=helper_cols, errors='ignore')
data.head()

Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Next_90_Days_Purchase,RFM_Customer_Segments
0,12346.0,235,12,77556.46,0,Champions
1,12347.0,39,6,4114.18,1,Champions
2,12348.0,158,4,1709.4,1,Champions
3,12349.0,317,3,2671.14,1,Potential Loyal
4,12350.0,219,1,334.4,0,At Risk


In [99]:
# Write the table, including the RFM_Customer_Segments column, to Drive
save_path = "/content/drive/MyDrive/Colab Notebooks/STA 141C/manual_segments_data.csv"

# index=False: the DataFrame's row index is not written to the file
data.to_csv(save_path, index=False)
print(f"Data saved successfully to: {save_path}")

Data saved successfully to: /content/drive/MyDrive/Colab Notebooks/STA 141C/manual_segments_data.csv
