# User Segmentation RFM Model

- This exploration looks to use K-means clustering to cluster users based on an RFM (recency, frequency, monetary) model.
- It will clusters users based on the recency and frequency of objective completions, as well as the total number of points that they have claimed.
- Code for K-means clustering is from https://neptune.ai/blog/customer-segmentation-using-machine-learning.

#### Import packages

In [9]:
import os
import pandas as pd
import mysql.connector
import plotly.express as px
from plotly import graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
import numpy as np
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

#chart_studio credentials
chart_studio.tools.set_credentials_file(username = os.environ.get('cs_user'), api_key = os.environ.get('cs_key'))

In [2]:
#Pandas number of rows displayed (Set None for all rows)
#pd.set_option('display.max_rows', 10)

#### SQL Setup

In [3]:
usr = os.environ.get('sql_user')
pwd = os.environ.get('sql_pwd')
host = os.environ.get('sql_host')
db = os.environ.get('sql_db')

def query(sql):
    
    cnx = mysql.connector.connect(user=usr, 
                              password=pwd,
                              host=host,
                              database=db)
    
    df = pd.read_sql(sql, cnx)
    cnx.close()
    return df

## Identifying user features

We will be examining users sponsored by Red River Credit Union, as they have a wide selection of objectives to choose from.

In [4]:
rfm_query = '''
SELECT id AS user_id, DATEDIFF(current_date(), date_claimed) AS recency, COUNT(DISTINCT(name)) as frequency, SUM(points) as points
FROM
(
SELECT u.id, o.name, o.points, oc.date_claimed
FROM objective_claim oc
LEFT JOIN user u ON u.id = oc.user_id
LEFT JOIN objective o ON o.id = oc.objective_id
WHERE u.institution_id = 19634
) RRCU_table
GROUP BY id
'''

rfm_table = query(rfm_query)
rfm_table

Unnamed: 0,user_id,recency,frequency,points
0,913442,267,2,600.0
1,916622,266,1,100.0
2,917246,266,3,2700.0
3,917379,258,3,600.0
4,934769,259,1,200.0
...,...,...,...,...
4140,2212860,1,3,600.0
4141,2213257,1,1,100.0
4142,2213314,1,1,100.0
4143,2213507,0,4,700.0


## Fitting the Model

In [11]:
#Define a K-means model:
kmeans_model = KMeans(init='k-means++',  max_iter=400, random_state=42)

#Train the model:
kmeans_model.fit(rfm_table[['recency','frequency','points']])

KMeans(max_iter=400, random_state=42)

### Finding the optimal number of clusters:

In [13]:
# Create the K means model for different values of K
def try_different_clusters(K, data):
       
    cluster_values = list(range(1, K+1))
    inertias=[]
    
    for c in cluster_values:
        model = KMeans(n_clusters = c,init='k-means++',max_iter=400,random_state=42)
        model.fit(data)
        inertias.append(model.inertia_)
    
    return inertias

In [14]:
# Find output for k values between 1 to 12 
outputs = try_different_clusters(12, rfm_table[['recency','frequency','points']])
distances = pd.DataFrame({"clusters": list(range(1, 13)),"sum of squared distances": outputs})


In [15]:
figure = go.Figure()
figure.add_trace(go.Scatter(x=distances["clusters"], y=distances["sum of squared distances"]))

figure.update_layout(xaxis = dict(tick0 = 1,dtick = 1,tickmode = 'linear'),                  
                  xaxis_title="Number of clusters",
                  yaxis_title="Sum of squared distances",
                  title_text="Finding optimal number of clusters using elbow method")
figure.show()

As the plot above indicates, the optimal number of clusters is 5.

In [25]:
# Re-Train K means model with k=5
kmeans_model_new = KMeans(n_clusters = 5,init='k-means++',max_iter=400,random_state=42)
kmeans_model_new.fit_predict(rfm_table[['recency','frequency','points']])

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [26]:
# Create data arrays
cluster_centers = kmeans_model_new.cluster_centers_
data = np.expm1(cluster_centers)
points = np.append(data, cluster_centers, axis=1)

# Add "cluster" to customers data
points = np.append(points, [[0], [1], [2], [3], [4]], axis=1)
rfm_table["clusters"] = kmeans_model_new.labels_
rfm_table


overflow encountered in expm1



Unnamed: 0,user_id,recency,frequency,points,clusters
0,913442,267,2,600.0,0
1,916622,266,1,100.0,0
2,917246,266,3,2700.0,0
3,917379,258,3,600.0,0
4,934769,259,1,200.0,0
...,...,...,...,...,...
4140,2212860,1,3,600.0,0
4141,2213257,1,1,100.0,0
4142,2213314,1,1,100.0,0
4143,2213507,0,4,700.0,0


## Visualization the clusters

In [28]:
# visualize clusters
cluter_scatter = px.scatter_3d(rfm_table,
                    color='clusters',
                    x="recency",
                    y="frequency",
                    z="points",            
                    category_orders = {"clusters": ["0", "1", "2", "3", "4"]}                    
                    )
cluter_scatter.update_layout()
cluter_scatter.show()

# Export to chart studio
py.plot(cluter_scatter, name = 'Segmentation of RRCU Users', auto_open = True)

'https://plotly.com/~woonggyujin/15/'