# Rolling up Offer and Transaction Data for Clustering 

In [1]:
import pandas as pd
import os
import pandasql as ps

ModuleNotFoundError: No module named 'pandasql'

In [None]:
os.chdir('path to whatever folder contains transaction_transformed, offers_transformed csvs')

## Query for Offer Related Information

In [None]:
offers = pd.read_csv('offers_transformed.csv')

In [None]:
offers.head()

In [None]:
query = """
SELECT person, IFNULL(SUM(viewed), 0) as total_offer_views, IFNULL(SUM(completed), 0) as total_completed_offers, 
IFNULL(SUM(reward), 0) as total_rewards_received, IFNULL(AVG(amount), 0) as average_offer_payment
FROM offers
GROUP BY person
"""

offer_rollup = ps.sqldf(query = query)

## Transaction Queries

In [None]:
transactions = pd.read_csv("transactions_transformed.csv")

In [None]:
num_transactions = transactions.groupby(by = 'person').sum()[['transaction']].reset_index().rename({'transaction':'number_of_transactions'}, axis = 1)

In [None]:
query = """
SELECT person, IFNULL(AVG(amount), 0) as average_transaction_amount
FROM transactions
GROUP BY person
"""
average_transactions = ps.sqldf(query = query)

## Join Data Together

In [None]:
query = """
SELECT t1.person as person_id, * 
FROM offer_rollup t1 JOIN num_transactions t2 on t1.person = t2.person 
JOIN average_transactions t3 ON t1.person = t3.person
"""

full_profile_clustering = ps.sqldf(query = query)

In [None]:
full_profile_clustering.drop(columns = 'person', inplace = True)

## Checking Correlation

In [None]:
# check for correlations briefly
full_profile_clustering.corr()

## Export Data

In [None]:
full_profile_clustering.to_csv('user_activity_rollup.csv', index = False)

## KMeans Clustering

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#bring in rolled up data by person id
cluster_data = pd.read_csv("user_activity_rollup.csv")

In [3]:
cluster_data.head()

Unnamed: 0,person_id,total_offer_views,total_completed_offers,total_rewards_received,average_offer_payment,number_of_transactions,average_transaction_amount
0,0009655768c64bdeb2e877511632db8f,4,3,9.0,10.983333,8,15.95
1,00116118485d4dfda04fdbaba9a87b5c,4,0,0.0,0.0,3,1.363333
2,0011e0d4e6b944f998e987f904e8c1e5,5,3,13.0,18.676667,6,16.918333
3,0020c2b971eb4e9188eac86d93036a77,6,5,18.0,17.396,11,22.633636
4,0020ccbbb6d84e358d3414a3ff76cffd,4,3,13.0,12.166667,12,12.8375


In [4]:
cluster_data.describe()

Unnamed: 0,total_offer_views,total_completed_offers,total_rewards_received,average_offer_payment,number_of_transactions,average_transaction_amount
count,16572.0,16572.0,16572.0,16572.0,16572.0,16572.0
mean,5.847152,4.06692,20.09323,14.697263,10.603005,13.976243
std,5.067047,5.434291,32.699255,26.159824,7.428972,17.716807
min,0.0,0.0,0.0,0.0,1.0,0.05
25%,3.0,1.0,2.0,2.79,5.0,3.313
50%,4.0,2.0,10.0,12.45375,9.0,12.320909
75%,7.0,5.0,24.0,20.570833,14.0,20.551259
max,75.0,75.0,660.0,1015.73,88.0,533.606154


In [5]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

#Split data into train and test for clustering
X = cluster_data.drop(columns = "person_id")
y = cluster_data["person_id"]

#X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = .2, random_state = 1)

In [6]:
#Performing kmeans and attempting different numbers of clusters

kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)
                for k in range(1, 12)]

In [7]:
from sklearn.metrics import silhouette_score

silhouette_scores = [silhouette_score(X, model.labels_)
                     for model in kmeans_per_k[1:]]

KeyboardInterrupt: 

In [None]:
#Plotting the scores for each number of calculated Kmeans clusters

plt.figure(figsize=(8, 3))
plt.plot(range(1, 11), silhouette_scores, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Silhouette score", fontsize=14)
plt.axis([0, 12, 0.2, .8])
#save_fig("silhouette_score_vs_k_plot")
plt.show()

In [None]:
inertias = [model.inertia_ for model in kmeans_per_k]

print(inertias)

In [None]:
plt.figure(figsize=(8, 3.5))
plt.plot(range(1, 12), inertias, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)
plt.axis([0, 12, 1000000, 40000000])
#save_fig("inertia_vs_k_plot")
plt.show()

Based on the inertia and the the silhouette scores of the different number of clusters, the best number of clusters is 2 or 3 clusters. 2 clusters have the largest silhouette score (not counting 1), and 3 is the elbow point of the models inertia and also has a higher silhouette score.

## Standardization of clusters

In [9]:
from sklearn.preprocessing import StandardScaler
from pandas.plotting import scatter_matrix

scaler = StandardScaler()
scaled_model = scaler.fit_transform(X)


In [10]:
columns = scaled_model.columns
columns

#log_transform = [np.log(scaled_model["{}".format(name)]) for name in columns]
#df['logStreams'] = np.log10(df['Streams'])

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [11]:
X.head()

Unnamed: 0,total_offer_views,total_completed_offers,total_rewards_received,average_offer_payment,number_of_transactions,average_transaction_amount
0,4,3,9.0,10.983333,8,15.95
1,4,0,0.0,0.0,3,1.363333
2,5,3,13.0,18.676667,6,16.918333
3,6,5,18.0,17.396,11,22.633636
4,4,3,13.0,12.166667,12,12.8375


In [None]:
log = pd.DataFrame(log_transform)
log

In [None]:

scatter_matrix(pd.DataFrame(scaled_model), figsize = (10,10))

In [None]:
pd.DataFrame(scaled_model).head()

In [None]:
#Performing kmeans and attempting different numbers of clusters

kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(scaled_model)
                for k in range(1, 12)]

In [None]:
from sklearn.metrics import silhouette_score

silhouette_scores = [silhouette_score(scaled_model, model.labels_)
                     for model in kmeans_per_k[1:]]

In [None]:
#Plotting the scores for each number of calculated Kmeans clusters

plt.figure(figsize=(8, 3))
plt.plot(range(1, 11), silhouette_scores, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Silhouette score", fontsize=14)
plt.axis([0, 12, 0.2, .6])
#save_fig("silhouette_score_vs_k_plot")
plt.show()

In [None]:
inertias = [model.inertia_ for model in kmeans_per_k]

print(inertias)

In [None]:
plt.figure(figsize=(8, 3.5))
plt.plot(range(1, 12), inertias, "bo-")
plt.xlabel("$k$", fontsize=14)
plt.ylabel("Inertia", fontsize=14)
plt.axis([0, 12, 10000, 110000])
#save_fig("inertia_vs_k_plot")
plt.show()

### Clustering results

In [12]:
kmeans2 = KMeans(n_clusters = 2)
kmeans3 = KMeans(n_clusters = 3)

kmeans2.fit_predict(scaled_model)
kmeans3.fit_predict(scaled_model)

labels2 = kmeans2.labels_
labels3 = kmeans3.labels_

cluster_data2 = cluster_data
cluster_data2['cluster_num'] = labels2
cluster_data2.head()


Unnamed: 0,person_id,total_offer_views,total_completed_offers,total_rewards_received,average_offer_payment,number_of_transactions,average_transaction_amount,cluster_num
0,0009655768c64bdeb2e877511632db8f,4,3,9.0,10.983333,8,15.95,1
1,00116118485d4dfda04fdbaba9a87b5c,4,0,0.0,0.0,3,1.363333,1
2,0011e0d4e6b944f998e987f904e8c1e5,5,3,13.0,18.676667,6,16.918333,1
3,0020c2b971eb4e9188eac86d93036a77,6,5,18.0,17.396,11,22.633636,1
4,0020ccbbb6d84e358d3414a3ff76cffd,4,3,13.0,12.166667,12,12.8375,1


In [13]:
#Group by the clusters 
cluster_data2.groupby('cluster_num').mean()

Unnamed: 0_level_0,total_offer_views,total_completed_offers,total_rewards_received,average_offer_payment,number_of_transactions,average_transaction_amount
cluster_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,12.861799,12.34848,63.172987,26.336937,20.258853,22.880141
1,4.174352,2.091996,9.819894,11.921521,8.300351,11.852909


In [14]:
cluster_data3 = cluster_data
cluster_data3['cluster_num'] = labels3
cluster_data3.head()

Unnamed: 0,person_id,total_offer_views,total_completed_offers,total_rewards_received,average_offer_payment,number_of_transactions,average_transaction_amount,cluster_num
0,0009655768c64bdeb2e877511632db8f,4,3,9.0,10.983333,8,15.95,1
1,00116118485d4dfda04fdbaba9a87b5c,4,0,0.0,0.0,3,1.363333,1
2,0011e0d4e6b944f998e987f904e8c1e5,5,3,13.0,18.676667,6,16.918333,1
3,0020c2b971eb4e9188eac86d93036a77,6,5,18.0,17.396,11,22.633636,1
4,0020ccbbb6d84e358d3414a3ff76cffd,4,3,13.0,12.166667,12,12.8375,1


In [15]:
#Group by the clusters
cluster_data3.groupby('cluster_num').mean()

Unnamed: 0_level_0,total_offer_views,total_completed_offers,total_rewards_received,average_offer_payment,number_of_transactions,average_transaction_amount
cluster_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.746988,6.180723,28.903614,285.471721,11.036145,178.292783
1,4.172581,2.093612,9.83093,12.047559,8.283825,11.977261
2,13.044137,12.519652,64.110503,18.882831,20.592139,18.202435


In [16]:
cluster_data3.cluster_num.value_counts()

1    13385
2     3104
0       83
Name: cluster_num, dtype: int64

## Linear Discriminant Analysis

Now we compute the linear discriminant analysis based off the clusters made previously

In [None]:
#first we want to add in the discrimnant variables of the dataset. Age, income, gender, and when they became a member
profile = pd.read_csv("https://raw.githubusercontent.com/mitchell-jones/starbucks-6276/main/data/processed/profile_transformed.csv")

#We remove the extra index and the scaled membership days and when they became a member
profile = profile.drop(columns = ["Unnamed: 0", "membership_length_scaled", "became_member_on", "became_member_on_converted"])
profile.head()

In [None]:
#Merge the profile data to the cluster result data
lda_data = profile.merge(cluster_data3, how = "inner", left_on = "id", right_on = "person_id")

#remove repeat of id
lda_data = lda_data.drop(columns = "id")


In [None]:
#since we don't have enough info on "other" in gender to create a conclusive analysis, we will remove it and convert gender into a binary variable
lda_data.gender.value_counts()

# converting to binary data for easier analysis
df_one = pd.get_dummies(lda_data["gender"])

 
# display result
df_two = pd.concat((df_one, lda_data), axis=1)
df_two = df_two.drop(["gender", 'O'], axis=1)
df_two = df_two.drop(["M"], axis=1)
#convert binary data so if gender = 1 is female and gender = 0 is male
lda_updated = df_two.rename(columns={"F": "gender"})

lda_updated.head()


In [None]:
#Create X and y for lda 
#For LDA we want to focus on the discriminant variables and their relation to the cluster numbers
y = lda_updated.cluster_num
X = lda_updated[["gender", "age", "income", "membership_length_days"]]

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# create the lda model
lda = LinearDiscriminantAnalysis()

#Define the evaluation method using kfolds
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=1)

In [None]:
lda.fit(X, y)

In [None]:
# evaluate model
scores = cross_val_score(estimator = lda, X = X, y = y,
                         scoring='accuracy',
                         cv=cv, n_jobs=-1)
# summarize result
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [None]:
# Evaluate result
means = pd.DataFrame(lda.means_, columns = ["gender", "age", "income", "membership_length_days"])

round(means, 3)

1. Age doesnt seem to influence behavior that much. 
2. Income does separate cluster 2 from the others.
3. Cluster 1 has longer membership
4. Cluster 0 is closer to male