# Customer Segmentation Analysis - Data Analysis

# Part 1 - Marketing Campaign

In [None]:
#packages
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.disable_max_rows()
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
import pyperclip as pc

# Loading and checking the data

We now have our Analytical Base Table (ABT), which we will use for the analysis in this first part. Some variables were created to help us:

    - "email": "Mail" indicator. A value of "1" indicates that a customer received a communication, while no value indicates that the customer was not contacted (i.e. the customer was in the "Control" group). It is the same as MilInd from the raw data, but here we will use it as a string

    - "spent": how much ($) the user spent with our company; it is set to 0 for all users that didn't buy anything

    - "shopper": indicates whether or not the user made a purchase

    - "negative": whether or not the user has a negative value for "spent" (it is odd that some users do, so we will try to understand why it happens)

In [None]:
# Read the Analytical Base Table (ABT) used throughout Part 1 from its CSV file
data = pd.read_csv(filepath_or_buffer="customer_segmentation_ABT.csv")

In [None]:
# Summarise the ABT: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110000 entries, 0 to 109999
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   ID         110000 non-null  int64  
 1   ProfileID  65946 non-null   float64
 2   SegmentID  110000 non-null  int64  
 3   version    110000 non-null  object 
 4   ind        100000 non-null  float64
 5   email      110000 non-null  int64  
 6   spent      110000 non-null  float64
 7   shopper    110000 non-null  int64  
 8   negative   110000 non-null  object 
 9   profile    65946 non-null   object 
 10  segment    110000 non-null  object 
dtypes: float64(3), int64(4), object(4)
memory usage: 9.2+ MB


#### We have 110000 records; the only variables with NULLs are ProfileID/profile (65946 non-null) and ind (100000 non-null)

In [None]:
# Preview the first rows of the ABT
data.head()

Unnamed: 0,ID,ProfileID,SegmentID,version,ind,email,spent,shopper,profile,segment
0,1,5.0,5,A,1.0,1,0.0,0,Pinched Pockets,New Customers
1,2,,5,A,1.0,1,0.0,0,,New Customers
2,3,2.0,3,B,1.0,1,0.0,0,Rich & Richer,Power Shoppers
3,4,1.0,2,A,1.0,1,0.0,0,City Slickers,Core Customers
4,5,4.0,1,A,1.0,1,0.0,0,Blue Collar Royalty,Elite Customers


In [None]:
# Store the email indicator as text so it is handled as a label rather than a number.
# NOTE: from here on the column holds the strings "0"/"1"; filters that compare it
# against the integers 0/1 will not match any row.
data["email"] = data["email"].map(str)

In [None]:
# Store the negative-spend indicator as text so it is handled as a label
data["negative"] = data["negative"].map(str)

#### Now, let's check if the two groups are about the same size

In [None]:
# Share (%) of users assigned to each campaign version
print(data["version"].value_counts().div(data["version"].count()).mul(100))

A    50.200909
B    49.799091
Name: version, dtype: float64


In [None]:
# Share (%) of users per email flag (1 = received the communication, 0 = control group)
print(data["email"].value_counts().div(data["email"].count()).mul(100))

1    90.909091
0     9.090909
Name: email, dtype: float64


#### We have an almost exact 50%/50% split between the two groups A and B

#### About 9% of the users did not receive the communication (control group)

# Metrics Calculation

We need to define our success metric and additional metrics so we can measure success

Our success metric will be:
##### Conversion Rate
    For all users that have ever been part of the campaigns, we calculate how many of them made a purchase
We will also look at this one:
##### Total spent
    For every user that has ever been part of the campaigns, we calculate how much that user spent

In [None]:
# Conversion rate (%) for every (email, version) combination:
# number of shoppers divided by number of users in the group
cr = data.groupby(by=["email", "version"], as_index=False).agg(
    {"shopper": ["count", "sum"]}
)
cr["conversion_rate"] = cr["shopper"]["sum"].div(cr["shopper"]["count"]).mul(100)

In [None]:
# Drop the helper count/sum columns, keeping email, version and conversion_rate.
# `cr` has two-level column labels with "shopper" on the top level, so the level is
# named explicitly; this avoids the PerformanceWarning pandas emits when dropping
# from a MultiIndex without a `level` argument.
cr = cr.drop(columns=["shopper"], level=0)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [None]:
# Display the conversion-rate table
cr

Unnamed: 0,email,version,conversion_rate
,,,
0.0,0.0,A,7.027559
1.0,0.0,B,7.398374
2.0,1.0,A,7.271494
3.0,1.0,B,7.755872


#### We can see small differences in the conversion rate between the groups, but we need to run the A/B test to confirm whether these differences are real (and not just a coincidence).

In [None]:
# Copy the conversion-rate table to the system clipboard.
# NOTE(review): presumably used to paste the table into an external tool; this needs a
# clipboard backend and may fail on a headless kernel — confirm it is still required.
cr.to_clipboard()

In [None]:
# Write the conversion-rate table to cr.csv without the row index.
# NOTE(review): cr.info() in the next cell reports tuple column labels, so the header
# written here may span two rows — verify the file before reloading it.
cr.to_csv('cr.csv',index=False)

In [None]:
# Inspect the column labels and dtypes of the conversion-rate table
cr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   (email, )            4 non-null      object 
 1   (version, )          4 non-null      object 
 2   (conversion_rate, )  4 non-null      float64
dtypes: float64(1), object(2)
memory usage: 128.0+ bytes


In [None]:
# Reload the conversion-rate table from CSV.
# NOTE(review): no visible cell writes customer_segmentation_CR.csv (the export above
# writes 'cr.csv') — confirm where this file comes from so the notebook can be re-run
# from a fresh kernel.
cr = pd.read_csv("customer_segmentation_CR.csv")

In [None]:
# Inspect the reloaded conversion-rate table
cr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   email            4 non-null      int64  
 1   version          4 non-null      object 
 2   conversion_rate  4 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 224.0+ bytes


In [None]:
# Descriptive statistics of spend for all users (buyers and non-buyers),
# split by campaign version and email flag
data.groupby(by=["version", "email"])["spent"].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
version,email,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A,0,5080.0,8.302624,45.386802,-502.980011,0.0,0.0,0.0,948.099976
A,1,50141.0,8.57792,44.83767,-479.720001,0.0,0.0,0.0,1616.459961
B,0,4920.0,7.820827,38.922911,-282.709991,0.0,0.0,0.0,713.76001
B,1,49859.0,8.314443,42.32125,-768.820007,0.0,0.0,0.0,1216.130005


##### This does not make sense, because non-buyers (spent = 0) dominate the statistics - we need to calculate only for those who actually bought something

In [None]:
# Spend summary (sum, mean, median) per email flag and version, restricted to buyers
total_spent = (
    data.loc[data["shopper"] == 1]
    .groupby(by=["email", "version"], as_index=False)
    .agg({"spent": ["sum", "mean", "median"]})
)
total_spent

Unnamed: 0_level_0,email,version,spent,spent,spent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,median
0,0,A,42177.329932,118.143781,91.730003
1,0,B,38478.469986,105.710082,86.009998
2,1,A,430105.469889,117.966393,96.629997
3,1,B,414549.83012,107.201921,85.309998


In [None]:
# Round every numeric value of the spend summary to whole units
total_spent = total_spent.round(decimals=0)
total_spent

Unnamed: 0_level_0,email,version,spent,spent,spent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,median
0,0,A,42177.0,118.0,92.0
1,0,B,38478.0,106.0,86.0
2,1,A,430105.0,118.0,97.0
3,1,B,414550.0,107.0,85.0


##### We can see small differences in the total_spent values between the groups, but we need to run the A/B test to confirm whether these differences are real (and not just a coincidence).

#### ==================================================================
# A/B test
#### ==================================================================

## Define the hypothesis
    What will this campaign improve?

    - Hypothesis 1: users that receive promotional material A are more likely to purchase a product than users that don't receive the same material OR
                users that receive promotional material A are more likely to spend more money buying a product than users that don't receive the same material


    - Hypothesis 2: users that receive promotional material B are more likely to purchase a product than users that don't receive the same material OR
                users that receive promotional material B are more likely to spend more money buying a product than users that don't receive the same material


    - Hypothesis 3: users that receive promotional material A are more likely to purchase a product than users that receive promotional material B
    OR
                users that receive promotional material A are more likely to spend more money buying a product than users that receive promotional material B

## Test 1 - Version: A

In [None]:
# Conversion rates for campaign version A (control vs. contacted)
cr.loc[cr["version"] == "A"]

Unnamed: 0_level_0,email,version,shopper,shopper,conversion_rate
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,sum,Unnamed: 5_level_1
0,0,A,5080,357,7.03
2,1,A,50141,3646,7.27


In [None]:
# Test 1 (conversion): within version A, does receiving the email change the likelihood
# of becoming a shopper? Two-sample t-test on the 0/1 `shopper` flag, control (email 0)
# vs. contacted (email 1).
# The email flag is compared as text: an earlier cell converts `email` to strings, so a
# numeric filter (email == 0) would select no rows and the test would return NaN.
email_flag = data["email"].astype(str)
in_version = data["version"] == "A"
res = ttest_ind(
    data.loc[in_version & (email_flag == "0"), "shopper"].to_numpy(),
    data.loc[in_version & (email_flag == "1"), "shopper"].to_numpy(),
).pvalue

print(res)
# 5% significance level
if res >=0.05:
    print("The difference is not significant. There is no difference between the treatments")
else:
    print("The difference is significant. There is differece between the treatments.")

0.5228792092963581
The difference is not significant. There is no diference between the treatments


### Conclusion - Test 1:
##### We cannot say that users who received promotional material A were more likely to purchase a product than users who didn't receive the same material

In [None]:
# Spend summary for campaign version A (control vs. contacted)
total_spent.loc[total_spent["version"] == "A"]

Unnamed: 0_level_0,email,version,spent,spent,spent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,median
0,0,A,42177.0,118.0,92.0
2,1,A,430105.0,118.0,97.0


In [None]:
# Test 1 (spend): within version A and among buyers only, does receiving the email
# change how much was spent? Two-sample t-test on `spent`, control vs. contacted.
# The email flag is compared as text: an earlier cell converts `email` to strings, so a
# numeric filter (email == 0) would select no rows and the test would return NaN.
email_flag = data["email"].astype(str)
buyers_in_version = (data["version"] == "A") & (data["shopper"] == 1)
res = ttest_ind(
    data.loc[buyers_in_version & (email_flag == "0"), "spent"].to_numpy(),
    data.loc[buyers_in_version & (email_flag == "1"), "spent"].to_numpy(),
).pvalue

print(res)
# 5% significance level
if res >=0.05:
    print("The difference is not significant. There is no difference between the treatments")
else:
    print("The difference is significant. There is differece between the treatments.")

0.9790897785030468
The difference is not significant. There is no difference between the treatments


### Conclusion 2 - Test 1:
##### We cannot say that users who received promotional material A were more likely to spend more money buying a product than users who didn't receive the same material

## Test 2 - Version: B

In [None]:
# Conversion rates for campaign version B (control vs. contacted)
cr.loc[cr["version"] == "B"]

Unnamed: 0_level_0,email,version,shopper,shopper,conversion_rate
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,sum,Unnamed: 5_level_1
1,0,B,4920,364,7.4
3,1,B,49859,3867,7.76


In [None]:
# Test 2 (conversion): within version B, does receiving the email change the likelihood
# of becoming a shopper? Two-sample t-test on the 0/1 `shopper` flag, control (email 0)
# vs. contacted (email 1).
# The email flag is compared as text: an earlier cell converts `email` to strings, so a
# numeric filter (email == 0) would select no rows and the test would return NaN.
email_flag = data["email"].astype(str)
in_version = data["version"] == "B"
res = ttest_ind(
    data.loc[in_version & (email_flag == "0"), "shopper"].to_numpy(),
    data.loc[in_version & (email_flag == "1"), "shopper"].to_numpy(),
).pvalue

print(res)
# 5% significance level
if res >=0.05:
    print("The difference is not significant. There is no difference between the treatments")
else:
    print("The difference is significant. There is differece between the treatments.")

0.3702035607009382
The difference is not significant. There is no difference between the treatments


### Conclusion 1 - Test 2:
##### We cannot say that users who received promotional material B were more likely to purchase a product than users who didn't receive the same material

In [None]:
# Spend summary for campaign version B (control vs. contacted)
total_spent.loc[total_spent["version"] == "B"]

Unnamed: 0_level_0,email,version,spent,spent,spent
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,median
1,0,B,38478.0,106.0,86.0
3,1,B,414550.0,107.0,85.0


In [None]:
# Test 2 (spend): within version B and among buyers only, does receiving the email
# change how much was spent? Two-sample t-test on `spent`, control vs. contacted.
# The email flag is compared as text: an earlier cell converts `email` to strings, so a
# numeric filter (email == 0) would select no rows and the test would return NaN.
email_flag = data["email"].astype(str)
buyers_in_version = (data["version"] == "B") & (data["shopper"] == 1)
res = ttest_ind(
    data.loc[buyers_in_version & (email_flag == "0"), "spent"].to_numpy(),
    data.loc[buyers_in_version & (email_flag == "1"), "spent"].to_numpy(),
).pvalue

print(res)
# 5% significance level
if res >=0.05:
    print("The difference is not significant. There is no difference between the treatments")
else:
    print("The difference is significant. There is differece between the treatments.")

0.8061522336285047
The difference is not significant. There is no difference between the treatments


### Conclusion 2 - Test 2:
##### We cannot say that users who received promotional material B were more likely to spend more money buying a product than users who didn't receive the same material

## Test 3 - Version: A  x Version: B

In [None]:
# Number of shoppers and non-shoppers in each campaign version
data.groupby(by=["version", "shopper"]).size()

version  shopper
A        0          51218
         1           4003
B        0          50548
         1           4231
dtype: int64

In [None]:
# Test 3 (conversion): is the likelihood of becoming a shopper different between
# version A and version B? Two-sample t-test on the 0/1 `shopper` flag.
# The test must use `shopper` (purchase yes/no); testing `spent` here would answer the
# spend question instead of the conversion question this cell is about.
res = ttest_ind(data.query('version == "A"')['shopper'].to_numpy(),
                data.query('version == "B"')['shopper'].to_numpy()).pvalue

# The p-value is printed as a percentage; the 5% threshold is applied to the raw value
print(res*100)
if res >=0.05:
    print("The difference is not significant. There is no difference between the treatments")
else:
    print("The difference is significant. There is differece between the treatments.")

28.138952063539925
The difference is not significant. There is no difference between the treatments


### Conclusion 1 - Test 3:

##### We cannot say that users who received promotional material A are more likely to purchase a product than users who received promotional material B

In [None]:
# Descriptive statistics of spend per campaign version, buyers only
data.loc[data["shopper"] == 1].groupby(by="version")["spent"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,4003.0,117.982213,122.018116,-502.980011,57.980001,96.089996,158.540001,1616.459961
B,4231.0,107.073576,110.866615,-768.820007,51.575001,85.419998,144.775002,1216.130005


In [None]:
# Total spend per campaign version, buyers only
data.loc[data["shopper"] == 1].groupby(by="version")["spent"].sum()

version
A    472282.799821
B    453028.300106
Name: spent, dtype: float64

In [None]:
# Test 3 (spend): among buyers only, does the amount spent differ between
# version A and version B? Two-sample t-test on `spent`.
res = ttest_ind(
    data.loc[(data["version"] == "A") & (data["shopper"] == 1), "spent"].to_numpy(),
    data.loc[(data["version"] == "B") & (data["shopper"] == 1), "spent"].to_numpy(),
).pvalue

# The p-value is printed as a percentage; the 5% threshold is applied to the raw value
print(res*100)
print(
    "The difference is not significant. There is no difference between the treatments"
    if res >= 0.05
    else "The difference is significant. There is differece between the treatments."
)

0.0021652873095464914
The difference is significant. There is differece between the treatments.


### Conclusion 2 - Test 3:

#####  We might say that users who received promotional material A are likely to spend more money buying a product than users who received promotional material B, but we cannot say that it is because of the material

In [None]:
# Version A vs. version B on the 0/1 `shopper` flag, this time without assuming
# equal variances (equal_var=False); both the statistic and the p-value are reported
t, p = ttest_ind(
    data.query('version == "A"')["shopper"].to_numpy(),
    data.query('version == "B"')["shopper"].to_numpy(),
    equal_var=False,
)

print('t-value = ' + str(t))
print('p-value = ' + str(p))

# 5% significance level
print(
    "The difference is not significant. There is no difference between the treatments"
    if p >= 0.05
    else "The difference is significant. There is differece between the treatments."
)

t-value = -2.9911557588331403
p-value = 0.002779853295946893
The difference is significant. There is differece between the treatments.


## Segments

In [None]:
# Conversion rate (%) for every (segment, email, version) combination:
# number of shoppers divided by number of users in the group
cr_segments = data.groupby(by=["segment", "email", "version"], as_index=False).agg(
    {"shopper": ["count", "sum"]}
)
cr_segments["conversion_rate"] = (
    cr_segments["shopper"]["sum"].div(cr_segments["shopper"]["count"]).mul(100)
)

In [None]:
# Display the per-segment conversion-rate table
cr_segments

Unnamed: 0_level_0,segment,email,version,shopper,shopper,conversion_rate
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,sum,Unnamed: 6_level_1
0,Core Customers,0,A,1553,125,8.048938
1,Core Customers,0,B,1418,136,9.590973
2,Core Customers,1,A,15186,1252,8.244436
3,Core Customers,1,B,15165,1349,8.895483
4,Elite Customers,0,A,762,74,9.711286
5,Elite Customers,0,B,268,21,7.835821
6,Elite Customers,1,A,7394,665,8.993779
7,Elite Customers,1,B,2534,248,9.786898
8,Infrequent Customers,0,A,260,13,5.0
9,Infrequent Customers,0,B,734,49,6.675749


In [None]:
# Infrequent Customers: does being contacted change the likelihood of becoming a
# shopper? Two-sample t-test on the 0/1 `shopper` flag, contacted (email 1) vs.
# control (email 0).
# The email flag is compared as text: an earlier cell converts `email` to strings, so a
# numeric filter (email == 1) would select no rows and the test would return NaN.
email_flag = data["email"].astype(str)
in_segment = data["segment"] == "Infrequent Customers"
res = ttest_ind(
    data.loc[in_segment & (email_flag == "1"), "shopper"].to_numpy(),
    data.loc[in_segment & (email_flag == "0"), "shopper"].to_numpy(),
).pvalue

print(res)
# 5% significance level
if res >=0.05:
    print("The difference is not significant. There is no difference between the treatments")
else:
    print("The difference is significant. There is differece between the treatments.")

0.3473332759107197
The difference is not significant. There is no difference between the treatments


In [None]:
# New Customers: is the likelihood of becoming a shopper different between
# version B and version A? Two-sample t-test on the 0/1 `shopper` flag.
res = ttest_ind(
    data.loc[(data["version"] == "B") & (data["segment"] == "New Customers"), "shopper"].to_numpy(),
    data.loc[(data["version"] == "A") & (data["segment"] == "New Customers"), "shopper"].to_numpy(),
).pvalue

print(res)
# 5% significance level
print(
    "The difference is not significant. There is no difference between the treatments"
    if res >= 0.05
    else "The difference is significant. There is differece between the treatments."
)

7.917146028685101e-05
The difference is significant. There is differece between the treatments.


Core Customers
New Customers

In [None]:
# Conversion rate (shoppers / users) for New Customers, per campaign version.
# Each subset is selected once and reused for both the numerator and the denominator.
new_customers_a = data.query('version == "A" and segment == "New Customers" ')
new_customers_b = data.query('version == "B" and segment == "New Customers" ')

cr_a = new_customers_a['shopper'].sum() / new_customers_a['ID'].count()
cr_b = new_customers_b['shopper'].sum() / new_customers_b['ID'].count()

# Conversion rates expressed as percentages
print(cr_a*100)
print(cr_b*100)

# Relative difference (%) of version B's conversion rate over version A's
diff = ((cr_b / cr_a ) - 1)*100
print(diff)

5.731846118825404
6.639857423570808
15.84151573370358


In [None]:
# Conversion rate (shoppers / users) for Core Customers, per campaign version.
# Each subset is selected once and reused for both the numerator and the denominator.
core_customers_a = data.query('version == "A" and segment == "Core Customers" ')
core_customers_b = data.query('version == "B" and segment == "Core Customers" ')

cr_a = core_customers_a['shopper'].sum() / core_customers_a['ID'].count()
cr_b = core_customers_b['shopper'].sum() / core_customers_b['ID'].count()

# Conversion rates expressed as percentages
print(cr_a*100)
print(cr_b*100)

# Relative difference (%) of version B's conversion rate over version A's
diff = ((cr_b / cr_a ) - 1)*100
print(diff)

8.22629786725611
8.954953868419466
8.857641832587815


### Profiles

In [None]:
# Conversion rate (%) for every (profile, version) combination:
# number of shoppers divided by number of users in the group
cr_profiles = data.groupby(by=["profile", "version"], as_index=False).agg(
    {"shopper": ["count", "sum"]}
)
cr_profiles["conversion_rate"] = (
    cr_profiles["shopper"]["sum"].div(cr_profiles["shopper"]["count"]).mul(100)
)

In [None]:
# Display the per-profile conversion-rate table
cr_profiles

Unnamed: 0_level_0,profile,version,shopper,shopper,conversion_rate
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,sum,Unnamed: 5_level_1
0,Blue Collar Royalty,A,5531,501,9.058037
1,Blue Collar Royalty,B,5525,495,8.959276
2,City Slickers,A,5635,569,10.097604
3,City Slickers,B,5475,601,10.977169
4,Mr. & Mrs. Smiths,A,5553,147,2.647218
5,Mr. & Mrs. Smiths,B,5476,169,3.086194
6,Normal Families,A,5521,509,9.219344
7,Normal Families,B,5511,499,9.054618
8,Pinched Pockets,A,5423,410,7.560391
9,Pinched Pockets,B,5406,449,8.305586


In [None]:
# Blue Collar Royalty: is the likelihood of becoming a shopper different between
# version A and version B? Two-sample t-test on the 0/1 `shopper` flag.
res = ttest_ind(
    data.loc[(data["version"] == "A") & (data["profile"] == "Blue Collar Royalty"), "shopper"].to_numpy(),
    data.loc[(data["version"] == "B") & (data["profile"] == "Blue Collar Royalty"), "shopper"].to_numpy(),
).pvalue

print(res)
# 5% significance level
print(
    "The difference is not significant. There is no difference between the treatments"
    if res >= 0.05
    else "The difference is significant. There is differece between the treatments."
)

0.8561075192957218
The difference is not significant. There is no difference between the treatments


In [None]:
# Blue Collar Royalty: version A vs. version B on the 0/1 `shopper` flag.
# NOTE(review): this cell runs the same test as the cell before it — presumably a
# different profile was intended here; confirm and update the profile name.
res = ttest_ind(
    data.loc[(data["version"] == "A") & (data["profile"] == "Blue Collar Royalty"), "shopper"].to_numpy(),
    data.loc[(data["version"] == "B") & (data["profile"] == "Blue Collar Royalty"), "shopper"].to_numpy(),
).pvalue

print(res)
# 5% significance level
print(
    "The difference is not significant. There is no difference between the treatments"
    if res >= 0.05
    else "The difference is significant. There is differece between the treatments."
)

0.8561075192957218
The difference is not significant. There is no difference between the treatments


## ==============================================================

# Part 2 - Department analysis

## ==============================================================


In [None]:
# Read the department-level table (DEPTS) used in Part 2 from its CSV file
dpt = pd.read_csv(filepath_or_buffer="customer_segmentation_DEPTS.csv")

In [None]:
# Display the department table (the rendered output is truncated in the middle)
dpt

Unnamed: 0,DepartmentID,Year,SegmentID,ProfileID,Sales,Customers,department,profile,segment
0,A,1999,1,,1989.77,347,Shirts,,Elite Customers
1,A,1999,1,1.0,6270.30,616,Shirts,City Slickers,Elite Customers
2,A,1999,1,2.0,7131.06,657,Shirts,Rich & Richer,Elite Customers
3,A,1999,1,3.0,7293.91,665,Shirts,Mr. & Mrs. Smiths,Elite Customers
4,A,1999,1,4.0,1995.79,348,Shirts,Blue Collar Royalty,Elite Customers
...,...,...,...,...,...,...,...,...,...
695,J,2000,5,2.0,20734.50,3380,Misc,Rich & Richer,New Customers
696,J,2000,5,3.0,31718.50,4181,Misc,Mr. & Mrs. Smiths,New Customers
697,J,2000,5,4.0,39.53,148,Misc,Blue Collar Royalty,New Customers
698,J,2000,5,5.0,48.21,163,Misc,Pinched Pockets,New Customers


### Further analysis here:

### https://docs.google.com/spreadsheets/d/1DEJDVPyDggDqbxdqSE1vUflt8_dhAkUKtVtPI9u37wM/edit?usp=sharing

## Negative values

Let's explore the negative values to understand more about that

In [None]:
# Number of users with a negative spend, per profile
# (the `negative` flag is stored as text, hence the comparison with "1")
data[data["negative"] == "1"].groupby(by=["profile"]).size()

profile
Blue Collar Royalty    44
City Slickers          54
Mr. & Mrs. Smiths      13
Normal Families        63
Pinched Pockets        47
Rich & Richer          56
dtype: int64

In [None]:
# Total spend per profile, split by the negative-spend flag
data.groupby(by=["profile", "negative"])["spent"].sum()

profile              negative
Blue Collar Royalty  0           130478.529778
                     1            -6307.819988
City Slickers        0           208219.099846
                     1           -10784.939968
Mr. & Mrs. Smiths    0            82044.789909
                     1            -4159.570023
Normal Families      0           108679.790188
                     1            -7556.189987
Pinched Pockets      0            57282.589927
                     1            -3261.849997
Rich & Richer        0           153672.840134
                     1            -8648.999973
Name: spent, dtype: float64

In [None]:
# Descriptive statistics of spend per profile and negative-spend flag, kept in `x`
x = data.groupby(by=["profile", "negative"])["spent"].describe()

In [None]:
# Descriptive statistics of spend per segment and negative-spend flag, copied to the
# system clipboard.
# NOTE(review): presumably pasted into an external tool; this needs a clipboard backend
# and may fail on a headless kernel — confirm it is still required.
data.groupby(['segment','negative'])['spent'].describe().to_clipboard()