## Import Modules

In [1]:
# Standard library: path handling, randomness, module reloading
import os
import random
# `imp` is deprecated since Python 3.4 and removed in 3.12; importlib.reload
# is the supported replacement with the same call signature.
from importlib import reload

# Data manipulation
import pandas as pd
import numpy as np
import datetime as dt

# Geolocation
import geonamescache

# Custom package for data preprocessing
import preprocessing as pp

# Notebook display settings: allow up to 150 columns and 10000 rows to be
# rendered, and format floats with two decimal places.
pd.options.display.max_columns = 150
pd.options.display.max_rows = 10000
pd.options.display.float_format = "{:.2f}".format

# Echo the value of every top-level expression in a cell, not only the last
# one (IPython's default for ast_node_interactivity is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
# Project-local module (not shown here); the RFM cell below reads
# `filepaths.processed_produce_customer_segments` from it.
import filepaths

### RFM Analysis

In [3]:
%%time
# Build the RFM frame used by every later cell. `pp.rfm_analysis` is project
# code not visible in this notebook; the argument is presumably a path to the
# processed customer-segment data (TODO confirm). Later cells rely on the
# result exposing columns such as frequency, monetary, CustomerID and
# customer_segment.
# NOTE(review): the recorded wall time is ~30 s; consider caching the result
# if this cell is re-run often.
df_rfm = pp.rfm_analysis(filepaths.processed_produce_customer_segments)

Wall time: 31.5 s


### Metrics

In [7]:
# Number of non-null CustomerID values per purchase frequency. The count is
# returned in a column that keeps the name 'CustomerID'.
purchase_frequency_distribution = (
    df_rfm
    .groupby('frequency', as_index=False)['CustomerID']
    .count()
)
purchase_frequency_distribution

Unnamed: 0,frequency,CustomerID
0,1,8396
1,2,15688
2,3,20208
3,4,19323
4,5,14782
5,6,9282
6,7,4953
7,8,2364
8,9,1000
9,10,407


In [8]:
# Total and mean of `monetary` per purchase frequency. The list-valued
# aggregation yields two-level columns: (monetary, sum) and (monetary, mean).
purchase_amount_per_frequency_distribution = (
    df_rfm
    .groupby(['frequency'], as_index=False)
    .agg({'monetary': ['sum', 'mean']})
)
purchase_amount_per_frequency_distribution

Unnamed: 0_level_0,frequency,monetary,monetary
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
0,1,1852318.7,220.62
1,2,6965102.93,443.98
2,3,13374649.24,661.85
3,4,17171511.37,888.66
4,5,16338547.78,1105.3
5,6,12333300.44,1328.73
6,7,7642159.87,1542.94
7,8,4101236.72,1734.87
8,9,2014311.53,2014.31
9,10,874361.48,2148.31


### Descriptive Statistics

In [21]:
# Relative frequency of each customer_segment value across df_rfm rows;
# missing values are kept as their own category.
(
    df_rfm['customer_segment']
    .value_counts(normalize=True, dropna=False)
)

other                  0.59
big spender            0.18
lost cheap customers   0.08
loyal customers        0.08
best customers         0.05
almost lost            0.02
lost customers         0.01
Name: customer_segment, dtype: float64

In [18]:
# Relative frequency of each Gender value across df_rfm rows; missing values
# are kept as their own category.
(
    df_rfm['Gender']
    .value_counts(normalize=True, dropna=False)
)

F   0.51
M   0.49
Name: Gender, dtype: float64

In [11]:
# Number of distinct non-null city names in df_rfm.
df_rfm['CustomerCityName'].nunique(dropna=True)

96

In [12]:
# Number of distinct non-null state values in df_rfm.
df_rfm['CustomerState'].nunique(dropna=True)

37

In [15]:
# Relative frequency of each CustomerRegion value across df_rfm rows; missing
# values are kept as their own category.
(
    df_rfm['CustomerRegion']
    .value_counts(normalize=True, dropna=False)
)

South       0.45
West        0.25
Midwest     0.22
Northeast   0.08
Name: CustomerRegion, dtype: float64

In [16]:
# Relative frequency of each CustomerDivision value across df_rfm rows;
# missing values are kept as their own category.
(
    df_rfm['CustomerDivision']
    .value_counts(normalize=True, dropna=False)
)

West South Central   0.20
Pacific              0.17
South Atlantic       0.15
East North Central   0.11
East South Central   0.10
West North Central   0.10
Mountain             0.08
Middle Atlantic      0.07
New England          0.01
Name: CustomerDivision, dtype: float64