In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [4]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
online_retail = fetch_ucirepo(id=352) 
  
# data (as pandas dataframes) 
X = online_retail.data.features 
y = online_retail.data.targets 
  
# metadata 
print(online_retail.metadata) 
  
# variable information 
print(online_retail.variables) 

{'uci_id': 352, 'name': 'Online Retail', 'repository_url': 'https://archive.ics.uci.edu/dataset/352/online+retail', 'data_url': 'https://archive.ics.uci.edu/static/public/352/data.csv', 'abstract': 'This is a transactional data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate', 'Sequential', 'Time-Series'], 'num_instances': 541909, 'num_features': 6, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': None, 'index_col': ['InvoiceNo', 'StockCode'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2015, 'last_updated': 'Mon Oct 21 2024', 'dataset_doi': '10.24432/C5BW33', 'creators': ['Daqing Chen'], 'intro_paper': {'ID': 361, 'type': 'NATIVE', 'title': 'Data mining for the online retail industry: A case study of RFM model-based customer segmenta

In [6]:
df = pd.read_csv("포폴/Online Retail.csv")

df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 8:26,3.39,17850.0,United Kingdom


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [8]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


In [12]:
df = df[df['CustomerID'].notna()].copy()

In [13]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['CustomerID'] = df['CustomerID'].astype(str)

In [14]:
df = df[~df['InvoiceNo'].str.startswith('C')]

In [15]:
reference_date = df['InvoiceDate'].max()
reference_date

Timestamp('2011-12-09 12:50:00')

In [16]:
rfm = df.groupby('CustomerID').agg(
    last_purchase_date=('InvoiceDate', 'max'),
    frequency=('InvoiceNo', 'nunique'),
    monetary=('UnitPrice', lambda x: (x * df.loc[x.index, 'Quantity']).sum())
).reset_index()

In [17]:
rfm['recency'] = (reference_date - rfm['last_purchase_date']).dt.days

In [18]:
rfm['churn'] = (rfm['recency'] >= 60).astype(int)

In [19]:
final_df = rfm[['CustomerID', 'recency', 'frequency', 'monetary', 'churn']]
final_df.head()

Unnamed: 0,CustomerID,recency,frequency,monetary,churn
0,12346.0,325,1,77183.6,1
1,12347.0,1,7,4310.0,0
2,12348.0,74,4,1797.24,1
3,12349.0,18,1,1757.55,0
4,12350.0,309,1,334.4,1


In [20]:
final_df['churn'].value_counts(normalize=True)

final_df.groupby('churn')[['recency', 'frequency', 'monetary']].mean()

Unnamed: 0_level_0,recency,frequency,monetary
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22.154777,6.074259,3068.506625
1,177.133368,2.047374,801.33755


In [21]:
final_df['churn'].value_counts(normalize=True)

final_df.groupby('churn')[['recency', 'frequency', 'monetary']].mean()

Unnamed: 0_level_0,recency,frequency,monetary
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22.154777,6.074259,3068.506625
1,177.133368,2.047374,801.33755


In [22]:
X = final_df[['recency', 'frequency', 'monetary']]
y = final_df['churn']

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

y_pred_log = log_model.predict(X_test)
y_prob_log = log_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_log))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_log))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       388

    accuracy                           1.00       868
   macro avg       1.00      1.00      1.00       868
weighted avg       1.00      1.00      1.00       868

ROC-AUC: 1.0


In [25]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=50,
    random_state=42
)
tree_model.fit(X_train, y_train)

y_pred_tree = tree_model.predict(X_test)
y_prob_tree = tree_model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_tree))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_tree))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       388

    accuracy                           1.00       868
   macro avg       1.00      1.00      1.00       868
weighted avg       1.00      1.00      1.00       868

ROC-AUC: 1.0


In [26]:
coef_df = pd.DataFrame({
    'feature': X.columns,
    'coefficient': log_model.coef_[0]
}).sort_values(by='coefficient', ascending=False)

coef_df

Unnamed: 0,feature,coefficient
0,recency,3.34745
2,monetary,-0.000255
1,frequency,-0.073626


1. recency (+) -> 오래 안 산 고객일수록 이탈
2. frequency (-) -> 자주 산 고객일수록 유지
3. monetary (-) -> 돈 많이 쓴 고객일수록 유지

In [28]:
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': tree_model.feature_importances_
}).sort_values(by='importance', ascending=False)

importance_df

Unnamed: 0,feature,importance
0,recency,1.0
1,frequency,0.0
2,monetary,0.0


In [29]:
final_df['churn_prob'] = log_model.predict_proba(X)[:, 1]

In [30]:
high_risk = final_df[
    (final_df['churn_prob'] > 0.7) &
    (final_df['monetary'] > final_df['monetary'].median())
]

high_risk.head()

Unnamed: 0,CustomerID,recency,frequency,monetary,churn,churn_prob
0,12346.0,325,1,77183.6,1,1.0
2,12348.0,74,4,1797.24,1,1.0
7,12354.0,231,1,1079.4,1,1.0
22,12372.0,71,3,1298.04,1,1.0
26,12377.0,314,2,1628.12,1,1.0
