In [None]:
# Mount Google Drive at /content/drive in the Colab runtime; the notebook
# reads its input CSV from a path beneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data from the mounted Drive folder.
data = pd.read_csv(
    '/content/drive/MyDrive/online_retail_preprocessed.csv',
    encoding='unicode_escape',
)

# Parse the invoice timestamps, then bucket every row into its calendar month.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data['InvoiceMonth'] = data['InvoiceDate'].dt.to_period('M')

# A customer's cohort is the earliest invoice month seen for that CustomerID.
data['CohortMonth'] = data.groupby('CustomerID')['InvoiceMonth'].transform('min')

# Cohort index: number of months between the invoice month and the cohort
# month, offset by one so that the cohort's own month has index 1.
invoice_period = data['InvoiceMonth'].dt
cohort_period = data['CohortMonth'].dt
invoice_year, invoice_month = invoice_period.year, invoice_period.month
cohort_year, cohort_month = cohort_period.year, cohort_period.month
year_gap = invoice_year - cohort_year
month_gap = invoice_month - cohort_month
data['CohortIndex'] = year_gap * 12 + month_gap + 1

# Count distinct customers for every (cohort month, cohort index) pair.
cohort_data = (
    data.groupby(['CohortMonth', 'CohortIndex'])['CustomerID']
    .nunique()
    .reset_index()
)

# Reshape to a matrix: one row per cohort month, one column per cohort index.
cohort_pivot = cohort_data.pivot(
    index='CohortMonth',
    columns='CohortIndex',
    values='CustomerID',
)

# Retention rate: each row divided by the value in its first column
# (the cohort's customer count at the first cohort index).
cohort_size = cohort_pivot.iloc[:, 0]
cohort_retention = cohort_pivot.div(cohort_size, axis='index')

# Draw the retention heatmap with percentage annotations.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(cohort_retention, annot=True, fmt='.0%', cmap='YlGnBu', ax=ax)
ax.set_title('Cohort Analysis - Retention Rates', fontsize=16)
ax.set_xlabel('Cohort Index (Months)', fontsize=12)
ax.set_ylabel('Cohort Month', fontsize=12)
fig.tight_layout()

# Write the figure to disk before showing it.
fig.savefig('cohort_analysis_screenshot.png')
plt.show()