# In [1]:
import os
import re

import numpy as np
import pandas as pd

# =========================
# [Step 1] Load the data
# =========================

# Raw tables exactly as read from disk.
customer = pd.read_csv('../../data/customer_hm.csv')
transactions = pd.read_csv('../../data/transactions_hm.csv')
articles = pd.read_csv('../../data/articles_hm.csv')

# All cleaning below works on copies, so the raw frames stay untouched.
df_cust, df_tran, df_art = (
    raw.copy() for raw in (customer, transactions, articles)
)

# =========================
# [Customer] preprocessing
# =========================

# Integer codes that replace the string categories below.
status_map = {'ACTIVE': 2, 'PRE-CREATE': 1, 'LEFT CLUB': 0}
frequency_map = {'Regularly': 2, 'Monthly': 1, 'NONE': 0}

# 1-2. Encode club_member_status, and fill missing fashion_news_frequency
#      with 'Regularly' before encoding it the same way.
# NOTE(review): a value that is not a key of its map becomes NaN after
# .map() - confirm the raw columns only contain the keys listed above.
df_cust['club_member_status'] = df_cust['club_member_status'].map(status_map)
df_cust['fashion_news_frequency'] = (
    df_cust['fashion_news_frequency']
    .fillna('Regularly')
    .map(frequency_map)
)
# 3. Derived variable: age bracket
def cate_age(age):
    """Return the decade label for ``age``, or NaN when the age is missing.

    Args:
        age: Customer age as a number. May be NaN/None when no age is
            recorded.

    Returns:
        '10대' for ages below 20 (including ages under 10), '20대' ... '50대'
        for the following decades, '60대 이상' for 60 and above, or ``np.nan``
        when ``age`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through to the last bracket and be labelled '60대 이상'.
    if pd.isna(age):
        return np.nan

    # (exclusive upper bound, label), checked in ascending order.
    brackets = (
        (20, '10대'),
        (30, '20대'),
        (40, '30대'),
        (50, '40대'),
        (60, '50대'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '60대 이상'

# Label every customer with an age bracket (element-wise over the age column).
df_cust['age_segment'] = df_cust['age'].map(cate_age)

# =========================
# [Transactions] preprocessing
# =========================

# 1. Align key dtypes for the joins: customer_id as strings, article_id as
#    strings left-padded with zeros to 10 characters.
for _frame in (df_tran, df_cust):
    _frame['customer_id'] = _frame['customer_id'].astype(str)

for _frame in (df_tran, df_art):
    _frame['article_id'] = _frame['article_id'].astype(str).str.zfill(10)

# 2. Duplicate removal - currently disabled (the call below is commented out).
# df_tran.drop_duplicates(inplace=True)

# 3. Parse the transaction date and derive a monthly period column.
df_tran['t_dat'] = pd.to_datetime(df_tran['t_dat'], format='%Y-%m-%d')
df_tran['year_month'] = df_tran['t_dat'].dt.to_period('M')

# 4. Korean label for the sales channel (1 -> offline, 2 -> online);
#    any other id becomes NaN.
df_tran['channel'] = df_tran['sales_channel_id'].map({1: '오프라인', 2: '온라인'})

# =========================
# [Article] preprocessing
# =========================

# 1. Normalise article_id to a zero-padded 10-character string.
df_art['article_id'] = df_art['article_id'].astype(str).str.zfill(10)

# 2. Replace missing descriptions with a placeholder text.
df_art['detail_desc'] = df_art['detail_desc'].fillna('No Description')

# 3. Remove columns that are not needed; the result is a new frame and
#    df_art itself keeps all of its columns.
cols_to_drop = [
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]
df_art_cleaned = df_art.drop(labels=cols_to_drop, axis='columns')

# 4. Derived variable: season
def get_season(row):
    """Classify an article row as 'SS', 'FW' or 'All-Season'.

    The section name and product name are joined, lower-cased and scanned for
    keywords. Matching is by plain substring, so a keyword also matches inside
    a longer word (e.g. 'sun' matches 'sunday'). Spring/summer keywords take
    precedence over fall/winter ones.

    Args:
        row: Mapping-like row providing 'section_name', 'prod_name' and
            'product_group_name'.

    Returns:
        'SS', 'FW' or 'All-Season'.
    """
    searchable = " ".join(
        (str(row['section_name']), str(row['prod_name']))
    ).lower()
    product_group = str(row['product_group_name']).lower()

    warm_terms = (
        'swimwear', 'sport', 'shorts', 'sandals', 'sleeveless',
        'tank', 'mini', 'beach', 'summer', 'sun', 'flip flop',
        'skirt', 'dress',
    )
    cold_terms = (
        'outerwear', 'knitted', 'jacket', 'coat', 'hoodie',
        'sweatshirt', 'scarf', 'gloves', 'boots', 'heavy',
        'fur', 'wool', 'cardigan', 'winter',
    )

    # Warm-season keywords, or the swimwear product group, win first.
    if any(term in searchable for term in warm_terms):
        return 'SS'
    if product_group == 'swimwear':
        return 'SS'

    if any(term in searchable for term in cold_terms):
        return 'FW'
    return 'All-Season'

# Season label for every article row (the function is called once per row).
df_art_cleaned['product_season'] = df_art_cleaned.apply(
    func=get_season, axis='columns'
)

# 5. Main category: taken directly from index_group_name.
df_art_cleaned['category_main'] = df_art_cleaned['index_group_name']

# 6. Low-involvement / high-involvement strategy
def get_involvement_strategy(row):
    """Assign an involvement-strategy label to an article row.

    The lower-cased garment group and section name are scanned for keywords
    by substring. The low-involvement rule is checked first, so it wins when
    both rules would match.

    Args:
        row: Mapping-like row providing 'garment_group_name' and
            'section_name'.

    Returns:
        'Low_Involvement_Basic', 'High_Involvement_Strategic' or
        'General_Fashion'.
    """
    garment_group = str(row['garment_group_name']).lower()
    section_name = str(row['section_name']).lower()

    low_garment_terms = ('basic', 'underwear', 'socks', 'jersey')
    if 'basic' in section_name or any(
        term in garment_group for term in low_garment_terms
    ):
        return 'Low_Involvement_Basic'

    high_garment_terms = ('knitwear', 'outerwear', 'dresses')
    high_section_terms = ('trend', 'special')
    if any(term in garment_group for term in high_garment_terms):
        return 'High_Involvement_Strategic'
    if any(term in section_name for term in high_section_terms):
        return 'High_Involvement_Strategic'

    return 'General_Fashion'

# Involvement-strategy label for every article row.
df_art_cleaned['product_strategy'] = df_art_cleaned.apply(
    func=get_involvement_strategy, axis='columns'
)

# 7. New-arrival flag
# 'new' is short enough to occur inside unrelated words ('newborn',
# 'renewable', 'news'), so it only counts when it stands as a whole word.
_NEW_WORD_RE = re.compile(r'\bnew\b')
# The longer keywords keep substring matching, so 'trendy' or 'collections'
# still count.
_NEWNESS_TERMS = ('collection', 'latest', 'trend', 'exclusive')


def check_newness(row):
    """Flag an article row as a new arrival based on its name and description.

    The product name and detail description are joined and lower-cased. The
    row is a new arrival when that text contains the standalone word 'new' or
    any of 'collection', 'latest', 'trend', 'exclusive' as a substring.

    Args:
        row: Mapping-like row providing 'prod_name' and 'detail_desc'.

    Returns:
        'New_Arrival' when a keyword is found, otherwise 'Regular_Carryover'.
    """
    text = (str(row['prod_name']) + " " + str(row['detail_desc'])).lower()

    if _NEW_WORD_RE.search(text) or any(term in text for term in _NEWNESS_TERMS):
        return 'New_Arrival'
    return 'Regular_Carryover'

# New-arrival flag for every article row.
df_art_cleaned['is_new'] = df_art_cleaned.apply(
    func=check_newness, axis='columns'
)

# 8. Colour tone classification
def get_color_tone(color):
    """Map a colour-group name to 'Dark_Tone', 'Light_Tone' or 'Neutral_Tone'.

    The name is lower-cased and scanned for the colour names below by
    substring. Dark names are checked before light ones, so a name containing
    both is reported as dark. Anything unmatched is neutral.

    Args:
        color: Colour-group name; it is converted with ``str()`` first.

    Returns:
        'Dark_Tone', 'Light_Tone' or 'Neutral_Tone'.
    """
    name = str(color).lower()

    dark_names = (
        'black', 'dark blue', 'dark grey', 'dark red',
        'navy blue', 'dark green', 'anthracite',
    )
    if any(dark in name for dark in dark_names):
        return 'Dark_Tone'

    light_names = (
        'white', 'light beige', 'off white',
        'light pink', 'light blue', 'yellowish brown',
    )
    if any(light in name for light in light_names):
        return 'Light_Tone'

    return 'Neutral_Tone'

# Colour tone for every article, derived from its colour-group name.
df_art_cleaned['color_tone'] = df_art_cleaned['colour_group_name'].map(get_color_tone)

# =========================
# Save the preprocessed tables
# =========================

OUTPUT_DIR = "../../data_fin"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (file name, frame) pairs, written in this order without the index column.
for _file_name, _table in (
    ("customers_clean.csv", df_cust),
    ("transactions_clean.csv", df_tran),
    ("articles_clean.csv", df_art_cleaned),
):
    _table.to_csv(f"{OUTPUT_DIR}/{_file_name}", index=False)
