In [3]:
# Case Study - Feature Engineering & Analysis
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Load the raw transaction data
df = pd.read_csv('../data/sample_data.csv')

# Data cleaning
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Keep only rows with a positive quantity, a positive price and a known
# customer. Comparisons against NaN evaluate to False, so rows that are
# missing Quantity or Price are dropped as well.
valid_rows = (
    (df['Quantity'] > 0)
    & (df['Price'] > 0)
    & df['Customer ID'].notna()
)
# The explicit .copy() makes the filtered frame independent of the raw one,
# so the column assignments below never write into a slice of another frame
# (the situation SettingWithCopyWarning reports, which the blanket
# warnings filter at the top of this file would otherwise hide).
df = df.loc[valid_rows].copy()

# Feature Engineering
print("🔧 FEATURE ENGİNEERİNG")
print("="*50)

# 1. Basic features
df['TotalAmount'] = df['Quantity'] * df['Price']  # line-item revenue
df['Year'] = df['InvoiceDate'].dt.year
df['Month'] = df['InvoiceDate'].dt.month
df['DayOfWeek'] = df['InvoiceDate'].dt.dayofweek  # Monday=0 ... Sunday=6
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # 1 for Sat/Sun

# 2. Sezon bilgisi
def get_season(month):
    """Return the season name for a month number.

    Months 12, 1 and 2 give 'Winter'; 3-5 give 'Spring'; 6-8 give
    'Summer'. Any other value (including 9-11) gives 'Autumn'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Everything not listed above falls through to autumn.
    return 'Autumn'

# Tag every transaction with the season of its invoice month
df['Season'] = df['Month'].apply(get_season)

# 3. Customer-level RFM analysis
print("📊 RFM Analizi hesaplanıyor...")

# Reference date: one day after the latest invoice in the data, so the
# customer with the most recent purchase gets a recency of 1 day.
reference_date = df['InvoiceDate'].max() + timedelta(days=1)

# One row per customer:
#   Recency   - whole days between the customer's last invoice and reference_date
#   Frequency - number of distinct invoices
#   Monetary  - total amount spent
customer_rfm = (
    df.groupby('Customer ID')
    .agg(
        Recency=('InvoiceDate', lambda dates: (reference_date - dates.max()).days),
        Frequency=('Invoice', 'nunique'),
        Monetary=('TotalAmount', 'sum'),
    )
    .reset_index()
)

# 4. Additional per-customer features
#
# NOTE: Order_Count counts line items (rows with a TotalAmount), not distinct
# invoices, and Avg_Order_Value is the mean line-item amount. The
# distinct-invoice count is the Frequency column of customer_rfm.
customer_features = (
    df.groupby('Customer ID')
    .agg(
        Total_Spent=('TotalAmount', 'sum'),
        Avg_Order_Value=('TotalAmount', 'mean'),
        Order_Count=('TotalAmount', 'count'),
        Total_Quantity=('Quantity', 'sum'),
        Avg_Quantity=('Quantity', 'mean'),
        Unique_Products=('StockCode', 'nunique'),
        # Most frequent country for the customer. mode() ignores NaN, so it
        # is empty when every Country value is missing; return NaN in that
        # case instead of failing on .iloc[0].
        Country=(
            'Country',
            lambda c: c.mode().iloc[0] if c.notna().any() else np.nan,
        ),
        First_Purchase=('InvoiceDate', 'min'),
        Last_Purchase=('InvoiceDate', 'max'),
    )
    .reset_index()
)

# 5. Customer lifetime values
# Whole days between the customer's first and last purchase.
customer_features['Customer_Lifespan_Days'] = (
    customer_features['Last_Purchase'] - customer_features['First_Purchase']
).dt.days

# Line items per day of activity; the +1 avoids division by zero for
# customers whose purchases all fall on a single day.
customer_features['Purchase_Frequency'] = (
    customer_features['Order_Count'] / (customer_features['Customer_Lifespan_Days'] + 1)
)

# Combine with the RFM table
customer_final = customer_rfm.merge(customer_features, on='Customer ID')

print(f"✅ {len(customer_final)} müşteri için özellikler oluşturuldu")
print("\nMüşteri özellikleri:")
print(customer_final.describe())

# 6. Product-level features
print("\n🛍️ Ürün özellikleri hesaplanıyor...")

# One row per StockCode. Order_Count here is the number of line items
# (rows with a Quantity) in which the product appears.
product_features = (
    df.groupby('StockCode')
    .agg(
        Total_Sold=('Quantity', 'sum'),
        Avg_Quantity=('Quantity', 'mean'),
        Order_Count=('Quantity', 'count'),
        Avg_Price=('Price', 'mean'),
        Price_Std=('Price', 'std'),
        Total_Revenue=('TotalAmount', 'sum'),
        Unique_Customers=('Customer ID', 'nunique'),
        First_Sale=('InvoiceDate', 'min'),
        Last_Sale=('InvoiceDate', 'max'),
    )
    .reset_index()
)

# Whole days between the first and the last sale of each product
product_features['Product_Lifespan_Days'] = (
    product_features['Last_Sale'].sub(product_features['First_Sale']).dt.days
)

# Price segment: at or below the first quartile of average price is 'Low',
# above the third quartile is 'High', everything in between is 'Medium'.
price_quartiles = product_features['Avg_Price'].quantile([0.25, 0.75])
segment_edges = [
    0,
    price_quartiles.loc[0.25],
    price_quartiles.loc[0.75],
    float('inf'),
]
product_features['Price_Segment'] = pd.cut(
    product_features['Avg_Price'],
    bins=segment_edges,
    labels=['Low', 'Medium', 'High'],
)

print(f"✅ {len(product_features)} ürün için özellikler oluşturuldu")

# Örnek RFM skorları
def _quintile_scores(values, labels):
    """Bin *values* into five quantile groups and label them with *labels*.

    The raw values are binned first. If that fails because ties make the
    quantile edges non-unique (pd.qcut raises ValueError), the values are
    ranked with ``method='first'`` and the ranks are binned instead, which
    always yields distinct edges for tied data.
    """
    try:
        return pd.qcut(values, 5, labels=labels)
    except ValueError:
        return pd.qcut(values.rank(method='first'), 5, labels=labels)


def create_rfm_scores(df):
    """Add quintile-based R, F and M scores plus a combined RFM code.

    Expects the columns 'Recency', 'Frequency' and 'Monetary'. Adds:

    * 'R_Score' - 5 for the lowest recency quintile down to 1 for the highest
    * 'F_Score' - 1 for the lowest frequency quintile up to 5 for the highest
    * 'M_Score' - 1 for the lowest monetary quintile up to 5 for the highest
    * 'RFM_Score' - the three scores concatenated as a string, e.g. '511'

    The frame is modified in place and also returned.
    """
    df['R_Score'] = _quintile_scores(df['Recency'], [5, 4, 3, 2, 1])
    # Frequency is always binned on ranks, since it is dominated by ties.
    df['F_Score'] = pd.qcut(
        df['Frequency'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5]
    )
    df['M_Score'] = _quintile_scores(df['Monetary'], [1, 2, 3, 4, 5])

    df['RFM_Score'] = (
        df['R_Score'].astype(str)
        + df['F_Score'].astype(str)
        + df['M_Score'].astype(str)
    )
    return df

customer_final = create_rfm_scores(customer_final)

print("\n🎯 Örnek müşteri segmentleri:")
print(customer_final['RFM_Score'].value_counts().head(10))

# Save the outputs. Create the results directory first, because to_csv
# does not create missing parent directories.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

customer_final.to_csv(results_dir / 'customer_features.csv', index=False)
product_features.to_csv(results_dir / 'product_features.csv', index=False)
df.to_csv(results_dir / 'processed_data.csv', index=False)

print("\n✅ Tüm özellikler oluşturuldu ve kaydedildi!")
print("📁 Sonuçlar 'results/' klasöründe")

🔧 FEATURE ENGİNEERİNG
📊 RFM Analizi hesaplanıyor...
✅ 1023 müşteri için özellikler oluşturuldu

Müşteri özellikleri:
        Customer ID      Recency    Frequency      Monetary   Total_Spent  \
count   1023.000000  1023.000000  1023.000000   1023.000000   1023.000000   
mean   15408.918866    27.022483     1.668622    836.149081    836.149081   
min    12346.000000     1.000000     1.000000      2.950000      2.950000   
25%    13997.000000    24.000000     1.000000    208.865000    208.865000   
50%    15409.000000    29.000000     1.000000    351.300000    351.300000   
75%    16879.000000    35.500000     2.000000    705.575000    705.575000   
max    18286.000000    41.000000    17.000000  55102.820000  55102.820000   
std     1673.299730    11.680955     1.608035   2841.191694   2841.191694   

       Avg_Order_Value  Order_Count  Total_Quantity  Avg_Quantity  \
count      1023.000000  1023.000000     1023.000000   1023.000000   
mean         39.119434    34.176931      485.410557