# In [None]:
# Импорт библиотек
import numpy as np
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import warnings

# Silence every warning category for the rest of the run so that library
# warnings do not clutter the console output of this script.
warnings.filterwarnings('ignore')

# Logging setup: INFO-level records go to segmentation.log.
logging.basicConfig(filename="segmentation.log", level=logging.INFO)
np.random.seed(42)

# 1. Data loading
# Only the columns used by the metric calculation are read from the file.
file_path = './DECENTRATHON_3.0.parquet'
required_columns = [
    'card_id',
    'transaction_timestamp',
    'transaction_amount_kzt',
    'merchant_mcc',
    'transaction_type',
    'acquirer_country_iso',
    'merchant_city',
]
try:
    df = pd.read_parquet(file_path, columns=required_columns)
except FileNotFoundError:
    # Report the missing file, then let the error stop the script.
    print(f"Файл {file_path} не найден.")
    raise
else:
    print(f"Данные загружены из {file_path}, размер: {df.shape}")

# Date range covered by the dataset
timestamp_column = df['transaction_timestamp']
min_date = timestamp_column.min()
max_date = timestamp_column.max()
print(f"Минимальная дата: {min_date}")
print(f"Максимальная дата: {max_date}")

# The latest timestamp in the dataset serves as "now" for every
# recency / window calculation below.
current_datetime = max_date
print(f"Базовая дата для расчета: {current_datetime}")

# 2. Metric calculation
# The window boundaries are the same for every card, so they are computed once
# here instead of on every loop iteration.
cutoff_90 = current_datetime - timedelta(days=90)
cutoff_180 = current_datetime - timedelta(days=180)

# Column order of metrics.csv. Passing it to the DataFrame constructor also
# guarantees the full schema when the input contains no cards at all.
METRIC_COLUMNS = [
    'card_id', 'recency_days', 'sum_0_90', 'sum_90_180',
    'avg_check_0_90', 'avg_check_90_180',
    'mcc_share_travel', 'mcc_share_gourmet', 'mcc_share_fashion',
    'mcc_share_home', 'mcc_share_tech', 'mcc_share_gaming',
    'p2p_share', 'acquirer_country_iso_share', 'merchant_city_unique',
]


def _amount_share(amounts, mask, total_amount):
    """Return the fraction of ``total_amount`` contributed by the rows in ``mask``.

    Returns 0 when ``total_amount`` is not positive, so a card without net
    spend never triggers a division by zero.
    """
    if total_amount > 0:
        return amounts[mask].sum() / total_amount
    return 0


def _window_stats(amounts):
    """Return ``(sum, average)`` of ``amounts``; the average is 0 for an empty window."""
    window_sum = amounts.sum()
    window_count = len(amounts)
    return window_sum, (window_sum / window_count if window_count > 0 else 0)


metrics = []
for card_id, group in df.groupby('card_id'):
    timestamps = group['transaction_timestamp']
    amounts = group['transaction_amount_kzt']
    mcc = group['merchant_mcc']
    tx_count = len(group)

    last_tx = timestamps.max()
    # 365 is the fallback recency for a card whose timestamps are all missing.
    recency_days = (current_datetime - last_tx).days if pd.notnull(last_tx) else 365

    # Most recent 90 days, then the 90 days before that (90-180 days back).
    sum_0_90, avg_check_0_90 = _window_stats(amounts[timestamps >= cutoff_90])
    sum_90_180, avg_check_90_180 = _window_stats(
        amounts[(timestamps >= cutoff_180) & (timestamps < cutoff_90)]
    )

    total_amount = amounts.sum()

    # Share of the card's total spend per MCC category.
    mcc_share_travel = _amount_share(amounts, mcc.between(3000, 3299), total_amount)
    mcc_share_gourmet = _amount_share(amounts, mcc.isin([5812, 5814]), total_amount)
    mcc_share_fashion = _amount_share(
        amounts,
        mcc.isin([5611, 5621, 5631, 5641, 5651, 5661, 5691, 5699]),
        total_amount,
    )
    mcc_share_home = _amount_share(amounts, mcc.isin([5200, 5211, 5231, 5251]), total_amount)
    mcc_share_tech = _amount_share(amounts, mcc.between(5732, 5734), total_amount)
    mcc_share_gaming = _amount_share(amounts, mcc == 7994, total_amount)

    # Shares by transaction count (not by amount).
    p2p_count = len(group[group['transaction_type'] == 'P2P'])
    p2p_share = p2p_count / tx_count if tx_count > 0 else 0
    kaz_count = len(group[group['acquirer_country_iso'] == 'KAZ'])
    acquirer_country_iso_share = kaz_count / tx_count if tx_count > 0 else 0
    merchant_city_unique = group['merchant_city'].nunique()

    metrics.append({
        'card_id': card_id,
        'recency_days': recency_days,
        'sum_0_90': sum_0_90,
        'sum_90_180': sum_90_180,
        'avg_check_0_90': avg_check_0_90,
        'avg_check_90_180': avg_check_90_180,
        'mcc_share_travel': mcc_share_travel,
        'mcc_share_gourmet': mcc_share_gourmet,
        'mcc_share_fashion': mcc_share_fashion,
        'mcc_share_home': mcc_share_home,
        'mcc_share_tech': mcc_share_tech,
        'mcc_share_gaming': mcc_share_gaming,
        'p2p_share': p2p_share,
        'acquirer_country_iso_share': acquirer_country_iso_share,
        'merchant_city_unique': merchant_city_unique,
    })

metrics_df = pd.DataFrame(metrics, columns=METRIC_COLUMNS)
metrics_df.to_csv('metrics.csv', index=False)
print("Метрики сохранены в metrics.csv")

# 3. Attribute assignment
def assign_attributes(row):
    """Derive the geo, behavior and category labels for one card.

    Args:
        row: Mapping-like object (e.g. a row of the metrics table) providing
            ``acquirer_country_iso_share``, ``merchant_city_unique``,
            ``p2p_share``, ``recency_days``, ``sum_0_90``, ``sum_90_180``
            and the six ``mcc_share_*`` values.

    Returns:
        A ``(geo, behavior, category)`` tuple of label strings.
    """
    # Geo: the first matching rule wins; the city count is only consulted
    # when the domestic share is neither >= 0.9 nor < 0.5.
    domestic_share = row['acquirer_country_iso_share']
    geo = "Неопределённый"
    if domestic_share >= 0.9:
        geo = "Национальный"
    elif domestic_share < 0.5:
        geo = "Интернациональный"
    elif row['merchant_city_unique'] <= 2:
        geo = "Местный"

    # Behavior: the risky check takes precedence over the "star" check.
    behavior = "Стандартный"
    if row['mcc_share_gaming'] > 0.1 or row['p2p_share'] > 0.3:
        behavior = "Рискованный"
    elif (row['recency_days'] <= 30
          and row['sum_0_90'] > row['sum_90_180']
          and row['sum_0_90'] > 0):
        behavior = "Звездный"

    # Category: the largest MCC share decides, provided it reaches 25%.
    # On a tie the entry listed first is kept.
    category_shares = [
        ('Любитель путешествий', row['mcc_share_travel']),
        ('Гастроном', row['mcc_share_gourmet']),
        ('Модник', row['mcc_share_fashion']),
        ('Домосед', row['mcc_share_home']),
        ('Техногик', row['mcc_share_tech']),
        ('Геймер', row['mcc_share_gaming']),
    ]
    top_label, top_share = max(category_shares, key=lambda pair: pair[1])
    if top_share >= 0.25:
        category = top_label
    else:
        category = "Стандартный"

    return geo, behavior, category

# Label every card. Each row yields one (geo, behavior, category) tuple;
# zip(*...) transposes those tuples into three per-column sequences.
attribute_rows = metrics_df.apply(assign_attributes, axis=1)
geo_labels, behavior_labels, category_labels = zip(*attribute_rows)
metrics_df['geo_attribute'] = geo_labels
metrics_df['behavior_attribute'] = behavior_labels
metrics_df['category_attribute'] = category_labels

# 4. Segment assignment
def assign_segment(row):
    """Classify a card's spending trend from its two 90-day spend totals.

    Args:
        row: Mapping-like object providing ``sum_0_90`` (spend in the most
            recent 90 days) and ``sum_90_180`` (spend in the 90 days before
            that).

    Returns:
        One of "Остановившийся", "Понижающийся", "Повышающийся",
        "Стабильный" or "Неопределённый".
    """
    recent = row['sum_0_90']
    previous = row['sum_90_180']

    # No spend in either window.
    if recent == 0 and previous == 0:
        return "Остановившийся"
    # Spend dropped to zero.
    if recent == 0 and previous > 0:
        return "Понижающийся"
    # More than 20% growth (any positive spend after an empty window counts).
    if recent > previous * 1.2 and recent > 0:
        return "Повышающийся"
    if previous > 0:
        # Within +/-20% of the previous window.
        if abs(recent - previous) / previous <= 0.2:
            return "Стабильный"
        # Not stable and below the previous window means a drop of more than
        # 20%. Previously such cards matched no rule and were reported as
        # "Неопределённый", unlike the mirror case of >20% growth.
        if recent < previous:
            return "Понижающийся"
    # Anything left (e.g. negative net totals) cannot be classified.
    return "Неопределённый"

metrics_df['segment_attribute'] = metrics_df.apply(assign_segment, axis=1)

# 5. Percentage distribution
# For every attribute: the percentage of cards carrying each label.
attribute_stats = {
    attr_name: metrics_df[attr_name + '_attribute'].value_counts(normalize=True) * 100
    for attr_name in ('geo', 'behavior', 'category', 'segment')
}

# One log record per attribute, e.g. "Geo: {...}".
for attr_name, distribution in attribute_stats.items():
    logging.info(attr_name.capitalize() + ": %s", distribution.to_dict())

# Flatten the four distributions into one two-column table; each label is
# prefixed with its attribute name so that labels stay unambiguous.
stat_labels = []
stat_percents = []
for attr_name, distribution in attribute_stats.items():
    stat_labels += [attr_name + '_' + label for label in distribution.index]
    stat_percents += list(distribution.values)

attribute_stats_df = pd.DataFrame({
    'Атрибут': stat_labels,
    'Процент': stat_percents,
})
attribute_stats_df.to_csv('attribute_stats.csv', index=False)
print("Процентное распределение сохранено в attribute_stats.csv")
print(attribute_stats_df)

# 6. Plots
def plot_and_save_distribution(data, column, title, filename):
    """Draw a count plot of ``column`` and save it as an image file.

    Args:
        data: Table holding the column to plot.
        column: Name of the column whose value counts are plotted.
        title: Title shown above the plot.
        filename: Path the figure is written to.
    """
    plt.figure(figsize=(10, 6))

    # Bars follow value_counts() order, i.e. the most frequent label first.
    bar_order = data[column].value_counts().index
    sns.countplot(data=data, x=column, order=bar_order)

    # The x-axis label is the column name without its "_attribute" suffix.
    axis_label = column.replace('_attribute', '').capitalize()
    plt.title(title)
    plt.xlabel(axis_label)
    plt.ylabel('Количество клиентов')
    plt.xticks(rotation=45)
    plt.tight_layout()

    plt.savefig(filename)
    # Close the current figure once it has been written.
    plt.close()
    print(f"График сохранен как {filename}")

# One chart per attribute: (column, chart title, output file).
distribution_plots = (
    ('geo_attribute', 'Распределение Geo', 'geo_distribution.png'),
    ('behavior_attribute', 'Распределение Behavior', 'behavior_distribution.png'),
    ('category_attribute', 'Распределение Category', 'category_distribution.png'),
    ('segment_attribute', 'Распределение Segments', 'segment_distribution.png'),
)
for plot_column, plot_title, plot_file in distribution_plots:
    plot_and_save_distribution(metrics_df, plot_column, plot_title, plot_file)

# 7. Saving results
# Only the card id and its four labels are exported.
output_columns = [
    'card_id',
    'geo_attribute',
    'behavior_attribute',
    'category_attribute',
    'segment_attribute',
]
result_df = metrics_df[output_columns]
result_df.to_parquet("segmented.parquet", index=False)
print("Результаты сохранены в segmented.parquet")
print("\nПервые 5 строк результатов:")
print(result_df.head())