In [None]:
# Mount Google Drive at /content/drive; the CSV inputs used below are read
# from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from collections import deque
import matplotlib.pyplot as plt
import os
import gc

# All input CSVs sit in the same Drive folder; only the file name differs.
_TDS_DIR = "/content/drive/MyDrive/TDS"
POSTBACKS_PATH = f"{_TDS_DIR}/Postbacks.csv"
CLICKS_PATH = f"{_TDS_DIR}/Clicks.csv"
SOURCES_PATH = f"{_TDS_DIR}/Source.csv"
OFFERS_PATH = f"{_TDS_DIR}/Offers.csv"

# Output files (relative names): per-click offer assignments, KPI snapshots
# and raised alerts.
OUTPUT_SELECTION_HISTORY = "selection_history.csv"
KPI_MONITORING_PATH = "kpi_monitoring.csv"
ALERTS_LOG_PATH = "alerts_log.csv"


print("[INFO] Завантаження даних...")

# Event tables are shuffled reproducibly (frac=1 keeps every row, fixed seed);
# the source and offer lookup tables are read as-is.
postbacks = pd.read_csv(POSTBACKS_PATH)
postbacks = postbacks.sample(frac=1, random_state=42)
clicks = pd.read_csv(CLICKS_PATH)
clicks = clicks.sample(frac=1, random_state=42)
sources = pd.read_csv(SOURCES_PATH)
offers = pd.read_csv(OFFERS_PATH)

print("[INFO] Rows: postbacks={}, clicks={}, sources={}, offers={}".format(
    len(postbacks), len(clicks), len(sources), len(offers)
))

# Parse event timestamps; unparseable values become NaT instead of raising.
# The pandas dtype helper is used because np.issubdtype cannot interpret
# pandas extension dtypes (e.g. tz-aware datetimes or string dtype).
for df, col in [(clicks, 'click_timestamp'), (postbacks, 'postback_timestamp')]:
    if col in df.columns and not pd.api.types.is_datetime64_any_dtype(df[col]):
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Keep only postbacks that can be attributed to a known click.
postbacks = postbacks[postbacks['click_id'].isin(clicks['click_id'])].copy()

# Later code refers to the column as 'os'; alias it when only 'os_type' exists.
if 'os_type' in clicks.columns and 'os' not in clicks.columns:
    clicks['os'] = clicks['os_type']

if 'user_id' in clicks.columns and 'click_number' in clicks.columns:
    clicks['unique_key'] = clicks['user_id'].astype(str) + '_' + clicks['click_number'].astype(str)
else:
    clicks['unique_key'] = clicks['click_id']

# Drop clicks that follow the same user's previous click by less than one
# second; each user's first click (NaN diff) is always kept.
if 'user_id' in clicks.columns and 'click_timestamp' in clicks.columns:
    clicks = clicks.sort_values(['user_id', 'click_timestamp'])
    clicks['time_diff'] = clicks.groupby('user_id')['click_timestamp'].diff().dt.total_seconds()
    clicks = clicks[clicks['time_diff'].isna() | (clicks['time_diff'] >= 1)].drop('time_diff', axis=1)

clicks = clicks.merge(sources, on='source_id', how='left')
clicks = clicks.merge(offers, on='offer_id', how='left')

# Per-offer denominators/numerators shared by the EPC and CR computations.
clicks_per_offer = clicks.groupby('offer_id').size()
postback_groups = postbacks.groupby('offer_id')

# EPC = earnings per click: an offer's total revenue divided by its click
# count. Offers without revenue or without clicks get EPC 0.
epc_df = (
    postback_groups['revenue'].sum()
    .div(clicks_per_offer)
    .fillna(0)
    .rename('EPC')
    .rename_axis('offer_id')
    .reset_index()
)
clicks = clicks.merge(epc_df, on='offer_id', how='left').fillna({'EPC': 0})

# The baseline is compared against a day's revenue / clicks (see
# check_anomalies and plot_kpi_history), so it uses the same definition
# over the whole data set.
baseline_epc = postbacks['revenue'].sum() / len(clicks) if len(clicks) else 0.0

# CR = conversions (postbacks) per click, per offer, as a fraction.
cr_series = postback_groups.size().div(clicks_per_offer)
cr_df = cr_series.fillna(0).rename('CR').rename_axis('offer_id').reset_index()
clicks = clicks.merge(cr_df, on='offer_id', how='left').fillna({'CR': 0})

# Categorical click attributes are replaced in place by integer codes; the
# fitted encoder for each column is kept in `encoders`, keyed by column name.
categorical_cols = ['browser', 'placement', 'device_type', 'geo', 'os', 'network', 'payout_type']
encoders = {col: LabelEncoder() for col in categorical_cols if col in clicks.columns}
for col, le in encoders.items():
    # Values are cast to str first, so missing values are encoded as a
    # regular category rather than rejected.
    clicks[col] = le.fit_transform(clicks[col].astype(str))


# Seed the remaining-cap columns from the configured caps when the offers
# table does not already provide them; a missing cap is filled with 1e12.
for remaining_col, cap_col in (
    ('remaining_daily_cap', 'daily_cap_amount'),
    ('remaining_total_cap', 'total_cap_amount'),
):
    if remaining_col not in offers.columns:
        offers[remaining_col] = offers[cap_col].fillna(1e12)


print("[INFO] Тренування RandomForest...")
# Only use the feature columns that actually exist after the merges above.
features = [c for c in ['browser', 'placement', 'device_type', 'geo', 'os', 'network', 'payout_type', 'EPC', 'CR'] if c in clicks.columns]

# A click is a positive example when it has a postback of its own (matched on
# click_id). Labelling by offer_id instead would mark every click of any
# offer that ever converted as positive, which the CR/EPC features predict
# trivially and which says nothing about the individual click.
clicks['target'] = clicks['click_id'].isin(postbacks['click_id']).astype(int)

X = clicks[features]
y = clicks['target']

model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X, y)
print(f"[INFO] Модель натренована. Позитивних прикладів: {y.sum()}")
gc.collect()

# Rolling KPI histories filled by monitor_kpi; each keeps at most the 50 most
# recently appended values.
epc_window, cr_window, ctr_window = (deque(maxlen=50) for _ in range(3))


def monitor_kpi(clicks_df, postbacks_df, current_date):
    """Compute one day's KPIs, append them to the KPI CSV and the rolling windows.

    Args:
        clicks_df: Clicks with a datetime ``click_timestamp`` column.
        postbacks_df: Postbacks with a datetime ``postback_timestamp`` column
            and a ``revenue`` column.
        current_date: ``datetime.date`` selecting the day to report on.

    Returns:
        dict with keys ``date``, ``clicks``, ``conversions``, ``impressions``,
        ``cr_percent``, ``ctr_percent``, ``epc`` and ``revenue``.
    """
    click_on_day = clicks_df['click_timestamp'].dt.date == current_date
    postback_on_day = postbacks_df['postback_timestamp'].dt.date == current_date
    day_postbacks = postbacks_df[postback_on_day]

    clicks_count = len(clicks_df[click_on_day])
    conversions = len(day_postbacks)
    # Impressions are not observed; they are modelled as 10 per click.
    impressions = clicks_count * 10
    revenue = day_postbacks['revenue'].sum()

    # Ratios fall back to 0 on a day without clicks to avoid dividing by zero.
    if clicks_count > 0:
        cr = conversions / clicks_count * 100
        epc = revenue / clicks_count
    else:
        cr = 0
        epc = 0
    if impressions > 0:
        ctr = clicks_count / impressions * 100
    else:
        ctr = 0

    kpi = dict(
        date=current_date,
        clicks=clicks_count,
        conversions=conversions,
        impressions=impressions,
        cr_percent=cr,
        ctr_percent=ctr,
        epc=epc,
        revenue=revenue,
    )

    # Append to the KPI file; the header is written only when the file is new.
    write_header = not os.path.exists(KPI_MONITORING_PATH)
    pd.DataFrame([kpi]).to_csv(KPI_MONITORING_PATH, mode='a', header=write_header, index=False)
    print(f"[KPI] Date: {current_date}, Clicks: {clicks_count}, Impressions: {impressions}, CR: {cr:.2f}%, CTR: {ctr:.2f}%, EPC: {epc:.4f}")

    for window, value in ((epc_window, epc), (cr_window, cr), (ctr_window, ctr)):
        window.append(value)

    return kpi

def check_anomalies(clicks_df, postbacks_df, baseline_epc, current_date):
    """Detect anomalies in one day's traffic, log them and print suggested actions.

    Four checks are run on the rows dated ``current_date``:

    * ``traffic_spike`` - an hour whose click count exceeds mean + 3 * std of
      the day's hourly counts (or 2 * mean when the std is not positive).
    * ``low_epc`` - revenue per click below 80% of ``baseline_epc``
      (only with more than 100 clicks).
    * ``high_cr`` - conversion rate above 50% (only with more than 100 clicks).
    * ``suspicious_ip`` - more than 100 clicks from one ``ip_address``
      (skipped when the column is absent).

    Args:
        clicks_df: Clicks with a datetime ``click_timestamp`` column.
        postbacks_df: Postbacks with a datetime ``postback_timestamp`` column
            and a ``revenue`` column.
        baseline_epc: Reference revenue-per-click value for the low-EPC check.
        current_date: ``datetime.date`` selecting the day to inspect.

    Returns:
        list of alert dicts with keys ``timestamp``, ``type`` and ``details``.
        When non-empty, the alerts are also appended to ``ALERTS_LOG_PATH``.
    """
    alerts = []

    def add_alert(alert_type, details):
        # Key order is kept stable because it defines the CSV column order.
        alerts.append({'timestamp': datetime.utcnow(), 'type': alert_type, 'details': details})

    daily_clicks = clicks_df[clicks_df['click_timestamp'].dt.date == current_date]
    daily_postbacks = postbacks_df[postbacks_df['postback_timestamp'].dt.date == current_date]

    clicks_per_hour = daily_clicks.groupby(daily_clicks['click_timestamp'].dt.hour).size()
    mean_clicks = clicks_per_hour.mean()
    std_clicks = clicks_per_hour.std()
    # std is NaN with fewer than two active hours; the comparison is then
    # False and the 2 * mean fallback is used.
    threshold = mean_clicks + 3 * std_clicks if std_clicks > 0 else mean_clicks * 2
    for hour, count in clicks_per_hour.items():
        if count > threshold:
            add_alert('traffic_spike', f'Hour {hour}: {count} clicks > {threshold:.0f}')

    clicks_count = len(daily_clicks)
    revenue = daily_postbacks['revenue'].sum()
    epc = revenue / clicks_count if clicks_count > 0 else 0
    if epc < baseline_epc * 0.8 and clicks_count > 100:
        add_alert('low_epc', f'EPC {epc:.4f} < {baseline_epc * 0.8:.4f} (-20%)')

    conversions = len(daily_postbacks)
    cr = (conversions / clicks_count * 100) if clicks_count > 0 else 0
    if cr > 50 and clicks_count > 100:
        add_alert('high_cr', f'CR {cr:.2f}% > 50%')

    if 'ip_address' in daily_clicks.columns:
        ip_counts = daily_clicks.groupby('ip_address').size()
        for ip, count in ip_counts[ip_counts > 100].items():
            add_alert('suspicious_ip', f'IP {ip}: {count} clicks')

    if alerts:
        pd.DataFrame(alerts).to_csv(ALERTS_LOG_PATH, mode='a', header=not os.path.exists(ALERTS_LOG_PATH), index=False)
        for alert in alerts:
            print(f"[ALERT] {alert['type']}: {alert['details']}")
            if alert['type'] == 'suspicious_ip':
                # Details look like 'IP <address>: <n> clicks'. Split on the
                # LAST ': ' so IPv6 addresses, which contain colons
                # themselves, are recovered in full.
                blocked_ip = alert['details'][len('IP '):].rsplit(': ', 1)[0]
                print(f"[ACTION] Blocking IP: {blocked_ip}")
            elif alert['type'] == 'traffic_spike':
                print("[ACTION] Investigate traffic source for potential fraud")
            elif alert['type'] in ['low_epc', 'high_cr']:
                print("[ACTION] Review offer performance and traffic quality")

    return alerts

def plot_kpi_history():
    """Plot the rolling EPC, CR and CTR histories as three stacked panels.

    Reads the module-level ``epc_window``, ``cr_window`` and ``ctr_window``
    and draws the alert thresholds (80% of ``baseline_epc`` for EPC, 50 for
    CR) as dashed red lines. The figure is shown; nothing is returned.
    """
    # One entry per panel: (history, line label, colour, title, threshold).
    # `threshold` is (level, legend label) or None when there is no alert line.
    panels = (
        (epc_window, 'EPC', 'green', 'EPC (Last 50 clicks)',
         (baseline_epc * 0.8, 'EPC Alert Threshold (-20%)')),
        (cr_window, 'CR (%)', 'blue', 'Conversion Rate (CR)',
         (50, 'CR Alert Threshold (50%)')),
        (ctr_window, 'CTR (%)', 'orange', 'Click-Through Rate (CTR)', None),
    )

    plt.figure(figsize=(12, 8))
    for position, (history, label, color, title, threshold) in enumerate(panels, start=1):
        plt.subplot(3, 1, position)
        plt.plot(range(len(history)), history, label=label, color=color)
        if threshold is not None:
            level, threshold_label = threshold
            plt.axhline(y=level, color='red', linestyle='--', label=threshold_label)
        plt.title(title)
        plt.legend()

    plt.tight_layout()
    plt.show()

def update_caps_dynamic(offers_df, clicks_df, postbacks_df, current_date):
    """Recompute the remaining daily/total caps of offers from observed events.

    PPC offers consume cap per click; PPL and PPS offers consume cap per
    postback. Offers with any other ``payout_type`` keep their current
    ``remaining_*`` values. A missing cap is treated as 1e12 and remaining
    values never go below 0.

    Args:
        offers_df: Offers with ``offer_id``, ``payout_type``,
            ``daily_cap_amount`` and ``total_cap_amount`` columns.
        clicks_df: Clicks with ``offer_id`` and datetime ``click_timestamp``.
        postbacks_df: Postbacks with ``offer_id`` and datetime
            ``postback_timestamp``.
        current_date: ``datetime.date`` whose events count against the daily
            cap; events of every date count against the total cap.

    Returns:
        A copy of ``offers_df`` with ``remaining_daily_cap`` and
        ``remaining_total_cap`` updated. The input frame is not modified.
    """
    offers_df = offers_df.copy()

    def usage_by_offer(events_df, ts_col):
        """Return (daily, total) event counts indexed by offer_id."""
        if events_df.empty:
            nothing = pd.Series(dtype='float64')
            return nothing, nothing
        total = events_df.groupby('offer_id').size()
        on_day = events_df[ts_col].dt.date == current_date
        daily = events_df[on_day].groupby('offer_id').size()
        return daily, total

    def apply_usage(mask, daily_used, total_used):
        """Write cap minus usage (floored at 0) for the offers selected by mask."""
        offer_ids = offers_df.loc[mask, 'offer_id'].to_numpy()
        for cap_col, remaining_col, used in (
            ('daily_cap_amount', 'remaining_daily_cap', daily_used),
            ('total_cap_amount', 'remaining_total_cap', total_used),
        ):
            cap = offers_df.loc[mask, cap_col].fillna(1e12)
            # Offers without any recorded event have consumed nothing.
            consumed = used.reindex(offer_ids, fill_value=0).to_numpy()
            offers_df.loc[mask, remaining_col] = (cap - consumed).clip(lower=0)

    ppc_mask = offers_df['payout_type'] == 'PPC'
    if ppc_mask.any():
        apply_usage(ppc_mask, *usage_by_offer(clicks_df, 'click_timestamp'))

    ppl_mask = offers_df['payout_type'].isin(['PPL', 'PPS'])
    if ppl_mask.any():
        apply_usage(ppl_mask, *usage_by_offer(postbacks_df, 'postback_timestamp'))

    # Kept from the original implementation: an explicit collection after the
    # temporary frames above go out of use.
    gc.collect()
    return offers_df

def choose_offer(click_row, epsilon=0.1, offers_df=None):
    """Pick an offer for one click with an epsilon-greedy policy over model scores.

    Candidate offers must have positive remaining daily and total caps.
    Non-split clicks (``is_split_offer == 0``) only consider non-backfill
    offers; split clicks consider every offer. When that pool is empty the
    backfill offers are used instead. Each candidate is scored with
    ``model.predict_proba`` on the click's features with the candidate's EPC
    and CR substituted in. With probability ``1 - epsilon`` the top-scoring
    offer wins; otherwise one of the three best is drawn at random.

    The choice is appended to ``OUTPUT_SELECTION_HISTORY`` and the chosen
    offer's remaining caps in the global ``offers`` table are decremented.

    Args:
        click_row: A row of ``clicks`` (pandas Series) holding ``click_id``,
            ``click_timestamp`` and the model feature columns.
        epsilon: Exploration probability.
        offers_df: Offer table to choose from; defaults to the current global
            ``offers``.

    Returns:
        ``(offer_id, info)`` where ``info`` has ``pool_type``, ``score`` and
        ``epsilon``; or ``(None, {'reason': 'no_offers'})`` when no offer has
        cap left.
    """
    # Resolve the default at call time so a re-loaded `offers` table is used
    # both for candidate selection and for the cap decrement below.
    if offers_df is None:
        offers_df = offers

    is_split = click_row.get('is_split_offer', 0)
    offers_df = update_caps_dynamic(offers_df, clicks, postbacks, click_row['click_timestamp'].date())

    has_cap = (offers_df['remaining_daily_cap'] > 0) & (offers_df['remaining_total_cap'] > 0)
    if is_split == 0:
        possible_offers = offers_df[(offers_df['is_backfill'] == 0) & has_cap]
        pool_type = 'prioritized'
    else:
        possible_offers = offers_df[has_cap]
        pool_type = 'backfill' if possible_offers['is_backfill'].all() else 'active'

    if possible_offers.empty:
        # Fall back to backfill offers and report the pool as such, so the
        # selection history does not label backfill picks as 'prioritized'.
        possible_offers = offers_df[(offers_df['is_backfill'] == 1) & has_cap]
        pool_type = 'backfill'

    if possible_offers.empty:
        return None, {'reason': 'no_offers'}

    possible_offers = possible_offers.copy()

    # Build the per-offer lookups once instead of re-scanning the EPC/CR
    # tables for every candidate; offers without history score with 0.
    epc_by_offer = epc_df.groupby('offer_id')['EPC'].mean().to_dict()
    cr_by_offer = cr_df.groupby('offer_id')['CR'].mean().to_dict()
    click_features = click_row[features]
    input_data = []
    for offer_id in possible_offers['offer_id']:
        row_data = click_features.copy()
        row_data['EPC'] = epc_by_offer.get(offer_id, 0)
        row_data['CR'] = cr_by_offer.get(offer_id, 0)
        input_data.append(row_data)

    input_df = pd.DataFrame(input_data, columns=features)
    possible_offers['score'] = model.predict_proba(input_df)[:, 1]

    if np.random.rand() > epsilon:
        best_offer = possible_offers.loc[possible_offers['score'].idxmax()]
    else:
        best_offer = possible_offers.nlargest(3, 'score').sample(1).iloc[0]

    selected_offer_id = best_offer['offer_id']

    # NOTE(review): update_caps_dynamic recomputes the remaining caps of
    # PPC/PPL/PPS offers from clicks/postbacks on every call, so this manual
    # decrement appears to be overwritten on the next call - confirm intent.
    idx_offer = offers.index[offers['offer_id'] == selected_offer_id]
    if not idx_offer.empty:
        idx_offer = idx_offer[0]
        # PPC offers consume cap on every click; all others only when the
        # click has a postback.
        consumes_cap = (
            best_offer['payout_type'] == 'PPC'
            or (postbacks['click_id'] == click_row['click_id']).any()
        )
        if consumes_cap:
            for cap_col in ('remaining_daily_cap', 'remaining_total_cap'):
                offers.at[idx_offer, cap_col] = max(0, offers.at[idx_offer, cap_col] - 1)

    info = {
        'pool_type': pool_type,
        'score': best_offer['score'],
        'epsilon': epsilon
    }
    rec = {
        'timestamp': datetime.utcnow(),
        'click_id': click_row['click_id'],
        'assigned_offer': selected_offer_id,
        'pool_type': info['pool_type'],
        'score': info['score']
    }
    pd.DataFrame([rec]).to_csv(OUTPUT_SELECTION_HISTORY, mode='a', header=not os.path.exists(OUTPUT_SELECTION_HISTORY), index=False)

    return selected_offer_id, info

def run_simulation(sample_size=1, epsilon=0.1, current_date=None):
    """Assign offers to a reproducible sample of clicks and monitor KPIs.

    After each assignment the KPIs and anomaly checks for ``current_date``
    are recomputed over the full ``clicks``/``postbacks`` tables; the rolling
    KPI history is plotted at the end.

    Args:
        sample_size: Number of clicks drawn from ``clicks`` (fixed seed 42).
        epsilon: Exploration probability forwarded to ``choose_offer``.
        current_date: ``datetime.date`` to monitor; defaults to 2024-10-17.
    """
    if current_date is None:
        current_date = datetime(year=2024, month=10, day=17).date()

    sampled_clicks = clicks.sample(sample_size, random_state=42).reset_index(drop=True)

    for _, click_row in sampled_clicks.iterrows():
        # Called for its side effects (selection history, cap bookkeeping).
        choose_offer(click_row, epsilon)

        monitor_kpi(clicks, postbacks, current_date)
        check_anomalies(clicks, postbacks, baseline_epc, current_date)

    plot_kpi_history()

run_simulation()


In [None]:
def run_simulation(sample_size=1, epsilon=0.1):
    """Assign offers to a reproducible sample of clicks and print each decision.

    Args:
        sample_size: Number of clicks drawn from ``clicks`` (fixed seed 42).
        epsilon: Exploration probability forwarded to ``choose_offer``.
    """
    sampled_clicks = clicks.sample(sample_size, random_state=42).reset_index(drop=True)

    for _, click_row in sampled_clicks.iterrows():
        chosen_offer, info = choose_offer(click_row, epsilon)

        print(click_row,chosen_offer, info)

run_simulation()