In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# --------------------------------------------------
# 1. Load and Merge Data (using bidid as key)
# --------------------------------------------------
def load_and_merge_data(bid_file, imp_file, click_file, conv_file):
    col_list = [
        "bidid",
        "Timestamp",
        "LogType",
        "VisitorID",
        "User-Agent",
        "IP",
        "Region",
        "City",
        "AdExchange",
        "Domain",
        "URL",
        "AnonymousURLID",
        "AdslotID",
        "Adslotwidth",
        "Adslotheight",
        "Adslotvisibility",
        "Adslotformat",
        "Adslotfloorprice",
        "CreativeID",
        "BiddingPrice",
        "PayingPrice",
        "KeyPageURL",
        "AdvertiserID",
        "UserProfileTags"
    ]

    bidRequest_cols = [
        "bidid",
        "Timestamp",
        "iPinYouID",
        "User-Agent",
        "IP",
        "Region",
        "City",
        "AdExchange",
        "Domain",
        "URL",
        "AnonymousURLID",
        "AdslotID",
        "Adslotwidth",
        "Adslotheight",
        "Adslotvisibility",
        "Adslotformat",
        "Adslotfloorprice",
        "CreativeID",
        "BiddingPrice",
        "AdvertiserID",
        "UserProfileTags"
    ]
    # Load separate CSV files
    bids = pd.read_csv(bid_file, sep='\t', header=None, names=bidRequest_cols)
    imps = pd.read_csv(imp_file, sep='\t', header=None, names=col_list)
    clicks = pd.read_csv(click_file, sep='\t', header=None, names=col_list)
    convs = pd.read_csv(conv_file, sep='\t', header=None, names=col_list)
    
    # Merge the datasets on the common key "bidid"
    data = pd.merge(bids, imps, on='bidid', how='left')
    data = pd.merge(data, clicks, on='bidid', how='left')
    data = pd.merge(data, convs, on='bidid', how='left')
    
    # Fill missing click/conversion values (assume missing means no click/conversion)
    data['click'] = data['click'].fillna(0)
    data['conversion'] = data['conversion'].fillna(0)
    
    # Ensure market_price is available; if not, use bid_price as a fallback
    if 'market_price' not in data.columns or data['market_price'].isna().all():
        data['market_price'] = data.get('bid_price', 0)
    else:
        data['market_price'] = data['market_price'].fillna(0)
    
    return data

In [17]:
# --------------------------------------------------
# 2. Preprocessing for CTR Prediction
# --------------------------------------------------
def preprocess_for_ctr(data):
    """
    Prepare features and target for CTR prediction.
    Here we assume that aside from the key columns,
    all other columns are used as features.
    Adjust this function as needed.
    """
    # Drop columns not used as features
    # (bidid, click, conversion, market_price are not features)
    features = data.drop(['bidid', 'click', 'conversion', 'market_price'], axis=1, errors='ignore')
    target = data['click']
    return features, target

In [18]:
# --------------------------------------------------
# 3. Train a Simple CTR Prediction Model
# --------------------------------------------------
def train_ctr_model(X_train, y_train):
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    return model

In [19]:
# --------------------------------------------------
# 4. Bidding Simulation
# --------------------------------------------------
def simulate_bidding(test_data, model, budget, base_bid, n, avg_ctr):
    """
    Simulate the bidding process over the test dataset.
    
    Parameters:
      - test_data: DataFrame containing features, click, conversion, and market_price.
      - model: Trained CTR model.
      - budget: Total available budget.
      - base_bid: Base bid for an impression with average CTR.
      - n: Conversion weight (KPI = clicks + n * conversions).
      - avg_ctr: Average CTR from training (for bid scaling).
      
    Returns:
      - total_kpi: Sum of clicks + n*conversions.
      - total_clicks: Total clicks achieved.
      - total_conversions: Total conversions achieved.
      - total_cost: Total spend.
    """
    total_cost = 0.0
    total_clicks = 0
    total_conversions = 0
    total_kpi = 0

    # If available, sort by timestamp to simulate the order of bid requests.
    if 'timestamp' in test_data.columns:
        test_data = test_data.sort_values(by='timestamp')
    
    # Process each bid request
    for _, row in test_data.iterrows():
        if total_cost >= budget:
            break

        # Prepare feature vector (exclude key columns)
        features = row.drop(labels=['bidid', 'click', 'conversion', 'market_price'])
        X_row = features.values.reshape(1, -1)
        
        # Predict click probability (pCTR)
        pctr = model.predict_proba(X_row)[0, 1]
        
        # Define a linear bidding strategy: bid scales with pCTR relative to average CTR
        bid = base_bid * (pctr / avg_ctr) if avg_ctr > 0 else base_bid

        # Win the auction if the bid exceeds the market price
        if bid >= row['market_price']:
            cost = row['market_price']
            if total_cost + cost > budget:
                break
            total_cost += cost
            
            # Update outcomes based on ground-truth click and conversion data
            total_clicks += row['click']
            total_conversions += row['conversion']
            total_kpi += row['click'] + n * row['conversion']
    
    return total_kpi, total_clicks, total_conversions, total_cost

In [20]:
# --------------------------------------------------
# 5. Main Routine
# --------------------------------------------------
def main():
    # File paths for your data files
    bid_file = '/Users/vishrutgrover/coding/Adobe Devcraft PS/bidding-devcraft/dataset/bid.06.txt'
    imp_file = '/Users/vishrutgrover/coding/Adobe Devcraft PS/bidding-devcraft/dataset/imp.06.txt'
    click_file = '/Users/vishrutgrover/coding/Adobe Devcraft PS/bidding-devcraft/dataset/clk.06.txt'
    conv_file = '/Users/vishrutgrover/coding/Adobe Devcraft PS/bidding-devcraft/dataset/conv.06.txt'
    
    # Load and merge the data using bidid as the join key
    data = load_and_merge_data(bid_file, imp_file, click_file, conv_file)
    
    # Split into training and testing sets
    train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
    
    # Preprocess training data for CTR prediction
    X_train, y_train = preprocess_for_ctr(train_data)
    ctr_model = train_ctr_model(X_train, y_train)
    
    # Calculate average CTR for scaling bids
    avg_ctr = y_train.mean()
    print(f"Average CTR (training): {avg_ctr:.4f}")
    
    # Set bidding parameters
    budget = 10000     # Total budget (same unit as market_price)
    base_bid = 100     # Base bid for an average CTR impression
    n = 5              # Conversion weight (KPI = clicks + 5*conversions)
    
    # Run bidding simulation on the test data
    kpi, clicks, conversions, cost = simulate_bidding(test_data, ctr_model, budget, base_bid, n, avg_ctr)
    
    print("Simulation Results:")
    print(f"  Total KPI (clicks + {n}*conversions): {kpi}")
    print(f"  Total Clicks: {clicks}")
    print(f"  Total Conversions: {conversions}")
    print(f"  Total Cost: {cost}")

if __name__ == '__main__':
    main()

  bids = pd.read_csv(bid_file, sep='\t', header=None, names=bidRequest_cols)
