# 1. Import libraries

In [10]:
# !pip install markovify

In [11]:
import pandas as pd
import numpy as np
from collections import Counter
import markovify

In [21]:
import os
os.getcwd()
os.chdir('/content/drive/MyDrive/BADM550')
os.getcwd()

'/content/drive/MyDrive/BADM550'

# 2. Load Datasets

In [23]:
# Load dataset
df1 = pd.read_excel('Dataset_1.xlsx')
df2 = pd.read_excel('Dataset_2.xlsx')

# Merge datasets
df = pd.concat([df1, df2], ignore_index=True)

# Keep only relevant columns
df = df[['uid', 'timestamp', 'conversion', 'conversion_id', 'click', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']]

# 生成渠道路径：假设cat1 到 cat9 表示用户的转化渠道
df['path'] = df.apply(lambda x: ' > '.join([str(cat) for cat in [x['cat1'], x['cat2'], x['cat3'], x['cat4'], x['cat5'], x['cat6'], x['cat7'], x['cat8'], x['cat9']] if pd.notnull(cat)]), axis=1)

# 3. Develop Markov Chain

In [36]:
from typing import List, Dict
from itertools import chain

# 生成路径列表，将每个uid对应的转化路径（包含是否转化）添加到列表中
paths = df[df['conversion'] == 1].groupby('uid')['path'].apply(list).tolist()
paths = [' > '.join(path) for path in paths]

# 建立Markov链模型：为每个路径生成状态转移矩阵
def generate_transition_matrix(paths: List[str]) -> Dict[str, Dict[str, float]]:
    # 为路径生成状态对
    state_pairs = [path.split(' > ') for path in paths]
    state_pairs = list(chain.from_iterable([[(path[i], path[i + 1]) for i in range(len(path) - 1)] for path in state_pairs]))

    print("State Pairs:", state_pairs)  # 调试：查看状态对生成情况

    # 统计状态对的频率
    state_counts = Counter(state_pairs)
    state_totals = Counter([pair[0] for pair in state_pairs])

    print("State Counts:", state_counts.items())  # 调试：查看状态对频率

    # 计算转移概率
    transition_matrix = {}
    for (state1, state2), count_value in state_counts.items():
        if state1 not in transition_matrix:
            transition_matrix[state1] = {}
        transition_matrix[state1][state2] = count_value / state_totals[state1]

    return transition_matrix

# 创建状态转移矩阵
transition_matrix = generate_transition_matrix(paths)
print("Transition Matrix:", transition_matrix)  # 查看生成的转移矩阵

Output hidden; open in https://colab.research.google.com to view.

# 4. Compute

In [37]:
# 简化的转化率变化模拟函数（随机游走法）
def simulate_conversion_rate_drop(matrix: Dict[str, Dict[str, float]], trials: int = 10000) -> float:
    total_conversion = 0
    for _ in range(trials):
        state = 'start'  # 初始状态
        while state != 'conversion':
            if np.random.rand() < 0.5:  # 50%概率退出
                break
            if state in matrix:
                state = np.random.choice(list(matrix[state].keys()), p=list(matrix[state].values()))
            else:
                break
        if state == 'conversion':
            total_conversion += 1
    return total_conversion / trials

# 移除渠道并计算转化率下降比例
def removal_effect(transition_matrix: Dict[str, Dict[str, float]], remove_channel: str) -> float:
    new_matrix = {k: {inner_k: v for inner_k, v in inner_dict.items() if inner_k != remove_channel}
                  for k, inner_dict in transition_matrix.items() if k != remove_channel}

    # 计算移除渠道后的总转化率变化，简化起见用随机游走法来估计
    # 假设每个状态以50%概率退出
    conversion_rate_drop = simulate_conversion_rate_drop(new_matrix)
    return conversion_rate_drop

# 计算每个渠道的归因值
channels = [f'cat{i}' for i in range(1, 10)]
channel_attribution = {channel: removal_effect(transition_matrix, channel) for channel in channels}

# 5. Print Results

In [38]:
# 展示每个渠道的归因贡献
print("Channel Attribution (Removal Effect):")
for channel, contribution in channel_attribution.items():
    print(f"{channel}: {contribution:.4f}")


Channel Attribution (Removal Effect):
cat1: 0.0000
cat2: 0.0000
cat3: 0.0000
cat4: 0.0000
cat5: 0.0000
cat6: 0.0000
cat7: 0.0000
cat8: 0.0000
cat9: 0.0000
