Step 1: Data Preparation

In [1]:
import pandas as pd
import numpy as np

def load_and_clean_data(filepath='E_Commerce_Dataset.csv'):
    """Read the customer CSV and impute the missing values used downstream.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded DataFrame with gaps in ``Tenure`` and
        ``DaySinceLastOrder`` replaced by the column median and gaps in
        ``CouponUsed`` replaced by 0, or ``None`` when the file does not
        exist (a message is printed in that case).
    """
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print("File not found.")
        return None
    else:
        print(f"1.Data loaded. Total samples: {len(df)}")

    # Median imputation for the two numeric features that drive the state bins.
    for column in ('Tenure', 'DaySinceLastOrder'):
        column_median = df[column].median()
        df[column] = df[column].fillna(column_median)

    # A missing coupon count is treated as "no coupon used".
    df['CouponUsed'] = df['CouponUsed'].fillna(0)
    return df

Step 2: MDP Components Definition

In [2]:
def build_mdp_components(df, coupon_cost=20, reward_retain=100, reward_churn=-100):
    """Build the states, transition tensor and reward matrix of the churn MDP.

    States are the combination of a tenure tercile (New/Mid/Loyal) and a
    recency bucket (Active/Neutral/Inactive). Two actions exist: 0 = do
    nothing, 1 = send a coupon. The observed action of each customer is
    taken to be ``CouponUsed > 0``. For every (state, action) pair the
    empirical churn rate gives the probability of moving to the churn
    state; the remaining mass stays in the same state.

    Note: the columns ``Tenure_Bin``, ``Recency_Bin``, ``State`` and
    ``Action`` are added to ``df`` in place.

    Args:
        df: DataFrame with the columns ``Tenure``, ``DaySinceLastOrder``,
            ``CouponUsed`` and ``Churn``.
        coupon_cost: Cost charged whenever action 1 is taken.
        reward_retain: Reward earned when the user does not churn.
        reward_churn: Reward (normally negative) when the user churns.

    Returns:
        A tuple ``(states, P, R, state_to_idx)`` where ``states`` is the
        sorted list of state names, ``P`` has shape
        ``(num_states, 2, num_states + 1)`` with the last index being the
        churn state, ``R`` has shape ``(num_states, 2)`` and holds the
        expected immediate reward, and ``state_to_idx`` maps a state name
        to its row index.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    # Fail before mutating df or printing anything if the input is unusable.
    required = ('Tenure', 'DaySinceLastOrder', 'CouponUsed', 'Churn')
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"build_mdp_components: missing required column(s): {missing}")

    # A. State Space Definition
    # Dimension 1: User Tenure -> New, Mid, Loyal (terciles of the observed data)
    df['Tenure_Bin'] = pd.qcut(df['Tenure'], 3, labels=["New", "Mid", "Loyal"])

    # Dimension 2: Recency -> Active (-1, 2], Neutral (2, 7], Inactive (7, 1000]
    df['Recency_Bin'] = pd.cut(df['DaySinceLastOrder'], bins=[-1, 2, 7, 1000], labels=["Active", "Neutral", "Inactive"])

    # Combine dimensions
    df['State'] = df['Tenure_Bin'].astype(str) + "_" + df['Recency_Bin'].astype(str)
    states = sorted(df['State'].unique().tolist())
    state_to_idx = {s: i for i, s in enumerate(states)}
    num_states = len(states)

    print("\n2.State Space:")
    print(states)

    # B. Action Space Definition
    # Action 0: Do Nothing
    # Action 1: Send Coupon
    df['Action'] = (df['CouponUsed'] > 0).astype(int)
    num_actions = 2

    print("\n3.Action Space:")
    print(f"0: Do Nothing (Cost 0), 1: Send Coupon (Cost {coupon_cost})")

    # C. Transition Probability & Reward
    # The index -1 represents the Churn state
    P = np.zeros((num_states, num_actions, num_states + 1))
    R = np.zeros((num_states, num_actions))

    # One pass over the data: empirical churn rate per observed (state, action).
    churn_rates = df.groupby(['State', 'Action'])['Churn'].mean().to_dict()

    for s_idx, s_name in enumerate(states):
        for a in range(num_actions):
            action_cost = coupon_cost if a == 1 else 0
            churn_prob = churn_rates.get((s_name, a))

            if churn_prob is None:
                # No observation for this pair: pessimistically assume
                # certain churn so the policy never favours unseen actions.
                P[s_idx, a, -1] = 1.0
                R[s_idx, a] = reward_churn - action_cost
                continue

            # Transition logic: churn with the observed rate, otherwise
            # remain in the same state.
            P[s_idx, a, -1] = churn_prob
            P[s_idx, a, s_idx] = 1.0 - churn_prob

            # Immediate reward E[R | s, a], net of the action's cost.
            expected_reward = (1 - churn_prob) * reward_retain + churn_prob * reward_churn
            R[s_idx, a] = expected_reward - action_cost

    return states, P, R, state_to_idx

Step 3: Policy Iteration Algorithm

In [3]:
def policy_iteration(states, P, R, gamma=0.9, theta=1e-4):
    """Solve the MDP with policy iteration (evaluation + greedy improvement).

    The backup uses the full expectation over successor states,
    ``Q(s, a) = R(s, a) + gamma * sum_s' P(s' | s, a) * V(s')``, so any
    transition tensor of the right shape is handled, not only one whose
    mass sits on the self-loop and the terminal state.

    Args:
        states: Sequence of state names; only its length is used.
        P: Array of shape ``(num_states, num_actions, num_states + 1)``.
            The last successor index is the terminal (churn) state, whose
            value is fixed at 0.
        R: Array of shape ``(num_states, num_actions)`` with the expected
            immediate reward of each state/action pair.
        gamma: Discount factor. Must be below 1 for the evaluation sweep
            to converge when a state can stay in itself with probability 1.
        theta: Evaluation stops once the largest value change in a sweep
            falls below this threshold.

    Returns:
        ``(policy, V)`` where ``policy`` is an int array with the chosen
        action per state and ``V`` is a float array of length
        ``num_states + 1`` (the final entry is the terminal state, 0).

    Raises:
        ValueError: If the shapes of ``P`` and ``R`` do not match ``states``.
    """
    P = np.asarray(P, dtype=float)
    R = np.asarray(R, dtype=float)
    num_states = len(states)

    if R.ndim != 2 or R.shape[0] != num_states:
        raise ValueError(
            f"R must have shape ({num_states}, num_actions), got {R.shape}"
        )
    num_actions = R.shape[1]
    if P.shape != (num_states, num_actions, num_states + 1):
        raise ValueError(
            f"P must have shape ({num_states}, {num_actions}, {num_states + 1}), got {P.shape}"
        )

    # Initialize Policy: Default to 'Do Nothing' (Action 0) for all states
    policy = np.zeros(num_states, dtype=int)
    # One extra slot for the terminal state; it is never updated and stays 0.
    V = np.zeros(num_states + 1)

    is_policy_stable = False
    iteration_cnt = 0

    print("\n4.Policy Iteration:")

    while not is_policy_stable:
        iteration_cnt += 1

        # Policy Evaluation: in-place sweeps, so values updated earlier in a
        # sweep are already used for the states that follow.
        while True:
            delta = 0.0
            for s in range(num_states):
                previous_value = V[s]
                a = policy[s]

                # Bellman Expectation Equation over all successor states
                V[s] = R[s, a] + gamma * (P[s, a] @ V)

                delta = max(delta, abs(previous_value - V[s]))

            if delta < theta:
                break

        # Policy Improvement
        is_policy_stable = True

        for s in range(num_states):
            old_action = policy[s]

            # Q(s, a) for every action at once; argmax returns the lowest
            # action index on ties, which keeps the choice deterministic.
            q_values = R[s] + gamma * (P[s] @ V)
            best_action = int(np.argmax(q_values))

            policy[s] = best_action

            if old_action != best_action:
                is_policy_stable = False

        print(f" -> Iteration {iteration_cnt}: Is policy stable? {is_policy_stable}")

    return policy, V

Step 4: Main Execution and Result Generation

In [4]:
if __name__ == "__main__":
    # End-to-end run: load the data, build the MDP, solve it, and report
    # the chosen action and state value for every user state.
    df = load_and_clean_data()

    if df is not None:
        states, P, R, state_to_idx = build_mdp_components(df)
        final_policy, final_V = policy_iteration(states, P, R)

        # final_V carries one extra trailing entry (the churn state); zip
        # stops at the shortest input, so that entry is left out of the table.
        results = [
            {
                "User State": state_name,
                "Optimal Policy": "Send Coupon" if chosen_action == 1 else "Do Nothing",
                "Value (LTV)": round(state_value, 2),
            }
            for state_name, chosen_action, state_value in zip(states, final_policy, final_V)
        ]
        result_df = pd.DataFrame(results)

        print("\n5.FINAL RESULTS:")
        print(result_df)

        # Persist the policy table next to the notebook.
        result_df.to_csv('mdp_policy_results.csv', index=False)

1.Data loaded. Total samples: 5630

2.State Space:
['Loyal_Active', 'Loyal_Inactive', 'Loyal_Neutral', 'Mid_Active', 'Mid_Inactive', 'Mid_Neutral', 'New_Active', 'New_Inactive', 'New_Neutral']

3.Action Space:
0: Do Nothing (Cost 0), 1: Send Coupon (Cost 20)

4.Policy Iteration:
 -> Iteration 1: Is policy stable? False
 -> Iteration 2: Is policy stable? True

5.FINAL RESULTS:
       User State Optimal Policy  Value (LTV)
0    Loyal_Active     Do Nothing       727.72
1  Loyal_Inactive     Do Nothing       782.89
2   Loyal_Neutral     Do Nothing       820.65
3      Mid_Active     Do Nothing       258.87
4    Mid_Inactive    Send Coupon       565.95
5     Mid_Neutral    Send Coupon       419.91
6      New_Active     Do Nothing        56.46
7    New_Inactive     Do Nothing       161.02
8     New_Neutral     Do Nothing       187.07
