In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import ast

# Set plot style
sns.set(style="whitegrid")

# Load datasets
customer_360 = pd.read_csv('../data/customer_360_view.csv', index_col='customer_id')
portfolio = pd.read_csv('../data/portfolio.csv')
transcript = pd.read_csv('../data/transcript.csv')

print("Datasets loaded.")
display(customer_360.head())

Datasets loaded.


Unnamed: 0_level_0,Unnamed: 0,gender,age,became_member_on,income,membership_days,total_amount,transaction_count,average_transaction_value,offer completed,offer received,offer viewed
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0610b486422d4921ae7d2bf64640c50b,1,F,55,2017-07-15,112000.0,376,77.01,3.0,25.67,1.0,2.0,0.0
78afa995795e4d85b5d9ceeca43f5fef,3,F,75,2017-05-09,100000.0,443,159.27,7.0,22.752857,3.0,4.0,4.0
e2127556f4f64592b11af22de27a7932,5,M,68,2018-04-26,70000.0,91,57.73,3.0,19.243333,2.0,4.0,3.0
389bc3fa690240e798340f5a15918d5c,8,M,65,2018-02-09,53000.0,167,36.43,3.0,12.143333,5.0,6.0,6.0
2eeac8d8feae4a8cad5a6af0499a211d,12,M,58,2017-11-11,51000.0,257,15.62,4.0,3.905,1.0,3.0,2.0


## 2. Re-clean Transcript/Portfolio (Quick Prep)
We need the detailed logs to extract Offer Type and Channel preferences.
- Extract `offer_id` from transcript `value` column.
- One-hot encode `channels` in portfolio.

In [2]:
# --- Quick Portfolio Prep ---
portfolio_clean = portfolio.copy()
portfolio_clean.rename(columns={'id': 'offer_id'}, inplace=True)
channels = ['web', 'email', 'mobile', 'social']
for channel in channels:
    portfolio_clean[channel] = portfolio_clean['channels'].apply(lambda x: 1 if channel in x else 0)

# --- Quick Transcript Prep ---
transcript_clean = transcript.copy()
transcript_clean.rename(columns={'person': 'customer_id'}, inplace=True)

# Parse value column
transcript_clean['value'] = transcript_clean['value'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Vectorized extraction (fast)
values_df = pd.DataFrame(transcript_clean['value'].tolist())
if 'offer id' in values_df.columns:
    values_df['offer_id'] = values_df['offer_id'].combine_first(values_df['offer id'])
    values_df.drop(columns=['offer id'], inplace=True)

transcript_clean = pd.concat([transcript_clean, values_df], axis=1)

print("Transcript and Portfolio prepped.")

Transcript and Portfolio prepped.


## 3. Feature: Offer Type Preferences
Merge Transcript with Portfolio (on offer_id). Count completed offers by type (bogo vs discount). Pivot to create `bogo_completed`, `discount_completed` columns per customer.

In [3]:
# Merge Transcript with Portfolio
merged_data = transcript_clean.merge(portfolio_clean, on='offer_id', how='left')

# Filter for 'offer completed' events
completed_offers = merged_data[merged_data['event'] == 'offer completed']

# Count completions by offer_type per customer
offer_type_counts = pd.crosstab(completed_offers['customer_id'], completed_offers['offer_type'])

# Rename columns to be specific
offer_type_counts.columns = [f'{col}_completed' for col in offer_type_counts.columns]

display(offer_type_counts.head())

Unnamed: 0_level_0,bogo_completed,discount_completed
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0009655768c64bdeb2e877511632db8f,1,2
0011e0d4e6b944f998e987f904e8c1e5,1,2
0020c2b971eb4e9188eac86d93036a77,1,2
0020ccbbb6d84e358d3414a3ff76cffd,2,1
003d66b6608740288d6cc97a6903f4f0,0,3


## 4. Feature: Channel Usage
Count interactions (viewed/completed) by channel. This helps understand if a user is "Mobile-first" or "Web-first".

In [4]:
# We care about interations: 'offer viewed' and 'offer completed'
interaction_events = merged_data[merged_data['event'].isin(['offer viewed', 'offer completed'])]

# Group by customer and sum up the channel flags (since rows are duplicated by events, summing 1s works)
# Actually, since 'web', 'mobile' etc are binary flags on the *offer*, 
# if a user completes an offer that has (web=1, mobile=1), they get +1 for web and +1 for mobile.
channel_cols = ['web', 'email', 'mobile', 'social']
channel_usage = interaction_events.groupby('customer_id')[channel_cols].sum()

channel_usage.columns = [f'channel_{col}_count' for col in channel_usage.columns]
display(channel_usage.head())

Unnamed: 0_level_0,channel_web_count,channel_email_count,channel_mobile_count,channel_social_count
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,6.0,7.0,7.0,5.0
00116118485d4dfda04fdbaba9a87b5c,2.0,2.0,2.0,2.0
0011e0d4e6b944f998e987f904e8c1e5,7.0,8.0,6.0,3.0
0020c2b971eb4e9188eac86d93036a77,5.0,6.0,6.0,6.0
0020ccbbb6d84e358d3414a3ff76cffd,6.0,7.0,7.0,5.0


## 5. Feature: Ratios (Offer Completion Rate)
Calculate `offer_completion_rate` = `offer completed` / `offer received`.

In [5]:
# We already have offer_received and offer_completed counts in customer_360 from Phase 1
# Let's verify columns
print("Customer 360 Columns:", customer_360.columns)

# Calculate Rate
# Avoid division by zero: if 0 received, rate is 0 (or NaN, but 0 is safer for clustering usually)
if 'offer received' in customer_360.columns and 'offer completed' in customer_360.columns:
    customer_360['completion_rate'] = customer_360['offer completed'] / customer_360['offer received']
    customer_360['completion_rate'] = customer_360['completion_rate'].fillna(0.0)
    # Clip > 1 artifacts (rare but possible if logging is weird)
    customer_360['completion_rate'] = customer_360['completion_rate'].clip(upper=1.0)
else:
    print("Warning: 'offer received' or 'offer completed' column missing")

display(customer_360[['offer received', 'offer completed', 'completion_rate']].head())

Customer 360 Columns: Index(['Unnamed: 0', 'gender', 'age', 'became_member_on', 'income',
       'membership_days', 'total_amount', 'transaction_count',
       'average_transaction_value', 'offer completed', 'offer received',
       'offer viewed'],
      dtype='object')


Unnamed: 0_level_0,offer received,offer completed,completion_rate
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0610b486422d4921ae7d2bf64640c50b,2.0,1.0,0.5
78afa995795e4d85b5d9ceeca43f5fef,4.0,3.0,0.75
e2127556f4f64592b11af22de27a7932,4.0,2.0,0.5
389bc3fa690240e798340f5a15918d5c,6.0,5.0,0.833333
2eeac8d8feae4a8cad5a6af0499a211d,3.0,1.0,0.333333


## 6. Data Processing & Merging
Join the new features (Offer Type, Channels) to `customer_360`. One-hot encode Gender.

In [7]:
# Merge New Features
# customer_360 is indexed by customer_id
customer_features = customer_360.join(offer_type_counts, how='left')
customer_features = customer_features.join(channel_usage, how='left')

# Fill NaNs from the merge (customers who didn't complete/view offers)
fill_cols = list(offer_type_counts.columns) + list(channel_usage.columns)
customer_features[fill_cols] = customer_features[fill_cols].fillna(0)

# Encode Gender
# Using get_dummies
# For Distance-based clustering (KMeans), drop_first is usually better or keep all if you want specific weights. 
# We'll use drop_first=False for clearer interpretation later (e.g., gender_F, gender_M, gender_O) and to treat categories equally in distance.
if 'gender' in customer_features.columns:
    customer_features = pd.get_dummies(customer_features, columns=['gender'], prefix='gender', drop_first=False)

# Drop non-numeric/unnecessary cols
# became_member_on is a date, we learned membership_days from it.
if 'became_member_on' in customer_features.columns:
    customer_features.drop(columns=['became_member_on'], inplace=True)

display(customer_features.head())
print("Nulls:", customer_features.isnull().sum().sum())

Unnamed: 0_level_0,Unnamed: 0,age,income,membership_days,total_amount,transaction_count,average_transaction_value,offer completed,offer received,offer viewed,completion_rate,bogo_completed,discount_completed,channel_web_count,channel_email_count,channel_mobile_count,channel_social_count,gender_F,gender_M,gender_O
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0610b486422d4921ae7d2bf64640c50b,1,55,112000.0,376,77.01,3.0,25.67,1.0,2.0,0.0,0.5,1.0,0.0,1.0,1.0,1.0,0.0,True,False,False
78afa995795e4d85b5d9ceeca43f5fef,3,75,100000.0,443,159.27,7.0,22.752857,3.0,4.0,4.0,0.75,3.0,0.0,4.0,7.0,7.0,5.0,True,False,False
e2127556f4f64592b11af22de27a7932,5,68,70000.0,91,57.73,3.0,19.243333,2.0,4.0,3.0,0.5,1.0,1.0,5.0,5.0,5.0,2.0,False,True,False
389bc3fa690240e798340f5a15918d5c,8,65,53000.0,167,36.43,3.0,12.143333,5.0,6.0,6.0,0.833333,3.0,2.0,11.0,11.0,11.0,6.0,False,True,False
2eeac8d8feae4a8cad5a6af0499a211d,12,58,51000.0,257,15.62,4.0,3.905,1.0,3.0,2.0,0.333333,0.0,1.0,3.0,3.0,3.0,3.0,False,True,False


Nulls: 0


## 7. Scaling
Normalize numerical variables (`income`, `total_amount`, `membership_days`, etc.) so that one feature doesn't dominate K-Means distance.

In [None]:
scaler = StandardScaler()

# Select columns to scale (all numeric except maybe binary flags? KMeans usually wants everything scaled)
# Let's scale everything that is not a boolean flag. Even binary flags can be scaled or left as 0/1. 
# Usually for KMeans it is safer to scale everything to mean 0 std 1.
features_to_scale = customer_features.columns
customer_features_scaled = pd.DataFrame(scaler.fit_transform(customer_features), 
                                        columns=features_to_scale, 
                                        index=customer_features.index)

# Why are there negative values?
# StandardScaler transforms data to have Mean = 0 and Std Dev = 1.
# Values < Mean become negative. Values > Mean become positive.
# This DOES NOT mean the age is actually negative (-20 years old), it means "20 units below average".
# This is required for K-Means so that 'Income' (range 30,000-100,000) doesn't overpower 'Age' (range 18-100).

display(customer_features_scaled.describe())
print(f"Mean Age after scaling (approx 0): {customer_features_scaled['age'].mean():.4f}")

Unnamed: 0.1,Unnamed: 0,age,income,membership_days,total_amount,transaction_count,average_transaction_value,offer completed,offer received,offer viewed,completion_rate,bogo_completed,discount_completed,channel_web_count,channel_email_count,channel_mobile_count,channel_social_count,gender_F,gender_M,gender_O
count,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0,14825.0
mean,1.226974e-16,-3.570687e-17,-1.260524e-16,-1.037656e-16,4.026009e-17,1.006502e-16,1.0304670000000001e-17,4.385475e-17,2.85655e-16,-1.056827e-16,-3.0674360000000005e-17,8.339591e-17,-1.1502880000000001e-17,2.041762e-16,1.538511e-16,8.171841e-17,5.967121000000001e-17,-8.387520000000001e-17,-6.230729e-18,5.751442e-17
std,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034,1.000034
min,-1.735252,-2.093613,-1.639304,-1.246398,-0.9004893,-1.613384,-0.9029955,-1.383232,-4.174104,-2.604592,-1.472124,-0.9595832,-1.050051,-1.880497,-2.305064,-2.241113,-1.858078,-0.8395277,-1.156702,-0.1204476
25%,-0.8627202,-0.7129632,-0.7595758,-0.7502045,-0.6584929,-0.8415543,-0.6788209,-0.7511761,-0.4519891,-1.055732,-0.8680236,-0.9595832,-1.050051,-0.6298411,-0.6442662,-0.9416398,-0.9107961,-0.8395277,-1.156702,-0.1204476
50%,-0.001421761,0.03488876,-0.06505322,-0.3923724,-0.230748,-0.2626823,-0.05580132,-0.1191206,0.4785398,-0.2813023,0.03812727,-0.02723158,-0.1442553,-0.2129559,-0.2290668,-0.07532417,0.03648552,-0.8395277,0.864527,-0.1204476
75%,0.8651871,0.6676866,0.6757708,0.6548829,0.337578,0.5091471,0.3869493,0.5129349,0.4785398,0.4931278,0.793253,0.90512,0.7615407,0.6208144,0.6013319,0.7909915,0.5101263,1.191146,0.864527,-0.1204476
max,1.736494,2.681134,2.527831,3.102454,11.4778,5.333081,26.43999,2.409101,1.409069,2.041988,1.548379,4.634526,4.384725,3.122125,2.677329,2.956781,3.825612,1.191146,0.864527,8.302364


## 8. Save Final Features
Save `customer_features.csv` (unscaled, for interpretation) and `customer_features_scaled.csv` (for modeling).

In [9]:
customer_features.to_csv('../data/customer_features.csv')
customer_features_scaled.to_csv('../data/customer_features_scaled.csv')

print("Saved features to ../data/customer_features.csv and ../data/customer_features_scaled.csv")

Saved features to ../data/customer_features.csv and ../data/customer_features_scaled.csv
