## 03 - Feature Engineering

*Transform call-level data into contact-level features for modeling*

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# The project root is taken as the parent of the current working directory,
# so this assumes the notebook is launched from a folder one level below it.
project_root = Path.cwd().parent

# Call-level input produced by the cleaning step; `date_stamp` is parsed to
# datetime on load so the date arithmetic further down works directly.
calls_path = project_root / 'data' / 'processed' / 'calls_cleaned.csv'
df = pd.read_csv(calls_path, parse_dates=['date_stamp'])

### Add Late Night Flag

Calls between 21:00 and 06:00 - the *muffens* hours

In [3]:
# The late-night window wraps past midnight, so it is the union of
# "21:00 or later" and "before 06:00" rather than a single range check.
df['is_late_night'] = df['hour'].ge(21) | df['hour'].lt(6)

### Build Contact-Level Features

In [4]:
# Collapse the call-level rows into one row per contact name.
# Each entry maps an output column to (source column, aggregation).
feature_spec = {
    # First category value seen for the contact.
    'category': ('category', 'first'),
    'total_calls': ('name', 'count'),
    'total_duration': ('duration_in_seconds', 'sum'),
    # NOTE: despite the "avg" name this is the median call duration.
    'avg_duration': ('duration_in_seconds', 'median'),
    'max_duration': ('duration_in_seconds', 'max'),
    # Mean of a flag column gives the share of calls with the flag set.
    'pct_business_hours': ('is_business_hours', 'mean'),
    'pct_late_night': ('is_late_night', 'mean'),
    'avg_hour': ('hour', 'mean'),
    'first_call': ('date_stamp', 'min'),
    'last_call': ('date_stamp', 'max'),
}

contact_features = df.groupby('name').agg(**feature_spec).reset_index()

In [5]:
# Derived relationship features, added in a single chained step.
# Later entries may read columns created by earlier ones (calls_per_month
# uses days_active), so the order of the keyword arguments matters.
contact_features = contact_features.assign(
    # Whole days between the first and last call (0 for same-day contacts).
    days_active=lambda cf: (cf['last_call'] - cf['first_call']).dt.days,
    is_one_off=lambda cf: cf['total_calls'].eq(1),
    # The "+ 1" pads the span by one 30-day month, which also avoids a
    # division by zero when days_active is 0.
    calls_per_month=lambda cf: cf['total_calls'] / (cf['days_active'] / 30 + 1),
    # Active means the most recent call is on or after the cutoff date.
    is_active=lambda cf: cf['last_call'] >= '2024-06-01',
)

### Preview

In [6]:
# Show the first ten contacts as a quick sanity check of the new columns.
contact_features.head(10)

Unnamed: 0,name,category,total_calls,total_duration,avg_duration,max_duration,pct_business_hours,pct_late_night,avg_hour,first_call,last_call,days_active,is_one_off,calls_per_month,is_active
0,ASBIS Africa,Supplier,9,834,96.0,244,1.0,0.0,9.888889,2024-01-26,2024-02-22,27,False,4.736842,False
1,Aaron,Unknown,1,4,4.0,4,1.0,0.0,10.0,2022-05-27,2022-05-27,0,True,1.0,False
2,Abbas,Unknown,2,78,39.0,73,1.0,0.0,10.0,2022-05-27,2022-05-27,0,False,2.0,False
3,Abbott,Unknown,1,18,18.0,18,1.0,0.0,9.0,2023-03-22,2023-03-22,0,True,1.0,False
4,Abdol,Unknown,130,8097,18.0,462,0.615385,0.015385,14.223077,2022-05-27,2022-07-03,37,False,58.208955,False
5,Abdool,Unknown,11,218,8.0,62,0.909091,0.0,11.181818,2023-03-22,2024-09-28,556,False,0.56314,True
6,Abdul,Unknown,26,486,9.5,170,0.192308,0.038462,17.038462,2022-05-27,2023-07-17,416,False,1.748879,False
7,Abdulla,Unknown,3,90,36.0,48,0.0,0.0,17.333333,2024-02-14,2024-05-04,80,False,0.818182,False
8,Abel,Unknown,96,9438,41.5,925,0.041667,0.020833,18.458333,2022-01-01,2024-06-08,889,False,3.133841,True
9,Abraham,Unknown,1,11,11.0,11,1.0,0.0,11.0,2024-02-15,2024-02-15,0,True,1.0,False


In [7]:
# Summary statistics for the numeric and datetime feature columns.
contact_features.describe()

Unnamed: 0,total_calls,total_duration,avg_duration,max_duration,pct_business_hours,pct_late_night,avg_hour,first_call,last_call,days_active,calls_per_month
count,2091.0,2091.0,2091.0,2091.0,2091.0,2091.0,2091.0,2091,2091,2091.0,2091.0
mean,11.933046,1230.779531,82.023195,217.619799,0.696408,0.009625,13.02579,2023-03-15 02:50:06.025825024,2023-06-24 14:48:22.725968384,101.498804,1.896536
min,1.0,1.0,1.0,1.0,0.0,0.0,6.0,2022-01-01 00:00:00,2022-01-03 00:00:00,0.0,0.076923
25%,1.0,33.0,13.0,28.0,0.369318,0.0,11.0,2022-06-13 00:00:00,2022-09-26 00:00:00,0.0,1.0
50%,2.0,91.0,38.5,68.0,1.0,0.0,13.0,2023-03-11 00:00:00,2023-07-17 00:00:00,0.0,1.0
75%,4.0,272.0,78.25,156.0,1.0,0.0,15.0,2023-12-08 00:00:00,2024-03-18 00:00:00,36.0,2.0
max,2413.0,222060.0,4801.0,7200.0,1.0,1.0,23.0,2024-10-04 00:00:00,2024-10-04 00:00:00,1006.0,74.018405
std,83.240973,9477.446439,216.531427,604.875805,0.406772,0.075022,2.685936,,,223.114163,3.491529


### Feature Distributions by Category

In [8]:
# Per-category averages of the contact-level features, rounded for display.
category_summary = {
    'num_contacts': ('name', 'count'),
    'avg_calls': ('total_calls', 'mean'),
    # Mean across contacts of each contact's median call duration.
    'avg_duration': ('avg_duration', 'mean'),
    'avg_pct_business': ('pct_business_hours', 'mean'),
    'avg_pct_late_night': ('pct_late_night', 'mean'),
    # Mean of the boolean flag = share of one-off contacts in the category.
    'pct_one_off': ('is_one_off', 'mean'),
}

contact_features.groupby('category').agg(**category_summary).round(3)

Unnamed: 0_level_0,num_contacts,avg_calls,avg_duration,avg_pct_business,avg_pct_late_night,pct_one_off
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Family,17,578.059,40.5,0.508,0.032,0.0
Important Contacts,6,97.333,40.5,0.504,0.052,0.167
Service Provider,51,5.941,442.912,0.844,0.009,0.353
Supplier,40,66.475,97.95,0.924,0.0,0.175
Unknown,1977,5.857,72.874,0.69,0.009,0.464


### Our Muffens

How do Duma and Eric (along with Alli, Park, and Zohra) look in the feature space?

In [9]:
# Contacts of interest to inspect in the feature table.
muffens = ['Duma', 'Eric', 'Alli', 'Park', 'Zohra']

is_muffen = contact_features['name'].isin(muffens)
contact_features.loc[is_muffen]

Unnamed: 0,name,category,total_calls,total_duration,avg_duration,max_duration,pct_business_hours,pct_late_night,avg_hour,first_call,last_call,days_active,is_one_off,calls_per_month,is_active
32,Alli,Unknown,905,44383,22.0,2068,0.659669,0.0,12.898343,2022-01-03,2024-09-28,999,False,26.38484,True
320,Duma,Unknown,210,14915,24.5,1769,0.304762,0.128571,14.038095,2022-01-16,2022-11-03,291,False,19.626168,False
389,Eric,Unknown,219,21099,27.0,1722,0.310502,0.041096,14.305936,2022-01-17,2024-08-01,927,False,6.865204,True
1246,Park,Unknown,287,6937,5.0,372,0.728223,0.0,13.393728,2022-02-28,2023-10-10,589,False,13.909532,False
2062,Zohra,Unknown,203,16865,40.0,818,0.689655,0.0,12.098522,2023-03-12,2024-07-09,485,False,11.825243,True


### Save Features

In [10]:
# Write the contact-level feature table for the modeling notebooks.
# The output folder is created first: to_csv does not create missing parent
# directories, so on a fresh checkout the save would otherwise fail.
features_dir = project_root / 'data' / 'features'
features_dir.mkdir(parents=True, exist_ok=True)

contact_features.to_csv(features_dir / 'contact_features.csv', index=False)

---

## Summary

Created 14 features for 2,091 contacts:

| Feature | Description |
|---------|-------------|
| `category` | Original label (target for supervised learning) |
| `total_calls` | Number of calls to this contact |
| `total_duration` | Total seconds spent on calls |
| `avg_duration` | Median call length in seconds (named `avg_`, but computed as the median) |
| `max_duration` | Longest call |
| `pct_business_hours` | Fraction of calls (0-1) during 8-17 Mon-Fri |
| `pct_late_night` | Fraction of calls (0-1) between 21:00-06:00 |
| `avg_hour` | Average hour of day |
| `first_call` | Relationship start date |
| `last_call` | Most recent call |
| `days_active` | Span of relationship in days |
| `is_one_off` | True if only called once |
| `calls_per_month` | Calls per 30-day month: `total_calls / (days_active / 30 + 1)`, i.e. the active span padded by one month |
| `is_active` | True if the most recent call was on or after 2024-06-01 |

