# Creating dataset

This notebook creates and explores the dataset that we will use for machine learning.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three final CSV files (offer portfolio, event transcript, customer
# profile) into DataFrames, in that order.
portfolio, transcript, profile = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/final/portfolio.csv',
        './data/final/transcript.csv',
        './data/final/profile.csv',
    )
)

## Profile data

Customer gender is represented as `F`, `M`, or `O`; let's one-hot encode it. As always, the column for the first category is dropped to avoid the dummy variable trap.

In [3]:
# Encode gender as indicator columns. `drop_first=True` omits the column for
# the first category so the remaining indicators are not perfectly collinear.
# The dtype is pinned explicitly: pandas >= 2.0 returns booleans by default,
# whereas the output recorded below shows 0/1 integers (np.uint8 was the
# default before 2.0).
gender_encoded = pd.get_dummies(
    profile['gender'],
    drop_first=True,
    dtype=np.uint8
)

# Merge back to profile data, leaving out the raw `gender` column that the
# indicator columns replace. `profile` itself is not modified, so this cell
# can be re-run safely.
profile_gender_encoded = pd.concat(
    [profile.drop(columns='gender'), gender_encoded],
    axis=1
)

# Show
profile_gender_encoded.head(10)

Unnamed: 0,age,id,income,M,O
0,55.0,0610b486422d4921ae7d2bf64640c50b,112000.0,0,0
1,75.0,78afa995795e4d85b5d9ceeca43f5fef,100000.0,0,0
2,68.0,e2127556f4f64592b11af22de27a7932,70000.0,1,0
3,65.0,389bc3fa690240e798340f5a15918d5c,53000.0,1,0
4,58.0,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,1,0
5,61.0,aa4862eba776480b8bb9c68455b8c2e1,57000.0,0,0
6,26.0,e12aeaf2d47d42479ea1c4ac3d8286c6,46000.0,1,0
7,62.0,31dda685af34476cad5bc968bdb01c53,71000.0,0,0
8,49.0,62cf5e10845442329191fc246e7bcea3,52000.0,1,0
9,57.0,6445de3b47274c759400cd68131d91b4,42000.0,1,0


## Portfolio data

In [6]:
# Build a lookup from offer id to that offer's `duration` value.
# Each row of the two-column array is consumed as a (key, value) pair.
portfolio_validity = dict(portfolio[['id', 'duration']].values)

portfolio_validity

{'ae264e3637204a6fb9bb56bc8210ddfd': 168,
 '4d5c57ea9a6940dd891ad53e9dbe8da0': 120,
 '3f207df678b143eea3cee63160fa8bed': 96,
 '9b98b8c7a33c4b65b9aebfe6a799e6d9': 168,
 '0b1e1539f2cc45b7b9fa7c272da2e1d7': 240,
 '2298d6c36e964ae4a3e7e9706d1fb8c2': 168,
 'fafdcd668e3743c1bb461111dcafc2a4': 240,
 '5a8bc65990b245e5a138643cd4eb9837': 72,
 'f19421c1d4aa40978ebb69ca19b0e20d': 120,
 '2906b810c7d4411798c6938adc9daaa5': 168}

## Transcript data

In [16]:
# Preview the first ten rows of the transcript data
transcript.head(10)

Unnamed: 0,amount,reward,person,event,time,offer
12650,,,389bc3fa690240e798340f5a15918d5c,offer viewed,0,f19421c1d4aa40978ebb69ca19b0e20d
12651,,,d1ede868e29245ea91818a903fec04c6,offer viewed,0,5a8bc65990b245e5a138643cd4eb9837
12652,,,102e9454054946fda62242d2e176fdce,offer viewed,0,4d5c57ea9a6940dd891ad53e9dbe8da0
12653,,,02c083884c7d45b39cc68e1314fec56c,offer viewed,0,ae264e3637204a6fb9bb56bc8210ddfd
12654,0.83,,02c083884c7d45b39cc68e1314fec56c,transaction,0,
12655,,,be8a5d1981a2458d90b255ddc7e0d174,offer viewed,0,5a8bc65990b245e5a138643cd4eb9837
12656,,,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,offer viewed,0,2906b810c7d4411798c6938adc9daaa5
12657,34.56,,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,0,
12658,,2.0,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,offer completed,0,2906b810c7d4411798c6938adc9daaa5
12659,13.23,,54890f68699049c2a04d415abc25e717,transaction,0,


In [19]:
# Project-local helpers from `src.utils.parse_transcript`; their implementation
# is not shown in this notebook.
from src.utils.parse_transcript import parse_transcript, get_transaction_dataframe

# NOTE(review): presumably parse_transcript uses the offer -> duration mapping to
# relate transcript events to offers — confirm against the helper's source.
transaction_list = parse_transcript(transcript, portfolio_validity)
# Turn the parsed result into a DataFrame (going by the helper's name — TODO confirm its schema).
transaction_df = get_transaction_dataframe(transaction_list)

# Preview the first ten rows of the result
transaction_df.head(10)