# Exploratory Analysis for Starbucks Data and Preprocessing

In [1]:
import os

In [2]:
os.listdir()

['.ipynb_checkpoints',
 'Initial Analysis.ipynb',
 'Original Datasources (Do Not Modify)',
 'portfolio.csv',
 'profile.csv',
 'transcript.csv']

Please verify that you've unzipped the file archive & that portfolio.csv, profile.csv, and transcript.csv are all present in your current directory.

## Loading data

In [3]:
import pandas as pd

In [4]:
port = pd.read_csv('portfolio.csv', index_col = 0) # index col = 0 since the index is unnamed in the csvs
prof = pd.read_csv('profile.csv', index_col = 0)
tran = pd.read_csv('transcript.csv', index_col = 0)

## Examining and Preprocessing Data

### Profile Data

In [5]:
prof.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [6]:
prof.isna().sum() # Looks like we are missing the same amount of gender / income.

gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

In [7]:
(prof['age'] == 118).value_counts()

False    14825
True      2175
Name: age, dtype: int64

In [8]:
print('Missing data makes up about {}% of total profile rows'.format(round(2175/(14825+2175), 3) * 100))

Missing data makes up about 12.8% of total profile rows


So, all of the missing values occur on the same rows - where one column is missing data, so are the others (gender, age, income). For now, lets just drop them.

In [9]:
prof.dropna(inplace = True)

In [10]:
prof.isna().any() # Verify there are no missing values in prof

gender              False
age                 False
id                  False
became_member_on    False
income              False
dtype: bool

### Transcript Data

In [11]:
tran.isna().sum() # No missing values for tran file!

person    0
event     0
value     0
time      0
dtype: int64

In [12]:
tran.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [13]:
tran['value'].value_counts()

{'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'}    14983
{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'}    14924
{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'}    14891
{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'}    14835
{'offer id': 'ae264e3637204a6fb9bb56bc8210ddfd'}    14374
                                                    ...  
{'amount': 62.25}                                       1
{'amount': 82.56}                                       1
{'amount': 635.14}                                      1
{'amount': 42.33}                                       1
{'amount': 105.86}                                      1
Name: value, Length: 5121, dtype: int64

In [14]:
tran[tran['event'] == 'transaction']

Unnamed: 0,person,event,value,time
12654,02c083884c7d45b39cc68e1314fec56c,transaction,{'amount': 0.8300000000000001},0
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,{'amount': 34.56},0
12659,54890f68699049c2a04d415abc25e717,transaction,{'amount': 13.23},0
12670,b2f1cd155b864803ad8334cdf13c4bd2,transaction,{'amount': 19.51},0
12671,fe97aa22dd3e48c8b143116a8403dd52,transaction,{'amount': 18.97},0
...,...,...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714


In [15]:
tran['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [22]:
import ast

In [23]:
def parse_value():
    if i[1]['event'] == 'offer received':
        tran.at[i[0], 'offer_id'] = ast.literal_eval(i[1]['value'])['offer id']
    elif i[1]['event'] == 'offer viewed':
        tran.at[i[0], 'offer_id'] = ast.literal_eval(i[1]['value'])['offer id']
    elif i[1]['event'] == 'offer completed':
        tran.at[i[0], 'reward'] = ast.literal_eval(i[1]['value'])['reward']
        tran.at[i[0], 'offer_id'] = ast.literal_eval(i[1]['value'])['offer_id']
    elif i[1]['event'] == 'transaction':
        tran.at[i[0], 'amount'] = ast.literal_eval(i[1]['value'])['amount']

In [24]:
tran['offer_id'] = None
tran['amount'] = None
tran['reward'] = None

for i in tran.iterrows():
    parse_value()

In [46]:
transactions_only = tran[tran['event'] == 'transaction'].copy().drop(['value','offer_id', 'reward'], axis = 1)
completed_only = tran[tran['event'] == 'offer completed'].copy().drop(['value', 'amount'], axis = 1)
received_only = tran[tran['event'] == 'offer received'].copy().drop(['value', 'amount', 'reward'], axis = 1)
viewed_only = tran[tran['event'] == 'offer viewed'].copy().drop(['value', 'amount', 'reward'], axis = 1)

In [47]:
tran_updated = pd.merge(transactions_only, completed_only, how = 'left', left_on = ['person', 'time'], right_on = ['person', 'time'])

In [48]:
tran_updated = pd.merge(tran_updated, received_only, how = 'left', left_on = ['person', 'offer_id'], right_on = ['person', 'offer_id'])

In [49]:
tran_updated = pd.merge(tran_updated, viewed_only, how = 'left', left_on = ['person', 'offer_id'], right_on = ['person', 'offer_id'])

In [50]:
tran_updated

Unnamed: 0,person,event_x,time_x,amount,event_y,offer_id,reward,event_x.1,time_y,event_y.1,time
0,02c083884c7d45b39cc68e1314fec56c,transaction,0,0.83,,,,,,,
1,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,0,34.56,offer completed,2906b810c7d4411798c6938adc9daaa5,2,offer received,0.0,offer viewed,0.0
2,54890f68699049c2a04d415abc25e717,transaction,0,13.23,,,,,,,
3,b2f1cd155b864803ad8334cdf13c4bd2,transaction,0,19.51,,,,,,,
4,fe97aa22dd3e48c8b143116a8403dd52,transaction,0,18.97,offer completed,fafdcd668e3743c1bb461111dcafc2a4,2,offer received,0.0,offer viewed,6.0
...,...,...,...,...,...,...,...,...,...,...,...
175728,b3a1272bc9904337b331bf348c3e8c17,transaction,714,1.59,,,,,,,
175729,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,714,9.53,,,,,,,
175730,a00058cf10334a308c68e7631c529907,transaction,714,3.61,,,,,,,
175731,76ddbd6576844afe811f1a3c0fbb5bec,transaction,714,3.53,,,,,,,


tran_updated is one row per transaction (including transactions that were never related to offers).

offer_updated is every offer that was sent out (including offers that were never turned into transactions).

In [52]:
offer_updated = pd.merge(received_only,viewed_only, how = 'left', left_on = ['person', 'offer_id'], right_on = ['person', 'offer_id'])

In [54]:
offer_updated = pd.merge(offer_updated, completed_only,  how = 'left', left_on = ['person', 'offer_id'], right_on = ['person', 'offer_id'])

In [56]:
offer_updated = pd.merge(offer_updated, transactions_only, how = 'left', left_on = ['person', 'time'], right_on = ['person', 'time'])

In [57]:
offer_updated

Unnamed: 0,person,event_x,time_x,offer_id,event_y,time_y,event_x.1,time,reward,event_y.1,amount
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,offer viewed,6.0,offer completed,132.0,5,transaction,19.89
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,offer viewed,6.0,,,,,
2,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,offer viewed,624.0,,,,,
3,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,offer viewed,18.0,,,,,
4,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,offer viewed,12.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
115604,d087c473b4d247ccb0abfef59ba12b0e,offer received,576,ae264e3637204a6fb9bb56bc8210ddfd,offer viewed,672.0,offer completed,636.0,10,transaction,20.78
115605,cb23b66c56f64b109d673d5e56574529,offer received,576,2906b810c7d4411798c6938adc9daaa5,,,offer completed,156.0,2,transaction,21.62
115606,6d5f3a774f3d4714ab0c092238f3a1d7,offer received,576,2298d6c36e964ae4a3e7e9706d1fb8c2,offer viewed,588.0,,,,,
115607,9dc1421481194dcd9400aec7c9ae6366,offer received,576,ae264e3637204a6fb9bb56bc8210ddfd,offer viewed,624.0,offer completed,594.0,10,transaction,12.57
