# Exploratory Analysis for Starbucks Data and Preprocessing

In [3]:
import os

In [4]:
os.listdir()

['.ipynb_checkpoints',
 'Original Datasources (Do Not Modify)',
 'portfolio.csv',
 'profile.csv',
 'transcript.csv',
 'Untitled.ipynb']

Please verify that you've unzipped the file archive & that portfolio.csv, profile.csv, and transcript.csv are all present in your current directory.

## Loading data

In [7]:
import pandas as pd

In [100]:
port = pd.read_csv('portfolio.csv', index_col = 0) # index col = 0 since the index is unnamed in the csvs
prof = pd.read_csv('profile.csv', index_col = 0)
tran = pd.read_csv('transcript.csv', index_col = 0)

## Examining and Preprocessing Data

### Profile Data

In [28]:
prof.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [30]:
prof.isna().sum() # Looks like we are missing the same amount of gender / income.

gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

In [42]:
(prof['age'] == 118).value_counts()

False    14825
True      2175
Name: age, dtype: int64

In [50]:
print('Missing data makes up about {}% of total profile rows'.format(round(2175/(14825+2175), 3) * 100))

Missing data makes up about 12.8% of total profile rows


So, all of the missing values occur on the same rows - where one column is missing data, so are the others (gender, age, income). For now, lets just drop them.

In [56]:
prof.dropna(inplace = True)

In [63]:
prof.isna().any() # Verify there are no missing values in prof

gender              False
age                 False
id                  False
became_member_on    False
income              False
dtype: bool

### Transcript Data

In [101]:
tran.isna().sum() # No missing values for tran file!

person    0
event     0
value     0
time      0
dtype: int64

In [102]:
tran.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [103]:
tran['value'].value_counts()

{'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'}    14983
{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'}    14924
{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'}    14891
{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'}    14835
{'offer id': 'ae264e3637204a6fb9bb56bc8210ddfd'}    14374
                                                    ...  
{'amount': 54.99}                                       1
{'amount': 49.94}                                       1
{'amount': 65.33}                                       1
{'amount': 508.19}                                      1
{'amount': 40.89}                                       1
Name: value, Length: 5121, dtype: int64

In [104]:
tran[tran['event'] == 'transaction']

Unnamed: 0,person,event,value,time
12654,02c083884c7d45b39cc68e1314fec56c,transaction,{'amount': 0.8300000000000001},0
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,{'amount': 34.56},0
12659,54890f68699049c2a04d415abc25e717,transaction,{'amount': 13.23},0
12670,b2f1cd155b864803ad8334cdf13c4bd2,transaction,{'amount': 19.51},0
12671,fe97aa22dd3e48c8b143116a8403dd52,transaction,{'amount': 18.97},0
...,...,...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714


In [105]:
tran['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [106]:
def parse_value():
    pass

In [127]:
import ast

In [137]:
tran['offer_id'] = None
tran['amount'] = None

for i in tran.iterrows():
    if i[1]['event'] == 'offer received':
        tran.at[i[0], 'offer_id'] = ast.literal_eval(i[1]['value'])['offer id']
    elif i[1]['event'] == 'offer viewed':
        tran.at[i[0], 'offer_id'] = ast.literal_eval(i[1]['value'])['offer id']
    elif i[1]['event'] == 'offer completed':
        pass
    elif i[1]['event'] == 'transaction':
        tran.at[i[0], 'amount'] = ast.literal_eval(i[1]['value'])['amount']

In [97]:
tran['person'].value_counts()

94de646f7b6041228ca7dec82adb97d2    51
8dbfa485249f409aa223a2130f40634a    49
5e60c6aa3b834e44b822ea43a3efea26    48
d0a80415b84c4df4908b8403b19765e3    48
79d9d4f86aca4bed9290350fb43817c2    48
                                    ..
3a4e53046c544134bb1e7782248631d1     2
e63e42480aae4ede9f07cac49c8c3f78     2
912b9f623b9e4b4eb99b6dc919f09a93     2
fccc9279ba56411f80ffe8ce7e0935cd     2
da7a7c0dcfcb41a8acc7864a53cf60fb     1
Name: person, Length: 17000, dtype: int64

In [138]:
tran[tran['person'] == '94de646f7b6041228ca7dec82adb97d2']

Unnamed: 0,person,event,value,time,offer_id,amount
2276,94de646f7b6041228ca7dec82adb97d2,offer received,{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'},0,f19421c1d4aa40978ebb69ca19b0e20d,
16010,94de646f7b6041228ca7dec82adb97d2,offer viewed,{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'},6,f19421c1d4aa40978ebb69ca19b0e20d,
24531,94de646f7b6041228ca7dec82adb97d2,transaction,{'amount': 7.41},30,,7.41
24532,94de646f7b6041228ca7dec82adb97d2,offer completed,{'offer_id': 'f19421c1d4aa40978ebb69ca19b0e20d...,30,,
42252,94de646f7b6041228ca7dec82adb97d2,transaction,{'amount': 1.47},102,,1.47
55475,94de646f7b6041228ca7dec82adb97d2,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},168,9b98b8c7a33c4b65b9aebfe6a799e6d9,
75256,94de646f7b6041228ca7dec82adb97d2,offer viewed,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},186,9b98b8c7a33c4b65b9aebfe6a799e6d9,
77624,94de646f7b6041228ca7dec82adb97d2,transaction,{'amount': 2.62},192,,2.62
81725,94de646f7b6041228ca7dec82adb97d2,transaction,{'amount': 0.59},204,,0.59
93913,94de646f7b6041228ca7dec82adb97d2,transaction,{'amount': 2.2800000000000002},246,,2.28
