# Starbucks Capstone

## Loading the Data

Let's load the provided data into some pandas DataFrames and gather some basic information about each.

In [None]:
import pandas as pd
import numpy as np
import math
import json

import matplotlib.pyplot as plt
% matplotlib inline

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

In [None]:
# Let's print some information about each of our files.
# Start with the first rows of the portfolio DataFrame.
portfolio.head()

In [None]:
# Show the first rows of the profile DataFrame
profile.head()

In [None]:
# Show the first rows of the transcript DataFrame
transcript.head()

## Data successfully loaded

We've now had a peek at each of the DataFrames that were read in. Let's gather some exploratory information about the breakdown of a few of the stats.

In [None]:
# Count how many profile rows have no income value (vectorized null count)
income_unavailable = profile['income'].isnull().sum()
print('Income reported: ', len(profile) - income_unavailable)
print('Income unreported: ', income_unavailable)

# NOTE: dropna(axis=0) removes every row that has a missing value in ANY
# column, not only rows whose income is missing.
clean_profile = profile.dropna(axis=0)
column_name = 'income'

# Let's see an income breakdown and plot it.
# plt.subplots returns a (figure, axes) pair, so unpack both and draw the
# histogram through the axes object instead of rebinding a tuple.
fig, ax = plt.subplots(figsize=(6, 3))
# get data by column_name and display a histogram
ax.hist(clean_profile[column_name], bins=30)
title = f'Histogram of {column_name} among reporters'
ax.set_title(title, fontsize=12)
plt.show()
    

In [None]:
# Break the customer profiles down by their recorded gender value.
# Summing a boolean mask counts the rows where the comparison is true.
gender = profile['gender']
print('Total: ', len(profile))
print('Women: ', (gender == 'F').sum())
print('Men: ', (gender == 'M').sum())
print('Other: ', (gender == 'O').sum())
print('None: ', gender.isnull().sum())

In [None]:
# Let's see which event types are available and how often each one occurs
types = transcript.event.unique()
for event in types:
    # Summing the boolean mask counts the transcript rows with this event
    occurrences = (transcript['event'] == event).sum()
    print(event, '    \t:\t', occurrences)

### Thinking about how to proceed with data pre-processing

At this point, we're ready to start transforming our data so that we get as much value as possible out of the Principal Component Analysis.

Something we want to keep in mind is that our data needs to stay organized per customer. In order to do that, we'll have to make some modifications to the profile DataFrame and include various statistics derived from the other data.

In [None]:
def user_stats_df(df):
    """Return a copy of ``df`` with per-customer event counts added.

    The counts are taken from the module-level ``transcript`` DataFrame,
    whose ``person`` values are matched against the ``id`` column of ``df``.

    Four integer columns are appended, in this order:

    * ``received``     - number of 'offer received' events
    * ``viewed``       - number of 'offer viewed' events
    * ``transactions`` - number of 'transaction' events
    * ``completed``    - number of 'offer completed' events

    Customers with no matching transcript rows get a count of 0. Event
    labels other than the four above are ignored.

    Args:
        df: DataFrame with an ``id`` column identifying each customer.

    Returns:
        A new DataFrame; the caller's ``df`` is left unmodified.
    """
    # Work on a real copy so the caller's DataFrame is not mutated
    new_df = df.copy()

    # transcript event label -> name of the column added to the result
    column_for_event = {
        'offer received': 'received',
        'offer viewed': 'viewed',
        'transaction': 'transactions',
        'offer completed': 'completed',
    }

    # Count every (person, event) pair in a single pass over the transcript
    # instead of filtering the whole transcript once per customer. The result
    # has one row per person and one column per event label; reindexing the
    # columns guarantees all four labels exist even if one never occurs.
    event_counts = (
        transcript.groupby(['person', 'event'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=list(column_for_event), fill_value=0)
    )

    # Now add each column, looking the counts up by customer id
    for event, column in column_for_event.items():
        new_df[column] = (
            new_df['id'].map(event_counts[event]).fillna(0).astype('int64')
        )

    return new_df

In [None]:
%%time

# Build the per-customer event-count table from the profile data;
# the %%time cell magic above reports how long the cell takes to run.
result = user_stats_df(profile)

In [None]:
# Print the number of rows in the result, then show its first ten rows
print(len(result))
result.head(10)

In [None]:
## TODO

# 1) Convert membership date to age

# 2) need an offer df? columns: id, person, number_of_views, initial_time_to_view, time_to_complete
#    avg_response_time: add a column for avg offer age when viewed (time(viewed) - time(received))
#    avg_completion_time: add a column for avg offer age when completed (time(completed) - time(viewed))
