In [1]:
# Setup
import os

import pandas as pd

# Keep rendered frames short: show at most 10 rows before truncating
pd.set_option('display.max_rows', 10)
# Load both CSVs from the data directory, using the first column as the index
users, transactions = (
    pd.read_csv(os.path.join('data', csv_name), index_col=0)
    for csv_name in ('users.csv', 'transactions.csv')
)
# Sanity check: number of rows loaded into each frame
len(users), len(transactions)

(475, 998)

In [2]:
# Inspect the column types pandas inferred on load. Note that sent_date comes
# back as object (plain strings), not as a datetime column.
transactions.dtypes

sender        object
receiver      object
amount       float64
sent_date     object
dtype: object

In [3]:
# Group the transactions by receiver. Nothing is aggregated yet: the groupby
# object only computes results once a method such as size() or sum() is called.
grouped_by_receiver = transactions.groupby('receiver')

# Let's see what type of object we got back
type(grouped_by_receiver)

pandas.core.groupby.generic.DataFrameGroupBy

In [4]:
# size() returns a Series indexed by receiver whose values are the number of
# rows (transactions) in each group
grouped_by_receiver.size()

receiver
aaron            6
acook            1
adam.saunders    2
adrian           3
adrian.blair     7
                ..
wilson           2
wking            2
wright3590       4
young            2
zachary.neal     4
Length: 410, dtype: int64

In [5]:
# count() returns a DataFrame with, for each receiver, the number of non-null
# values in every remaining column. Unlike size(), missing values are excluded,
# so the two only agree where a column has no missing data.
grouped_by_receiver.count()

Unnamed: 0_level_0,sender,amount,sent_date
receiver,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaron,6,6,6
acook,1,1,1
adam.saunders,2,2,2
adrian,3,3,3
adrian.blair,7,7,7
...,...,...,...
wilson,2,2,2
wking,2,2,2
wright3590,4,4,4
young,2,2,2


In [6]:
# Total amount received per receiver. numeric_only=True restricts the sum to
# numeric columns; without it, pandas 2.0+ also "sums" the string columns
# (sender, sent_date) by concatenating them, which is not what we want here.
grouped_by_receiver.sum(numeric_only=True)

Unnamed: 0_level_0,amount
receiver,Unnamed: 1_level_1
aaron,366.15
acook,94.65
adam.saunders,101.15
adrian,124.36
adrian.blair,462.88
...,...
wilson,44.39
wking,74.07
wright3590,195.45
young,83.57


In [8]:
# Add a transaction_count column to users holding the size of each receiver's
# group. The assignment aligns on index label, so every user is matched to the
# group that carries the same name.
users['transaction_count'] = grouped_by_receiver.size()
# Users who never received a transaction have no matching group and end up
# with a missing value; count how many of those there are.
int(users['transaction_count'].isna().sum())

65

In [9]:
# A missing count means the user never appeared as a receiver, so record it as 0.
# Assign the filled column back instead of calling fillna(inplace=True) on
# `users.transaction_count`: the in-place call on a column pulled out of the
# frame is chained assignment, which pandas 2.x warns about and which does not
# update `users` at all once Copy-on-Write is enabled.
users['transaction_count'] = users['transaction_count'].fillna(0)
users

Unnamed: 0,first_name,last_name,email,email_verified,signup_date,referral_count,balance,transaction_count
aaron,Aaron,Davis,aaron6348@gmail.com,True,2018-08-31,6,18.14,6.0
acook,Anthony,Cook,cook@gmail.com,True,2018-05-12,2,55.45,1.0
adam.saunders,Adam,Saunders,adam@gmail.com,False,2018-05-29,3,72.12,2.0
adrian,Adrian,Fang,adrian.fang@teamtreehouse.com,True,2018-04-28,3,30.01,3.0
adrian.blair,Adrian,Blair,adrian9335@gmail.com,True,2018-06-16,7,25.85,7.0
...,...,...,...,...,...,...,...,...
wilson,Robert,Wilson,robert@yahoo.com,False,2018-05-16,5,59.75,2.0
wking,Wanda,King,wanda.king@holt.com,True,2018-06-01,2,67.08,2.0
wright3590,Jacqueline,Wright,jacqueline.wright@gonzalez.com,True,2018-02-08,6,18.48,4.0
young,Jessica,Young,jessica4028@yahoo.com,True,2018-07-17,4,75.39,2.0


In [10]:
# Counts are whole numbers; the column is float64 only because it held NaN
# earlier, so cast it to int64
users['transaction_count'] = users['transaction_count'].astype('int64')

In [11]:
# Order users in place: highest transaction_count first, ties broken by
# first_name in ascending (A to Z) order
users.sort_values(
    by=['transaction_count', 'first_name'],
    ascending=[False, True],
    inplace=True,
)
# Top 10 receivers, restricted to the columns we care about
users[['first_name', 'last_name', 'email', 'transaction_count']].head(10)

Unnamed: 0,first_name,last_name,email,transaction_count
scott3928,Scott,,scott@yahoo.com,9
sfinley,Samuel,Finley,samuel@gmail.com,8
adrian.blair,Adrian,Blair,adrian9335@gmail.com,7
hdeleon,Hannah,Deleon,hannah@yahoo.com,7
miranda6426,Miranda,Rogers,miranda.rogers@gmail.com,7
aaron,Aaron,Davis,aaron6348@gmail.com,6
corey,Corey,Fuller,fuller8100@yahoo.com,6
heather,Heather,Ray,hray@yahoo.com,6
jennifer.hebert,Jennifer,Hebert,jennifer.hebert@yahoo.com,6
edwards,Michael,Edwards,edwards5456@gmail.com,6
