In [4]:
# Setup
import os
import pandas as pd

# We use os.path.join because Windows uses a backslash (\) to separate directories
# while other systems use a forward slash (/)
users_file_name = os.path.join('data', 'users.csv')
# Display the resulting path
users_file_name

'data/users.csv'

In [3]:
# Open the file and print out at most its first 5 lines
with open(users_file_name) as lines:
    # The file object is an iterator over its lines. Pairing it with range(5)
    # through zip stops after 5 lines, or earlier when the file is shorter,
    # instead of raising StopIteration the way a bare next(lines) would.
    for _, line in zip(range(5), lines):
        # Each line keeps its trailing newline and print() adds another one,
        # which is why the output shows a blank line after every row.
        print(line)

,first_name,last_name,email,email_verified,signup_date,referral_count,balance

aaron,Aaron,Davis,aaron6348@gmail.com,True,2018-08-31,6,18.14

acook,Anthony,Cook,cook@gmail.com,True,2018-05-12,2,55.45

adam.saunders,Adam,Saunders,adam@gmail.com,False,2018-05-29,3,72.12

adrian,Adrian,Fang,adrian.fang@teamtreehouse.com,True,2018-04-28,3,30.01



In [5]:
# Create a new `DataFrame` and use the first column as the index (index_col=0).
# In the raw file that column has an empty header.
users = pd.read_csv(users_file_name, index_col=0)

In [6]:
# Show the first rows of the frame (head() defaults to 5 rows)
users.head()

Unnamed: 0,first_name,last_name,email,email_verified,signup_date,referral_count,balance
aaron,Aaron,Davis,aaron6348@gmail.com,True,2018-08-31,6,18.14
acook,Anthony,Cook,cook@gmail.com,True,2018-05-12,2,55.45
adam.saunders,Adam,Saunders,adam@gmail.com,False,2018-05-29,3,72.12
adrian,Adrian,Fang,adrian.fang@teamtreehouse.com,True,2018-04-28,3,30.01
adrian.blair,Adrian,Blair,adrian9335@gmail.com,True,2018-06-16,7,25.85


In [7]:
# The Pythonic approach still works: len() of a DataFrame is its number of rows
len(users)

475

In [8]:
# (number of rows, number of columns); the index is not counted as a column
users.shape

(475, 7)

In [9]:
# Returns how many non-missing values there are in each column.
# Missing data is represented as np.nan and is not counted.
users.count()

first_name        475
last_name         430
email             475
email_verified    475
signup_date       475
referral_count    475
balance           475
dtype: int64

In [10]:
# The data type pandas inferred for each column (text columns show up as `object`)
users.dtypes

first_name         object
last_name          object
email              object
email_verified       bool
signup_date        object
referral_count      int64
balance           float64
dtype: object

In [11]:
# Summary statistics; by default describe() only covers the numeric columns
users.describe()

Unnamed: 0,referral_count,balance
count,475.0,475.0
mean,3.429474,49.933263
std,2.281085,28.280448
min,0.0,0.05
25%,2.0,25.305
50%,3.0,51.57
75%,5.0,74.48
max,7.0,99.9


In [12]:
# The mean (average) of each numeric or bool column.
# numeric_only=True restricts the reduction to those columns explicitly:
# pandas 2.0+ no longer skips the text columns on its own and raises a
# TypeError when asked to average them.
users.mean(numeric_only=True)

email_verified     0.818947
referral_count     3.429474
balance           49.933263
dtype: float64

In [15]:
# Standard deviation of each numeric or bool column.
# numeric_only=True is required on pandas 2.0+, where the text columns are
# no longer skipped automatically and would raise a TypeError.
users.std(numeric_only=True)

email_verified     0.385468
referral_count     2.281085
balance           28.280448
dtype: float64

In [14]:
# The minimum of each column (text columns are compared lexicographically).
# last_name has missing values, and a text column that also holds np.nan
# cannot be ordered. Older pandas silently left such a column out of the
# result; pandas 2.0+ raises a TypeError instead. Dropping the incomplete
# columns first makes that omission explicit and works on both.
users.dropna(axis='columns').min()

first_name                       Aaron
email             aalvarez@hotmail.com
email_verified                   False
signup_date                 2018-01-01
referral_count                       0
balance                           0.05
dtype: object

In [16]:
# The maximum of each column (text columns are compared lexicographically).
# last_name has missing values, and a text column that also holds np.nan
# cannot be ordered. Older pandas silently left such a column out of the
# result; pandas 2.0+ raises a TypeError instead. Dropping the incomplete
# columns first makes that omission explicit and works on both.
users.dropna(axis='columns').max()

first_name                Zachary
email             zneal@gmail.com
email_verified               True
signup_date            2018-09-25
referral_count                  7
balance                      99.9
dtype: object

In [17]:
# Every column of a DataFrame is a Series, so Series methods are available
# on it: value_counts() tallies how often each distinct value occurs.
users['email_verified'].value_counts()

True     389
False     86
Name: email_verified, dtype: int64

In [18]:
# The five most common first names (value_counts() lists the most frequent first)
users['first_name'].value_counts().head(5)

Mark        11
David       10
Michael      9
Joshua       7
Jennifer     7
Name: first_name, dtype: int64

In [19]:
# sort_values returns a new, sorted DataFrame and leaves `users` as it was.
# ascending defaults to True, so pass False to list the largest balances first.
users.sort_values('balance', ascending=False).head(5)

Unnamed: 0,first_name,last_name,email,email_verified,signup_date,referral_count,balance
twhite,Timothy,White,white5136@hotmail.com,True,2018-07-06,5,99.9
karen.snow,Karen,Snow,ksnow@yahoo.com,True,2018-05-06,2,99.38
king,Billy,King,billy.king@hotmail.com,True,2018-05-29,4,98.8
king3246,Brittney,King,brittney@yahoo.com,True,2018-04-15,6,98.79
crane203,Valerie,Crane,valerie7051@hotmail.com,True,2018-05-12,3,98.69


In [20]:
# sort_values normally returns a sorted copy; inplace=True reorders `users` itself.
# Sort by last_name first, then by first_name within equal last names.
# na_position='last' is the default, spelled out here: np.nan goes to the end.
users.sort_values(['last_name', 'first_name'], na_position='last', inplace=True)
# `users` is now in the new order
users.head(5)

Unnamed: 0,first_name,last_name,email,email_verified,signup_date,referral_count,balance
darlene.adams,Darlene,Adams,adams@hotmail.com,True,2018-09-15,2,67.02
lauren,Lauren,Aguilar,lauren.aguilar@summers.com,False,2018-05-31,4,69.9
daniel,Daniel,Allen,allen@hotmail.com,False,2018-07-01,2,21.21
kallen,Kathy,Allen,kathy@hotmail.com,False,2018-02-20,1,43.72
alvarado,Denise,Alvarado,alvarado@hotmail.com,True,2018-09-07,6,26.72


In [21]:
# Change back to ordering by the index, again in place; the first rows match
# the ones shown right after loading the file
users.sort_index(inplace=True)
users.head()

Unnamed: 0,first_name,last_name,email,email_verified,signup_date,referral_count,balance
aaron,Aaron,Davis,aaron6348@gmail.com,True,2018-08-31,6,18.14
acook,Anthony,Cook,cook@gmail.com,True,2018-05-12,2,55.45
adam.saunders,Adam,Saunders,adam@gmail.com,False,2018-05-29,3,72.12
adrian,Adrian,Fang,adrian.fang@teamtreehouse.com,True,2018-04-28,3,30.01
adrian.blair,Adrian,Blair,adrian9335@gmail.com,True,2018-06-16,7,25.85
