### Chapter 3 - More Data Processing with Pandas

# Table of Contents

3.1 More Data Processing with Pandas

- Mering DataFrames
- Pandas Idioms
- Group by
- Scales
- Pivot Table
- Date/Time Functionality


# 3.1 More Data Processing with Pandas

## 1. Merging DataFrames

In [1]:
import pandas as pd

staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},
                         {'Name': 'Sally', 'Role': 'Course liasion'},
                         {'Name': 'James', 'Role': 'Grader'}])

staff_df = staff_df.set_index('Name')

student_df = pd.DataFrame([{'Name': 'James', 'School': 'Bsuiness'},
                           {'Name': 'Mike', 'School': 'Law'},
                           {'Name': 'Sally', 'School': 'Engineering'}])

student_df = student_df.set_index('Name')

print("[Staff]:")
print(staff_df.head())
print("---------------------")
print("[Student]:")
print(student_df.head())


[Staff]:
                 Role
Name                 
Kelly  Director of HR
Sally  Course liasion
James          Grader
---------------------
[Student]:
            School
Name              
James     Bsuiness
Mike           Law
Sally  Engineering


#### merge( )

In [2]:
# Outer Join on the left and the right indexes (Name)
pd.merge(staff_df, student_df, how = 'outer', left_index = True, right_index = True)


Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Bsuiness
Kelly,Director of HR,
Mike,,Law
Sally,Course liasion,Engineering


In [3]:
# Inner Join on the left and the right indexes (Name)
pd.merge(staff_df, student_df, how = 'inner', left_index = True, right_index = True)


Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Sally,Course liasion,Engineering
James,Grader,Bsuiness


In [4]:
# Left Join on the left and the right indexes (Name)
pd.merge(staff_df, student_df, how = 'left', left_index = True, right_index = True)


Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Kelly,Director of HR,
Sally,Course liasion,Engineering
James,Grader,Bsuiness


In [5]:
# Right Join on the left and the right indexes (Name)
pd.merge(staff_df, student_df, how = 'right', left_index = True, right_index = True)


Unnamed: 0_level_0,Role,School
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
James,Grader,Bsuiness
Mike,,Law
Sally,Course liasion,Engineering


#### Alternative Way:

In [6]:
staff_df = staff_df.reset_index()
student_df = student_df.reset_index()

# Right Join on 'Name' column
pd.merge(staff_df, student_df, how = 'right', on = 'Name')


Unnamed: 0,Name,Role,School
0,Sally,Course liasion,Engineering
1,James,Grader,Bsuiness
2,Mike,,Law


#### Conflicts between the DataFrames?

In [7]:
staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR',
                          'Location': 'State Street'},
                         {'Name': 'Sally', 'Role': 'Course liaison',
                         'Location': 'Washington Avenue'},
                         {'Name': 'James', 'Role': 'Grader',
                         'Location': 'Washington Avenue'}])

student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business',
                         'Location': '1024 Billiard Avenue'},
                            {'Name': 'Mike', 'School': 'Law',
                         'Location': 'Fraternity House #22'},
                            {'Name': 'Sally', 'School': 'Engineering',
                         'Location': '512 Wilson Crescent'}])

print("[Staff]:")
print(staff_df)
print("---------------------------------------------------------")
print("[Student]:")
print(student_df)


[Staff]:
    Name            Role           Location
0  Kelly  Director of HR       State Street
1  Sally  Course liaison  Washington Avenue
2  James          Grader  Washington Avenue
---------------------------------------------------------
[Student]:
    Name       School              Location
0  James     Business  1024 Billiard Avenue
1   Mike          Law  Fraternity House #22
2  Sally  Engineering   512 Wilson Crescent


- _x is always the left DataFrame information, and _y is always the right DdataFrame information

In [8]:
# Left Join on the 'Name' column
pd.merge(staff_df, student_df, how = 'left', on = 'Name')


Unnamed: 0,Name,Role,Location_x,School,Location_y
0,Kelly,Director of HR,State Street,,
1,Sally,Course liaison,Washington Avenue,Engineering,512 Wilson Crescent
2,James,Grader,Washington Avenue,Business,1024 Billiard Avenue


#### Multi-indexing and multiple columns

In [9]:
staff_df = pd.DataFrame([{'First Name': 'Kelly', 'Last Name': 'Desjardins',
                         'Role': 'Director of HR'}, 
                        {'First Name': 'Sally', 'Last Name': 'Brooks',
                         'Role': 'Course liaison'},
                        {'First Name': 'James', 'Last Name': 'Wilde',
                         'Role': 'Grader'}])

student_df = pd.DataFrame([{'First Name': 'James', 'Last Name': 'Hammond',
                            'School': 'Business'},
                          {'First Name': 'Mike', 'Last Name': 'Smith',
                           'School': 'Law'},
                          {'First Name': 'Sally', 'Last Name': 'Brooks',
                           'School': 'Engineering'}])

print("[Staff]:")
print(staff_df)
print("----------------------------------------")
print("[Student]:")
print(student_df)


[Staff]:
  First Name   Last Name            Role
0      Kelly  Desjardins  Director of HR
1      Sally      Brooks  Course liaison
2      James       Wilde          Grader
----------------------------------------
[Student]:
  First Name Last Name       School
0      James   Hammond     Business
1       Mike     Smith          Law
2      Sally    Brooks  Engineering


In [10]:
# Inner Join on First Name an Last Name
pd.merge(staff_df, student_df, how = 'inner', on = ['First Name', 'Last Name'])


Unnamed: 0,First Name,Last Name,Role,School
0,Sally,Brooks,Course liaison,Engineering


#### Concatenating Multiple DataFrames

In [11]:
%%capture 
# To suppress some of the Jupyter warning messages and just tell read_csv to ignore bad lines

df_2011 = pd.read_csv("MERGED2011_12_PP.csv", error_bad_lines = False)  # Save as 
df_2012 = pd.read_csv("MERGED2012_13_PP.csv", error_bad_lines = False)  # CSV UTF-8 (Comma delimited) (.csv)
df_2013 = pd.read_csv("MERGED2013_14_PP.csv", error_bad_lines = False)


In [12]:
df_2011.iloc[:5, :10]


Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL
0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,
1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,
2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,
3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,
4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,


In [13]:
print(len(df_2011))
print(len(df_2012))
print(len(df_2013))


7675
7793
7804


Concatenating

In [14]:
frames = [df_2011, df_2012, df_2013]
pd.concat(frames).iloc[:5, :10]


Unnamed: 0,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL
0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,
1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,
2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,
3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,
4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,


In [15]:
len(df_2011) + len(df_2012) + len(df_2013)


23272

In [16]:
# To differentiate which data is coming from which year, we use the keys parameter
pd.concat(frames, keys = ['2011', '2012', '2013']).iloc[:5, :10]


Unnamed: 0,Unnamed: 1,UNITID,OPEID,OPEID6,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,INSTURL,NPCURL
2011,0,100654,100200,1002,Alabama A & M University,Normal,AL,35762,,,
2011,1,100663,105200,1052,University of Alabama at Birmingham,Birmingham,AL,35294-0110,,,
2011,2,100690,2503400,25034,Amridge University,Montgomery,AL,36117-3553,,,
2011,3,100706,105500,1055,University of Alabama in Huntsville,Huntsville,AL,35899,,,
2011,4,100724,100500,1005,Alabama State University,Montgomery,AL,36104-0271,,,


## 2. Pandas Idioms

In [17]:
import pandas as pd
import numpy as np
import timeit

df = pd.read_csv('census.txt')
df.iloc[:5, :10]


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870


In [18]:
(df.where(df['SUMLEV'] == 50)            # df.where(a boolean mask of True/Flase)
   .dropna()                             # .where() does not drop NA values as a default
   .set_index(['STNAME', 'CTYNAME'])     # set index as 'STNAME' followed by 'CTYNAME'
   .rename(columns = {'ESTIMATEBASE2010': 'Estimates Base 2010'})).iloc[:5, :10]


Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,Autauga County,50.0,3.0,6.0,1.0,1.0,54571.0,54597.0,54773.0,55227.0,54954.0
Alabama,Baldwin County,50.0,3.0,6.0,1.0,3.0,182265.0,182265.0,183112.0,186558.0,190145.0
Alabama,Barbour County,50.0,3.0,6.0,1.0,5.0,27457.0,27455.0,27327.0,27341.0,27169.0
Alabama,Bibb County,50.0,3.0,6.0,1.0,7.0,22915.0,22915.0,22870.0,22745.0,22667.0
Alabama,Blount County,50.0,3.0,6.0,1.0,9.0,57322.0,57322.0,57376.0,57560.0,57580.0


#### An alternative, non-pandorable way

In [19]:
df = df[df['SUMLEV'] == 50]
df.set_index(['STNAME', 'CTYNAME'], inplace = True)    # inplace = True means modify the DF, not make a copy
df.rename(columns = {'ESTIMATESBASE2010': 'Estimates Base 2010'}).iloc[:5, :10]


Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,Estimates Base 2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,Autauga County,50,3,6,1,1,54571,54597,54773,55227,54954
Alabama,Baldwin County,50,3,6,1,3,182265,182265,183112,186558,190145
Alabama,Barbour County,50,3,6,1,5,27457,27455,27327,27341,27169
Alabama,Bibb County,50,3,6,1,7,22915,22915,22870,22745,22667
Alabama,Blount County,50,3,6,1,9,57322,57322,57376,57560,57580


#### Comparing the two methods above, in terms of time

In [20]:
# The first approach
def first_approach() :
    global df
    return (df.where(df['SUMLEV'] == 50)            
              .dropna()                             
              .set_index(['STNAME', 'CTYNAME'])     
              .rename(columns = {'ESTIMATEBASE2010': 'Estimates Base 2010'}))

# Read in our dataset anew, to be fresh
df = pd.read_csv('census.txt')

timeit.timeit(first_approach, number = 10)


0.7457618880000005

In [21]:
# The second approach
def second_approach() :
    global df
    new_df = df[df['SUMLEV'] == 50]
    new_df.set_index(['STNAME', 'CTYNAME'], inplace = True)    
    return new_df.rename(columns = {'ESTIMATESBASE2010': 'Estimates Base 2010'})

# Read in our dataset anew, to be fresh
df = pd.read_csv('census.txt')

timeit.timeit(second_approach, number = 10)


0.06557216599999904

The second approach is much faster! (and much readable)

#### apply( ) function

In [22]:
def min_max(row) :
    data = row[['POPESTIMATE2010',    # 6 different columns
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]
    return pd.Series({'min': np.min(data), 'max': np.max(data)})


In [23]:
df.apply(min_max, axis = 'columns').iloc[:10, :10]


Unnamed: 0,min,max
0,4785437,4852347
1,54727,55227
2,183112,202939
3,26283,27341
4,22521,22870
5,57376,57619
6,10400,10876
7,20162,20932
8,115469,118408
9,33977,34139


#### To add min, max columns to the original DataFrame:

In [24]:
def min_max(row) :
    data = row[['POPESTIMATE2010',    # 6 different columns
                'POPESTIMATE2011',
                'POPESTIMATE2012',
                'POPESTIMATE2013',
                'POPESTIMATE2014',
                'POPESTIMATE2015']]
    row['max'] = np.max(data)
    row['min'] = np.min(data)
    return row

df.apply(min_max, axis = 'columns').iloc[:5, :10]


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870


#### apply( ) function is typically used with lambdas.

In [25]:
# Calculating the max of the columns, using the apply functions
rows = ['POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014', 
        'POPESTIMATE2015']

df.apply(lambda x: np.max(x[rows]), axis = 1).head(10)    # lambda x: np.max(x[rows]) -> returns a single value



0    4852347
1      55227
2     202939
3      27341
4      22870
5      57619
6      10876
7      20932
8     118408
9      34139
dtype: int64

- Dividing the states into four categories: North East, Mid West, South, West

In [26]:
def get_state_region(x) :
    northeast = ['Connecticut', 'Maine', 'Massachusetts', 'New Hamsphire',
                 'Rhode Istland', 'Vermont', 'New York', 'New Jersey', 'Pennsylvania']
    midwest = ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa',
              'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota',
              'South Dakota']
    south = ['Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina',
            'South Carolina', 'Virginia', 'District of Columnbia', 'West Virginia',
            'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas',
            'Louisiana', 'Oklahoma', 'Texas']
    west = ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah',
           'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']
    
    if x in northeast :
        return "Northeast"
    elif x in midwest :
        return "Midwest"
    elif x in south :
        return "South"
    else :
        return "West"
    

In [27]:
df['state_region'] = df['STNAME'].apply(lambda x: get_state_region(x))
df[['STNAME', 'state_region']].head(10)


Unnamed: 0,STNAME,state_region
0,Alabama,South
1,Alabama,South
2,Alabama,South
3,Alabama,South
4,Alabama,South
5,Alabama,South
6,Alabama,South
7,Alabama,South
8,Alabama,South
9,Alabama,South


## 3. Group By

### Splitting
df.groupby(level = , by = )

In [28]:
import pandas as pd
import numpy as np


In [29]:
df = pd.read_csv('census.txt')
df = df[df['SUMLEV'] == 50]
df.iloc[:5, :10]


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57376


In [30]:
%%timeit -n 3

for state in df['STNAME'].unique() :
    avg = np.average(df.where(df['STNAME'] == state).dropna()['CENSUS2010POP'])
    # print('Counties in state ' + state + ' have an average population of ' + str(avg) + ' in 2010')
                     

3.04 s ± 70.1 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


In [31]:
%%timeit -n 3

for group, frame in df.groupby('STNAME') :      
    avg = np.average(frame['CENSUS2010POP'])
    # print('Counties in state ' + group + ' have an average population of ' + str(avg))
    
    
# .groupby() returns a tuple, 
# where the first value is the value of the key we were trying to group by (state)
# and the second value is projected dataframe that was found for that group


15.1 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 3 loops each)


#### Another example:

In [32]:
# You need to set the index of the dataframe to be the column that you want to group by
df = df.set_index('STNAME')

def set_batch_number(item) :
    if item[0] < 'M' :
        return 0              # group 0
    if item[0] < 'Q' :
        return 1              # group 1
    return 2                  # group 2


for group, frame in df.groupby(by = set_batch_number) :
    print('There are ' + str(len(frame)) + ' records in group ' + str(group) + ' for processing')
    # print(frame)

    
# Here, we didn't pass in a column name to groupby(axis = ?). 
# Instead, we set the index of the dataframe to be STNAME and, if no column identifier is passed groupby(), 
# the index(0) will be atuomatically used as its parameter. groupby(level = 0)
# Only works for single indexing. Need to specify levels for multiple indexing.
    

There are 1177 records in group 0 for processing
There are 1134 records in group 1 for processing
There are 831 records in group 2 for processing


#### One more example:

In [67]:
df = pd.read_csv('airbnb_listings.csv')
df.iloc[:5, 1:10]


Unnamed: 0,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview
0,https://www.airbnb.com/rooms/10080,20200000000000.0,2/6/19,D1 - Million Dollar View 2 BR,"Stunning two bedroom, two bathroom apartment. ...","Bed setup: 2 x queen, I can add up to 2 twin s...","Stunning two bedroom, two bathroom apartment. ...",none,
1,https://www.airbnb.com/rooms/11400,20200000000000.0,2/6/19,Central Lovely Rm in Victorian Home,Well-appointed room with a view of the garden ...,"Centrally-located lovely, quiet home on tree-l...",Well-appointed room with a view of the garden ...,none,"Very quiet residential area, yet only 1-1/2 bl..."
2,https://www.airbnb.com/rooms/13188,20200000000000.0,2/6/19,Garden level studio in ideal loc.,Garden level studio suite with garden patio - ...,"Explore Vancouver in a highly sought after, tr...",Garden level studio suite with garden patio - ...,none,
3,https://www.airbnb.com/rooms/13357,20200000000000.0,2/6/19,! Wow! 2bed 2bath 1bed den Harbour View Apartm...,Very spacious and comfortable with very well k...,"Mountains and harbour view 2 bedroom,2 bath,1 ...",Very spacious and comfortable with very well k...,none,Amanzing bibrant professional neighbourhood. C...
4,https://www.airbnb.com/rooms/13358,20200000000000.0,2/6/19,Urban Boutique Suite heart of Downtown Vancouver!,,Welcome to the Electra Building on Nelson stre...,Welcome to the Electra Building on Nelson stre...,none,


In [68]:
# print(df['cancellation_policy'])
# print(df['review_scores_value'])

df = df.set_index(['cancellation_policy', 'review_scores_value'])
df.iloc[1:10, 1:5]
    

Unnamed: 0_level_0,Unnamed: 1_level_0,listing_url,scrape_id,last_scraped,name
cancellation_policy,review_scores_value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
strict_14_with_grace_period,9.0,https://www.airbnb.com/rooms/11400,20200000000000.0,2/6/19,Central Lovely Rm in Victorian Home
moderate,10.0,https://www.airbnb.com/rooms/13188,20200000000000.0,2/6/19,Garden level studio in ideal loc.
strict_14_with_grace_period,8.0,https://www.airbnb.com/rooms/13357,20200000000000.0,2/6/19,! Wow! 2bed 2bath 1bed den Harbour View Apartm...
strict_14_with_grace_period,9.0,https://www.airbnb.com/rooms/13358,20200000000000.0,2/6/19,Urban Boutique Suite heart of Downtown Vancouver!
strict_14_with_grace_period,10.0,https://www.airbnb.com/rooms/13490,20200000000000.0,2/6/19,Vancouver's best kept secret
strict_14_with_grace_period,9.0,https://www.airbnb.com/rooms/14267,20200000000000.0,2/6/19,EcoLoft Vancouver
moderate,10.0,https://www.airbnb.com/rooms/14508,20200000000000.0,2/6/19,Yaletown - Sea Wall
strict_14_with_grace_period,9.0,https://www.airbnb.com/rooms/16254,20200000000000.0,2/6/19,Close to PNE/Hastings Park and East Village
strict_14_with_grace_period,7.0,https://www.airbnb.com/rooms/16611,20200000000000.0,2/6/19,Broadway skytrain station


In [69]:
for group, frame in df.groupby(level = (0, 1)) :        # level 1 = canc. policy, level 2 = review.score
    print(group, str(len(frame)))
    

('flexible', 2.0) 1
('flexible', 4.0) 3
('flexible', 5.0) 1
('flexible', 6.0) 4
('flexible', 7.0) 5
('flexible', 8.0) 48
('flexible', 9.0) 200
('flexible', 10.0) 332
('moderate', 2.0) 1
('moderate', 4.0) 2
('moderate', 5.0) 1
('moderate', 6.0) 9
('moderate', 7.0) 6
('moderate', 8.0) 40
('moderate', 9.0) 422
('moderate', 10.0) 761
('strict_14_with_grace_period', 2.0) 2
('strict_14_with_grace_period', 4.0) 3
('strict_14_with_grace_period', 5.0) 2
('strict_14_with_grace_period', 6.0) 8
('strict_14_with_grace_period', 7.0) 28
('strict_14_with_grace_period', 8.0) 160
('strict_14_with_grace_period', 9.0) 1000
('strict_14_with_grace_period', 10.0) 1089
('super_strict_30', 9.0) 1


In [70]:
def grouping_fun(item) :
    
    if item[1] == 10.0 :
        return (item[0], "10.0")                    # this becomes a group
    else :
        return (item[0], "not 10.0")                # this becomes another group
    
for group, frame in df.groupby(grouping_fun) :      # must specify either by = OR level =
    print(group, str(len(frame)))                                    # level = 'cancellation_policy' once by = is specified


('flexible', '10.0') 332
('flexible', 'not 10.0') 558
('moderate', '10.0') 761
('moderate', 'not 10.0') 595
('strict_14_with_grace_period', '10.0') 1089
('strict_14_with_grace_period', 'not 10.0') 1501
('super_strict_30', 'not 10.0') 1


### Aggregation
.agg({'column1': (np.function1, np.function2), 'column2': np.function3})

In [71]:
df = df.reset_index()

df.groupby('cancellation_policy').agg({'review_scores_value': np.average})


Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,
moderate,
strict_14_with_grace_period,
super_strict_30,9.0


In [72]:
df.groupby('cancellation_policy').agg({'review_scores_value': np.nanmean})


Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,9.397306
moderate,9.532206
strict_14_with_grace_period,9.354276
super_strict_30,9.0


In [73]:
# calling multiple functions as a tuple() or multiple columns in the dictionary
df.groupby('cancellation_policy').agg({'review_scores_value': (np.nanmean, np.nanstd),
                                       'reviews_per_month': np.nanmean})


Unnamed: 0_level_0,review_scores_value,review_scores_value,reviews_per_month
Unnamed: 0_level_1,nanmean,nanstd,nanmean
cancellation_policy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
flexible,9.397306,0.901784,1.949142
moderate,9.532206,0.734338,2.442953
strict_14_with_grace_period,9.354276,0.767728,2.057909
super_strict_30,9.0,,0.47


### Transformation
.transform(np.function1)

- returns an object that is the same size as the group

In [85]:
cols = ['cancellation_policy', 'review_scores_value']

transform_df = df[cols].groupby('cancellation_policy').transform(np.nanmean)
transform_df.head(10)    # preserves the original index order from the pre-grouping state


Unnamed: 0,review_scores_value
0,9.354276
1,9.354276
2,9.532206
3,9.354276
4,9.354276
5,9.354276
6,9.354276
7,9.532206
8,9.354276
9,9.354276


.rename({'original_column': 'new_name'})

In [86]:
# renaming the column in the transformed version
transform_df.rename({'review_scores_value': 'mean_review_socres'}, axis = 'columns', inplace = True)
transform_df.head(10)


Unnamed: 0,mean_review_socres
0,9.354276
1,9.354276
2,9.532206
3,9.354276
4,9.354276
5,9.354276
6,9.354276
7,9.532206
8,9.354276
9,9.354276


.merge( )

In [87]:
# df's index and transform_df's index are the same. So we merge on them
df = df.merge(transform_df, left_index = True, right_index = True)
df.iloc[:10]


Unnamed: 0,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,description,...,instant_bookable,is_business_travel_ready,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,mean_review_socres
0,strict_14_with_grace_period,9.0,10080,https://www.airbnb.com/rooms/10080,20200000000000.0,2/6/19,D1 - Million Dollar View 2 BR,"Stunning two bedroom, two bathroom apartment. ...","Bed setup: 2 x queen, I can add up to 2 twin s...","Stunning two bedroom, two bathroom apartment. ...",...,f,f,f,f,31,31,0,0,0.18,9.354276
1,strict_14_with_grace_period,9.0,11400,https://www.airbnb.com/rooms/11400,20200000000000.0,2/6/19,Central Lovely Rm in Victorian Home,Well-appointed room with a view of the garden ...,"Centrally-located lovely, quiet home on tree-l...",Well-appointed room with a view of the garden ...,...,f,f,t,t,1,0,1,0,0.64,9.354276
2,moderate,10.0,13188,https://www.airbnb.com/rooms/13188,20200000000000.0,2/6/19,Garden level studio in ideal loc.,Garden level studio suite with garden patio - ...,"Explore Vancouver in a highly sought after, tr...",Garden level studio suite with garden patio - ...,...,t,f,f,f,1,1,0,0,1.51,9.532206
3,strict_14_with_grace_period,8.0,13357,https://www.airbnb.com/rooms/13357,20200000000000.0,2/6/19,! Wow! 2bed 2bath 1bed den Harbour View Apartm...,Very spacious and comfortable with very well k...,"Mountains and harbour view 2 bedroom,2 bath,1 ...",Very spacious and comfortable with very well k...,...,f,f,t,t,2,2,0,0,0.51,9.354276
4,strict_14_with_grace_period,9.0,13358,https://www.airbnb.com/rooms/13358,20200000000000.0,2/6/19,Urban Boutique Suite heart of Downtown Vancouver!,,Welcome to the Electra Building on Nelson stre...,Welcome to the Electra Building on Nelson stre...,...,f,f,f,t,1,1,0,0,3.65,9.354276
5,strict_14_with_grace_period,10.0,13490,https://www.airbnb.com/rooms/13490,20200000000000.0,2/6/19,Vancouver's best kept secret,This apartment rents for one month blocks of t...,"Vancouver city central, 700 sq.ft., main floor...",This apartment rents for one month blocks of t...,...,f,f,f,f,1,1,0,0,0.88,9.354276
6,strict_14_with_grace_period,9.0,14267,https://www.airbnb.com/rooms/14267,20200000000000.0,2/6/19,EcoLoft Vancouver,"The Ecoloft is located in the lovely, family r...",West Coast Modern Laneway House Loft: We call ...,"The Ecoloft is located in the lovely, family r...",...,t,f,f,f,1,1,0,0,0.31,9.354276
7,moderate,10.0,14508,https://www.airbnb.com/rooms/14508,20200000000000.0,2/6/19,Yaletown - Sea Wall,Long term bookings available for 6 months from...,"Long term, furnished rental from May 1st Spaci...",Long term bookings available for 6 months from...,...,f,f,f,f,1,1,0,0,0.28,9.532206
8,strict_14_with_grace_period,9.0,16254,https://www.airbnb.com/rooms/16254,20200000000000.0,2/6/19,Close to PNE/Hastings Park and East Village,,"Location, Quality, Cleanliness, and Amenities....","Location, Quality, Cleanliness, and Amenities....",...,t,f,f,f,1,0,1,0,0.48,9.354276
9,strict_14_with_grace_period,7.0,16611,https://www.airbnb.com/rooms/16611,20200000000000.0,2/6/19,Broadway skytrain station,"My place is close to downtown, Donald's Market...",1 to 3 bedrooms Balcony 5-minute ride to downt...,"My place is close to downtown, Donald's Market...",...,f,f,f,f,9,4,4,1,0.22,9.354276


np.absolute( )

In [90]:
# So now we could create, for instance, the difference between a given row and
# its group (cancellation policy) means.
df['mean_diff'] = np.absolute(df['review_scores_value'] - df['mean_review_socres'])
df['mean_diff'].head()


0    0.354276
1    0.354276
2    0.467794
3    1.354276
4    0.354276
Name: mean_diff, dtype: float64

### Filtering
.filter(function1)

- applies function1 to each group dataframe and returns True/False

In [107]:
# applies       np.nanmean( cancl_polcy_group1['review_scores_value'] )      >      9.2        -> True/False
df.groupby('cancellation_policy').filter(lambda x: np.nanmean(x['review_scores_value']) > 9.2).iloc[:5, :10]

# Notice the results are still indexed, but that any of the results which were in a group with a mean
# review score of less than or equal to 9.2 were not copied over.

# Note that this routine does not filter a dataframe on its contents. 
# The filter is applied to the labels of the index.


Unnamed: 0,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,description
0,strict_14_with_grace_period,9.0,10080,https://www.airbnb.com/rooms/10080,20200000000000.0,2/6/19,D1 - Million Dollar View 2 BR,"Stunning two bedroom, two bathroom apartment. ...","Bed setup: 2 x queen, I can add up to 2 twin s...","Stunning two bedroom, two bathroom apartment. ..."
1,strict_14_with_grace_period,9.0,11400,https://www.airbnb.com/rooms/11400,20200000000000.0,2/6/19,Central Lovely Rm in Victorian Home,Well-appointed room with a view of the garden ...,"Centrally-located lovely, quiet home on tree-l...",Well-appointed room with a view of the garden ...
2,moderate,10.0,13188,https://www.airbnb.com/rooms/13188,20200000000000.0,2/6/19,Garden level studio in ideal loc.,Garden level studio suite with garden patio - ...,"Explore Vancouver in a highly sought after, tr...",Garden level studio suite with garden patio - ...
3,strict_14_with_grace_period,8.0,13357,https://www.airbnb.com/rooms/13357,20200000000000.0,2/6/19,! Wow! 2bed 2bath 1bed den Harbour View Apartm...,Very spacious and comfortable with very well k...,"Mountains and harbour view 2 bedroom,2 bath,1 ...",Very spacious and comfortable with very well k...
4,strict_14_with_grace_period,9.0,13358,https://www.airbnb.com/rooms/13358,20200000000000.0,2/6/19,Urban Boutique Suite heart of Downtown Vancouver!,,Welcome to the Electra Building on Nelson stre...,Welcome to the Electra Building on Nelson stre...


### Applying
apply()

- apply an arbitrary function to each group, and stitch the results back for each apply() into a single dataframe where the index is preserved

In [114]:
df = pd.read_csv('airbnb_listings.csv')
df = df[['cancellation_policy', 'review_scores_value']]

df.head()


Unnamed: 0,cancellation_policy,review_scores_value
0,strict_14_with_grace_period,9.0
1,strict_14_with_grace_period,9.0
2,moderate,10.0
3,strict_14_with_grace_period,8.0
4,strict_14_with_grace_period,9.0


- Wrapping the transform() and merge() processes in one step, with apply()

In [116]:
def calc_mean_review_scores(group) :   # group is a dataframe, grouped by 'cancellation policy' in this case
    avg = np.nanmean(group["review_scores_value"])
    group["review_scores_mean"] = np.abs(avg - group["review_scores_value"])
    
    return group

df.groupby('cancellation_policy').apply(calc_mean_review_scores).head()
    

Unnamed: 0,cancellation_policy,review_scores_value,review_scores_mean
0,strict_14_with_grace_period,9.0,0.354276
1,strict_14_with_grace_period,9.0,0.354276
2,moderate,10.0,0.467794
3,strict_14_with_grace_period,8.0,1.354276
4,strict_14_with_grace_period,9.0,0.354276


- Using apply can be slower than using some of the specialized functions, especially agg().
- But, if your dataframes are not huge, it's a soild general purpose approach 

## 4. Scales

#### Ratio Scale

- units are equally spaced
- mathematical operations of +-/* are valid
- e.g. height and weight

#### Interval Scale

- units are equally spaced
- no clear absence of value (i.e. there is no true zero, zero means something, not empty)
- operations such as multiplication and division are not valid
- e.g. temperature measured in Celsius or Fahrenheit (as there is never an absence of temperature. zero degrees is actually an meaningful value itself)
- e.g. direction on a compass (zero degrees on the compass doesn't indicate a lack of direction, but instead describes a direction itself)

#### Ordinal Scale

- the order  of the units is important, but not evenly spaced
- e.g. letter grades such as A+(3%), A(4%), A-(3%)

#### Nominal Scale

- categories of data, but the categories have no order with respect to one another
- e.g. teams of a sport

In [123]:
import pandas as pd

df = pd.DataFrame( ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],
                   index = ['excellent', 'excellent', 'excellent', 'good', 'good', 'good',
                           'ok', 'ok', 'ok', 'poor', 'poor'],
                   columns = ['Grades'])

df


Unnamed: 0,Grades
excellent,A+
excellent,A
excellent,A-
good,B+
good,B
good,B-
ok,C+
ok,C
ok,C-
poor,D+


In [124]:
# Now, if we check the datatype of this column, we see that it's just an object, since we set string values
df.dtypes


Grades    object
dtype: object

#### .astype(' ')

In [126]:
# tell pandas to change the type to category 
df['Grades'].astype('category')


excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): [A, A+, A-, B, ..., C+, C-, D, D+]

#### .CategoricalDtype(categories = ['z', 'y', ... ], ordered = True/False )

In [128]:
# We can tell pandas that the data is ordered by first creating a new categorical data type with the list of
# the categories (in order) and the ordered = True flag
my_categories = pd.CategoricalDtype(categories = ['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
                                    ordered = True)

grades = df['Grades'].astype(my_categories)
grades


excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
ok            C
ok           C-
poor         D+
poor          D
Name: Grades, dtype: category
Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]

In [133]:
# A category with orders allows comparisons (boolean masking) or mathematical opeartions (min, max)
# Here, the df's 'Grades' column is not ordered, so
df[df['Grades'] > 'C'] 


Unnamed: 0,Grades
ok,C+
ok,C-
poor,D+
poor,D


In [131]:
# However, the grades dataframe is categorically ordered, so
grades[grades > "C"]


excellent    A+
excellent     A
excellent    A-
good         B+
good          B
good         B-
ok           C+
Name: Grades, dtype: category
Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]

#### Converting interval or ratio scale (numeric) to

In [159]:
# Let's go back to our census data. We saw that we could group by state, then aggregate to get a list of the
# average county size by state. If we further apply cut to this with, say, ten bins, we can see that states
# listed as categoricals using the average county size.

import numpy as np

df = pd.read_csv('census.txt')
df = df[df['SUMLEV'] == 50]

df = df.set_index('STNAME').groupby(level = 0)['CENSUS2010POP'].agg(np.average)

df.head()


STNAME
Alabama        71339.343284
Alaska         24490.724138
Arizona       426134.466667
Arkansas       38878.906667
California    642309.586207
Name: CENSUS2010POP, dtype: float64

#### .cut( )

- takes an argument as some array-like structure, such as a column of a dataframe or a series
- also takes a number of bits to be used, and all bins arekept at equal spacing

In [162]:
# Now, if we just want to make 'bins' of each of these, we can use cut()
pd.cut(df, 10).head()         # Now, all states are categorized as 10 bins. 
                              # Alabama and Alaska belong to the same bin.


STNAME
Alabama         (11706.087, 75333.413]
Alaska          (11706.087, 75333.413]
Arizona       (390320.176, 453317.529]
Arkansas        (11706.087, 75333.413]
California    (579312.234, 642309.586]
Name: CENSUS2010POP, dtype: category
Categories (10, interval[float64]): [(11706.087, 75333.413] < (75333.413, 138330.766] < (138330.766, 201328.118] < (201328.118, 264325.471] ... (390320.176, 453317.529] < (453317.529, 516314.881] < (516314.881, 579312.234] < (579312.234, 642309.586]]

## 5. Pivot Table

- a way of summarizing data in a DataFrame for a particular purpose
- makes heavy use of the aggregation function
- a pivot table is itself a DataFrame
  - where the rows represent one variable that you're interested in, the columns another, and the cell's some aggregate value
- tends to include marginal values, which are the sums for each column and row
  - this allows you to be able to see the relationship between two variables at just a glance
 

In [163]:
import pandas as pd
import numpy as np


In [254]:
df = pd.read_csv('cwurData.txt')
df.head()


Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [255]:
def create_category(ranking) :
    if (ranking >= 1) & (ranking <= 100) :
        return 'First Tier Top University'
    elif (ranking >= 101) & (ranking <= 200) :
        return 'Second Tier Top University'
    elif (ranking >= 201) & (ranking <= 300) :
        return 'Third Tier Top University'
    return 'Other Top University'

# df.drop(['Rank_Level'], axis = 'columns', inplace = True)
df['rank_level'] = df['world_rank'].apply(lambda x: create_category(x))

df.head()


Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year,rank_level
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012,First Tier Top University
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012,First Tier Top University
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012,First Tier Top University
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012,First Tier Top University
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012,First Tier Top University


#### .pivot_table(values = , index = , columns = , aggfunc = [np.function1, np.function2, ... ], margins = True/False)

In [256]:
df.pivot_table(values = 'score', index = 'country', columns = 'rank_level', aggfunc = [np.mean]).head()


Unnamed: 0_level_0,mean,mean,mean,mean
rank_level,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Argentina,,44.672857,,
Australia,47.9425,44.64575,49.2425,47.285
Austria,,44.864286,,47.066667
Belgium,51.875,45.081,49.084,46.746667
Brazil,,44.499706,49.565,


In [257]:
df.pivot_table(values = 'score', index = 'country', columns = 'rank_level', aggfunc = [np.mean, np.max]).head()


Unnamed: 0_level_0,mean,mean,mean,mean,amax,amax,amax,amax
rank_level,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Argentina,,44.672857,,,,45.66,,
Australia,47.9425,44.64575,49.2425,47.285,51.61,45.97,50.4,47.47
Austria,,44.864286,,47.066667,,46.29,,47.78
Belgium,51.875,45.081,49.084,46.746667,52.03,46.21,49.73,47.14
Brazil,,44.499706,49.565,,,46.08,49.82,


In [258]:
df.pivot_table(values = 'score', index = 'country', columns = 'rank_level', aggfunc = [np.mean, np.max],
               margins = True).head()


Unnamed: 0_level_0,mean,mean,mean,mean,mean,amax,amax,amax,amax,amax
rank_level,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University,All,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University,All
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Argentina,,44.672857,,,44.672857,,45.66,,,45.66
Australia,47.9425,44.64575,49.2425,47.285,45.825517,51.61,45.97,50.4,47.47,51.61
Austria,,44.864286,,47.066667,45.139583,,46.29,,47.78,47.78
Belgium,51.875,45.081,49.084,46.746667,47.011,52.03,46.21,49.73,47.14,52.03
Brazil,,44.499706,49.565,,44.781111,,46.08,49.82,,49.82


- Accessing columns and indexes in pivot tables

In [259]:
new_df = df.pivot_table(values = 'score', index = 'country', columns = 'rank_level', aggfunc = [np.mean, np.max],
               margins = True)

print(new_df.index)


Index(['Argentina', 'Australia', 'Austria', 'Belgium', 'Brazil', 'Bulgaria',
       'Canada', 'Chile', 'China', 'Colombia', 'Croatia', 'Cyprus',
       'Czech Republic', 'Denmark', 'Egypt', 'Estonia', 'Finland', 'France',
       'Germany', 'Greece', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Iran',
       'Ireland', 'Israel', 'Italy', 'Japan', 'Lebanon', 'Lithuania',
       'Malaysia', 'Mexico', 'Netherlands', 'New Zealand', 'Norway', 'Poland',
       'Portugal', 'Puerto Rico', 'Romania', 'Russia', 'Saudi Arabia',
       'Serbia', 'Singapore', 'Slovak Republic', 'Slovenia', 'South Africa',
       'South Korea', 'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand',
       'Turkey', 'USA', 'Uganda', 'United Arab Emirates', 'United Kingdom',
       'Uruguay', 'All'],
      dtype='object', name='country')


In [260]:
# columns are hierarchical
print(new_df.columns)

# Top level columns = 'mean' 'amax'
# Lower Level columns = 'rank_level'


MultiIndex([('mean',  'First Tier Top University'),
            ('mean',       'Other Top University'),
            ('mean', 'Second Tier Top University'),
            ('mean',  'Third Tier Top University'),
            ('mean',                        'All'),
            ('amax',  'First Tier Top University'),
            ('amax',       'Other Top University'),
            ('amax', 'Second Tier Top University'),
            ('amax',  'Third Tier Top University'),
            ('amax',                        'All')],
           names=[None, 'rank_level'])


In [261]:
new_df2 = new_df['mean']['First Tier Top University'].head()

print(type(new_df2)) # When you project a single column of values out of a DataFrame, you get a series.

new_df2


<class 'pandas.core.series.Series'>


country
Argentina        NaN
Australia    47.9425
Austria          NaN
Belgium      51.8750
Brazil           NaN
Name: First Tier Top University, dtype: float64

#### .idxmax()

- built in function to the Series object, not pivot tables' special function

In [262]:
new_df['mean']['First Tier Top University'].idxmax()


'United Kingdom'

### Stacking and Unstacking

In [263]:
new_df.head()
# (inner-most) -> country -> min, amax -> rank -> (lower-most)


Unnamed: 0_level_0,mean,mean,mean,mean,mean,amax,amax,amax,amax,amax
rank_level,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University,All,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University,All
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Argentina,,44.672857,,,44.672857,,45.66,,,45.66
Australia,47.9425,44.64575,49.2425,47.285,45.825517,51.61,45.97,50.4,47.47,51.61
Austria,,44.864286,,47.066667,45.139583,,46.29,,47.78,47.78
Belgium,51.875,45.081,49.084,46.746667,47.011,52.03,46.21,49.73,47.14,52.03
Brazil,,44.499706,49.565,,44.781111,,46.08,49.82,,49.82


#### .stack( )

- pivoting the lower-most column index to become the inner-most row index

In [264]:
# (inner-most) -> country -> min, amax -> rank -> (lower-most)

new_df = new_df.stack()    # lower-most column index = rank_level
new_df.head()              # 'Third Tier Top University' 'All'

# (inner-most) -> rank -> country -> min, amax -> (lower-most)


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,amax
country,rank_level,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,Other Top University,44.672857,45.66
Argentina,All,44.672857,45.66
Australia,First Tier Top University,47.9425,51.61
Australia,Other Top University,44.64575,45.97
Australia,Second Tier Top University,49.2425,50.4


#### .unstack( )

- pivoting the inner-most column index to become the lower-most column index

In [265]:
# (inner-most) -> rank -> country -> min, amax -> (lower-most)

new_df = new_df.unstack().head()
# (inner-most) -> country -> min, amax -> rank -> (lower-most)

new_df


Unnamed: 0_level_0,mean,mean,mean,mean,mean,amax,amax,amax,amax,amax
rank_level,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University,All,First Tier Top University,Other Top University,Second Tier Top University,Third Tier Top University,All
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Argentina,,44.672857,,,44.672857,,45.66,,,45.66
Australia,47.9425,44.64575,49.2425,47.285,45.825517,51.61,45.97,50.4,47.47,51.61
Austria,,44.864286,,47.066667,45.139583,,46.29,,47.78,47.78
Belgium,51.875,45.081,49.084,46.746667,47.011,52.03,46.21,49.73,47.14,52.03
Brazil,,44.499706,49.565,,44.781111,,46.08,49.82,,49.82


#### What happens if we unstack twice in a row?

In [270]:
# (inner-most) -> country -> min, amax -> rank -> (lower-most)

new_df.unstack().head(10)
# (inner-most) -> min, amax -> rank -> country -> (lower-most)

# We ended up unstacking all the way to just a single column, so a series object is returned.
# This column is just a "value", the meaning of which is denoted by the heirarchical index of 
# opeartion(mean, amax), rank, and country.


      rank_level                 country  
mean  First Tier Top University  Argentina          NaN
                                 Australia    47.942500
                                 Austria            NaN
                                 Belgium      51.875000
                                 Brazil             NaN
      Other Top University       Argentina    44.672857
                                 Australia    44.645750
                                 Austria      44.864286
                                 Belgium      45.081000
                                 Brazil       44.499706
dtype: float64

## 6. Date/Time Functionality

#### 4 time-related classes in Pandas

- Timestamp
- DatetimeIndex
- Period
- PeriodIndex


In [271]:
import pandas as pd
import numpy as np


### Timestamp

- represents a single timestamp and associates values with points in time

In [272]:
pd.Timestamp('9/1/2019 10:05 AM')


Timestamp('2019-09-01 10:05:00')

In [273]:
pd.Timestamp(2019, 12, 20, 1, 19)


Timestamp('2019-12-20 01:19:00')

#### isoweekday( )

- 0: Monday
- ...
- 6: Sunday

In [274]:
pd.Timestamp(2019, 12, 20, 0, 0).isoweekday()


5

#### .year .month .day .hour .minute .second

In [277]:
pd.Timestamp(2019, 12, 20, 5, 2, 23).second


23

### Period

- represents a single time span, such as a specific day or month

In [278]:
pd.Period('1/2016')

# M is the first finest grained piece we provided (Y -> M)


Period('2016-01', 'M')

In [279]:
pd.Period('3/5/2016')

# D is the first finest grained piece we provided (Y -> M -> D)


Period('2016-03-05', 'D')

- arithmetic on Period is very easy

In [280]:
pd.Period('1/2016') + 5


Period('2016-06', 'M')

In [281]:
pd.Period('3/5/2016') - 2


Period('2016-03-03', 'D')

### DatetimeIndex and PeriodIndex

- DatetimeIndex

In [282]:
t1 = pd.Series(list('abc'), 
               [pd.Timestamp('2016-09-01'), pd.Timestamp('2016-09-02'), pd.Timestamp('2016-09-03')]
              )
t1


2016-09-01    a
2016-09-02    b
2016-09-03    c
dtype: object

In [283]:
type(t1.index)     # DatetimeIndex


pandas.core.indexes.datetimes.DatetimeIndex

- PeriodIndex

In [285]:
t2 = pd.Series(list('def'),
               [pd.Period('2016-09'), pd.Period('2016-10'), pd.Period('2016-11')])

t2


2016-09    d
2016-10    e
2016-11    f
Freq: M, dtype: object

In [286]:
type(t2.index)


pandas.core.indexes.period.PeriodIndex

### Converting to Datetime

In [287]:
d1 = ['2 June 2013', 'Aug 29, 2014', '2015-06-26', '7/12/16']

# some random data
ts3 = pd.DataFrame(np.random.randint(10, 100, (4, 2)), index = d1, columns = list('ab'))

ts3


Unnamed: 0,a,b
2 June 2013,86,42
"Aug 29, 2014",37,80
2015-06-26,24,70
7/12/16,54,97


#### to_datetime( )

In [288]:
ts3.index = pd.to_datetime(ts3.index)
ts3


Unnamed: 0,a,b
2013-06-02,86,42
2014-08-29,37,80
2015-06-26,24,70
2016-07-12,54,97


to_datetime( 'date', dayfirst = True/False)

In [289]:
pd.to_datetime('4.7.12', dayfirst = True)


Timestamp('2012-07-04 00:00:00')

### Timedelta

- represents differences in times
- not the same as a period, but conceptually similar

In [290]:
pd.Timestamp('9/3/2016') - pd.Timestamp('9/1/2016')


Timedelta('2 days 00:00:00')

In [291]:
pd.Timestamp('9/2/2016 8:10 AM') + pd.Timedelta('12D 3H')


Timestamp('2016-09-14 11:10:00')

### Offset

- similar to timedata, but it follows specific calendar duration rules
- allows flexibility in terms of types of time intervals
- it has business day, end of month, semi month begin etc., besides hour, day, week, month, etc.

In [292]:
pd.Timestamp('9/4/2016').weekday()    # 6 = saturday


6

#### offsets.Week( )

In [293]:
pd.Timestamp('9/4/2016') + pd.offsets.Week()


Timestamp('2016-09-11 00:00:00')

#### offsets.MonthEnd( )

In [294]:
pd.Timestamp('9/4/2016') + pd.offsets.MonthEnd()


Timestamp('2016-09-30 00:00:00')

### Working with Dates in a DataFrame

#### data_range('date', periods = , freq = '')

- we have to either specify the start or end date
- if not specified, by default, the date is considered the start date
- then we have to specify number of periods, and a frequency

In [306]:
# freq = bi-weekly on Sunday
dates = pd.date_range('10-01-2016', periods = 9, freq = '2W-SUN')   # saturday
dates


6

In [297]:
# freq = business day
pd.date_range('10-01-2016', periods = 9, freq = 'B')


DatetimeIndex(['2016-10-03', '2016-10-04', '2016-10-05', '2016-10-06',
               '2016-10-07', '2016-10-10', '2016-10-11', '2016-10-12',
               '2016-10-13'],
              dtype='datetime64[ns]', freq='B')

In [298]:
# freq = quarter start in June
pd.date_range('04-01-2016', periods = 12, freq = 'QS-JUN')


DatetimeIndex(['2016-06-01', '2016-09-01', '2016-12-01', '2017-03-01',
               '2017-06-01', '2017-09-01', '2017-12-01', '2018-03-01',
               '2018-06-01', '2018-09-01', '2018-12-01', '2019-03-01'],
              dtype='datetime64[ns]', freq='QS-JUN')

#### Example:

In [309]:
dates = pd.date_range('10-01-2016', periods = 9, freq = '2W-SUN')

df = pd.DataFrame({'Count 1': 100 + np.random.randint(-5, 10, 9).cumsum(),
                   'Count 2': 120 + np.random.randint(-5, 10, 9)},
                  index = dates)

df


Unnamed: 0,Count 1,Count 2
2016-10-02,107,128
2016-10-16,110,118
2016-10-30,109,119
2016-11-13,105,115
2016-11-27,113,120
2016-12-11,111,122
2016-12-25,108,120
2017-01-08,109,117
2017-01-22,108,115


#### .weekday_name

In [310]:
# First, check what day of the week a specific date is
df.index.weekday_name


Index(['Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday',
       'Sunday', 'Sunday'],
      dtype='object')

#### .diff( )

In [312]:
# find the difference between each date's value (e.g. column1's index 1 value - index 2 value)
df.diff()


Unnamed: 0,Count 1,Count 2
2016-10-02,,
2016-10-16,3.0,-10.0
2016-10-30,-1.0,1.0
2016-11-13,-4.0,-4.0
2016-11-27,8.0,5.0
2016-12-11,-2.0,2.0
2016-12-25,-3.0,-2.0
2017-01-08,1.0,-3.0
2017-01-22,-1.0,-2.0


#### .resample(' ')

In [314]:
df.resample('M').mean()


Unnamed: 0,Count 1,Count 2
2016-10-31,108.666667,121.666667
2016-11-30,109.0,117.5
2016-12-31,109.5,121.0
2017-01-31,108.5,116.0


#### datetime indexing and slicing

In [315]:
df['2017']


Unnamed: 0,Count 1,Count 2
2017-01-08,109,117
2017-01-22,108,115


In [316]:
df['2016-12']


Unnamed: 0,Count 1,Count 2
2016-12-11,111,122
2016-12-25,108,120


In [317]:
df['2016-12': ]     # onwards


Unnamed: 0,Count 1,Count 2
2016-12-11,111,122
2016-12-25,108,120
2017-01-08,109,117
2017-01-22,108,115
