# Essential DataFrame Operations

In [1]:
import pandas as pd
import numpy as np


## Introduction

## Selecting Multiple DataFrame Columns

### How to do it\...

In [2]:
# Load the movie dataset, then select several columns at once by passing a
# list of labels to the indexing operator.
movies = pd.read_csv('datasets/movie.csv')
movie_actor_director = movies[[
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'director_name',
]]
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [3]:
# Indexing with a one-element list still produces a DataFrame.
type(movies[['director_name']])

pandas.core.frame.DataFrame

In [4]:
# Indexing with a bare label produces a Series.
type(movies['director_name'])

pandas.core.series.Series

In [5]:
# The same rule holds for .loc: a list of labels gives a DataFrame.
type(movies.loc[:, ['director_name']])

pandas.core.frame.DataFrame

In [6]:
# ...and a single label through .loc gives a Series.
type(movies.loc[:, 'director_name'])

pandas.core.series.Series

In [7]:
# Keep the labels in a named list so the selection itself stays short.
cols = [
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
    'director_name',
]
movie_actor_director = movies[cols]

In [8]:
# Inline list-of-labels selection; with no .head() the (truncated) full frame
# is displayed.
movies[[
    'actor_1_name', 'actor_2_name',
    'actor_3_name', 'director_name',
]]

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker
...,...,...,...,...
5038,Eric Mabius,Daphne Zuniga,Crystal Lowe,Scott Smith
5039,Natalie Zea,Valorie Curry,Sam Underwood,
5040,Eva Boehnke,Maxwell Moody,David Chandler,Benjamin Roberds
5041,Alan Ruck,Daniel Henney,Eliza Coupe,Daniel Hsia


## Selecting Columns with Methods

### How it works\...

In [9]:
# Reload the movie dataset from disk.
movies = pd.read_csv('datasets/movie.csv')
def shorten(col):
    """Return `col` with 'facebook_likes' cut to 'fb' and '_for_reviews' removed."""
    abbreviated = col.replace('facebook_likes', 'fb')
    return abbreviated.replace('_for_reviews', '')
# Passing a callable to `columns=` applies it to every column label.
movies = movies.rename(columns=shorten)
# `DataFrame.get_dtype_counts()` has been removed from pandas; counting the
# values of `.dtypes` gives the same per-dtype column tally.
movies.dtypes.value_counts()

In [10]:
# include='int' keeps only the integer-typed columns.
movies.select_dtypes(include='int').head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [11]:
# include='number' keeps every numeric column, integer and float alike.
movies.select_dtypes(include='number').head()

Unnamed: 0,num_critic,duration,director_fb,actor_3_fb,actor_1_fb,gross,num_voted_users,cast_total_fb,facenumber_in_poster,num_user,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [12]:
# A list of dtypes keeps the columns matching any of them
# (here the integer and the object columns).
movies.select_dtypes(include=['int', 'object']).head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


In [13]:
# exclude= works the other way round: every float column is left out.
movies.select_dtypes(exclude='float').head()

Unnamed: 0,color,director_name,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,cast_total_fb,actor_3_name,plot_keywords,movie_imdb_link,language,country,content_rating,movie_fb
0,Color,James Cameron,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13,33000
1,Color,Gore Verbinski,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13,0
2,Color,Sam Mendes,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,11700,Stephanie Sigman,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13,85000
3,Color,Christopher Nolan,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,106759,Joseph Gordon-Levitt,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13,164000
4,,Doug Walker,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,8,143,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,0


In [14]:
# like= keeps the columns whose label contains the given substring.
movies.filter(like='fb').head()

Unnamed: 0,director_fb,actor_3_fb,actor_1_fb,cast_total_fb,actor_2_fb,movie_fb
0,0.0,855.0,1000.0,4834,936.0,33000
1,563.0,1000.0,40000.0,48350,5000.0,0
2,0.0,161.0,11000.0,11700,393.0,85000
3,22000.0,23000.0,27000.0,106759,23000.0,164000
4,131.0,,131.0,143,12.0,0


In [15]:
# items= keeps the columns whose labels appear in the given list.
cols = ['actor_1_name', 'actor_2_name',
        'actor_3_name', 'director_name']
movies.filter(items=cols).head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [16]:
# regex= keeps the columns whose label matches the pattern; r'\d' matches
# any label that contains a digit.
movies.filter(regex=r'\d').head()

Unnamed: 0,actor_3_fb,actor_2_name,actor_1_fb,actor_1_name,actor_3_name,actor_2_fb
0,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
1,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
2,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
3,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,131.0,Doug Walker,,12.0


## Ordering Column Names

### How to do it\...

In [17]:
# Reload the data and shorten the verbose column labels.
movies = pd.read_csv('datasets/movie.csv')


def shorten(col):
    """Return `col` with 'facebook_likes' cut to 'fb' and '_for_reviews' removed."""
    abbreviated = col.replace('facebook_likes', 'fb')
    return abbreviated.replace('_for_reviews', '')


movies = movies.rename(columns=shorten)

In [18]:
# Inspect the column labels after the rename.
movies.columns

Index(['color', 'director_name', 'num_critic', 'duration', 'director_fb',
       'actor_3_fb', 'actor_2_name', 'actor_1_fb', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_fb',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_fb', 'imdb_score', 'aspect_ratio',
       'movie_fb'],
      dtype='object')

In [19]:
# Group the column labels by theme: `cat_*` lists hold categorical columns,
# `cont_*` lists hold continuous ones.
cat_core = ['movie_title', 'title_year', 'content_rating', 'genres']
cat_people = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']
cat_other = [
    'color', 'country', 'language',
    'plot_keywords', 'movie_imdb_link',
]
cont_fb = [
    'director_fb', 'actor_1_fb', 'actor_2_fb',
    'actor_3_fb', 'cast_total_fb', 'movie_fb',
]
cont_finance = ['budget', 'gross']
cont_num_reviews = ['num_voted_users', 'num_user', 'num_critic']
cont_other = [
    'imdb_score', 'duration',
    'aspect_ratio', 'facenumber_in_poster',
]

In [20]:
# Concatenate the groups into the desired column order, then check that the
# new order names exactly the same set of columns as the frame.
new_col_order = (
    cat_core + cat_people + cat_other
    + cont_fb + cont_finance + cont_num_reviews
    + cont_other
)
set(movies.columns) == set(new_col_order)

True

In [21]:
# Selecting with the full list of labels returns the columns in that order.
movies[new_col_order].head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,...,movie_fb,budget,gross,num_voted_users,num_user,num_critic,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,...,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,...,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,...,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,...,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens ...,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,...,0,,,8,,,7.1,,,0.0


## Summarizing a DataFrame

### How to do it\...

In [22]:
# Reload the dataset with its original labels; shape is (rows, columns).
movies = pd.read_csv('datasets/movie.csv')
movies.shape

(5043, 28)

In [23]:
# size is the total number of elements (rows * columns).
movies.size

141204

In [24]:
# ndim is the number of dimensions; for this DataFrame it is 2.
movies.ndim

2

In [25]:
# len() of a DataFrame is its number of rows.
len(movies)

5043

In [26]:
# count() reports the number of non-missing values in each column.
movies.count()

color                        5024
director_name                4939
num_critic_for_reviews       4993
duration                     5028
director_facebook_likes      4939
actor_3_facebook_likes       5020
actor_2_name                 5030
actor_1_facebook_likes       5036
gross                        4159
genres                       5043
actor_1_name                 5036
movie_title                  5043
num_voted_users              5043
cast_total_facebook_likes    5043
actor_3_name                 5020
facenumber_in_poster         5030
plot_keywords                4890
movie_imdb_link              5043
num_user_for_reviews         5022
language                     5031
country                      5038
content_rating               4740
budget                       4551
title_year                   4935
actor_2_facebook_likes       5030
imdb_score                   5043
aspect_ratio                 4714
movie_facebook_likes         5043
dtype: int64

In [27]:
# Minimum of each column; missing values are skipped by default.
# NOTE(review): the recorded output echoes this source line, which suggests a
# warning was emitted — presumably about the non-numeric columns; confirm.
movies.min()

  movies.min()


num_critic_for_reviews                                                     1.0
duration                                                                   7.0
director_facebook_likes                                                    0.0
actor_3_facebook_likes                                                     0.0
actor_1_facebook_likes                                                     0.0
gross                                                                    162.0
genres                                                                  Action
movie_title                                                           #Horror 
num_voted_users                                                              5
cast_total_facebook_likes                                                    0
facenumber_in_poster                                                       0.0
movie_imdb_link              http://www.imdb.com/title/tt0006864/?ref_=fn_t...
num_user_for_reviews                                

In [28]:
# describe() summarises the numeric columns; .T turns each column into a row
# so the statistics read left to right.
movies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic_for_reviews,4993.0,140.1943,121.6017,1.0,50.0,110.0,195.0,813.0
duration,5028.0,107.2011,25.19744,7.0,93.0,103.0,118.0,511.0
director_facebook_likes,4939.0,686.5092,2813.329,0.0,7.0,49.0,194.5,23000.0
actor_3_facebook_likes,5020.0,645.0098,1665.042,0.0,133.0,371.5,636.0,23000.0
actor_1_facebook_likes,5036.0,6560.047,15020.76,0.0,614.0,988.0,11000.0,640000.0
gross,4159.0,48468410.0,68452990.0,162.0,5340987.5,25517500.0,62309437.5,760505800.0
num_voted_users,5043.0,83668.16,138485.3,5.0,8593.5,34359.0,96309.0,1689764.0
cast_total_facebook_likes,5043.0,9699.064,18163.8,0.0,1411.0,3090.0,13756.5,656730.0
facenumber_in_poster,5030.0,1.371173,2.013576,0.0,0.0,1.0,2.0,43.0
num_user_for_reviews,5022.0,272.7708,377.9829,1.0,65.0,156.0,326.0,5060.0


In [29]:
# percentiles= chooses which quantiles to report; the median (50%) is still
# included alongside them.
movies.describe(percentiles=[.01, .3, .99]).T

Unnamed: 0,count,mean,std,min,1%,30%,50%,99%,max
num_critic_for_reviews,4993.0,140.1943,121.6017,1.0,2.0,61.0,110.0,548.08,813.0
duration,5028.0,107.2011,25.19744,7.0,43.0,95.0,103.0,189.0,511.0
director_facebook_likes,4939.0,686.5092,2813.329,0.0,0.0,11.0,49.0,16000.0,23000.0
actor_3_facebook_likes,5020.0,645.0098,1665.042,0.0,0.0,177.0,371.5,11000.0,23000.0
actor_1_facebook_likes,5036.0,6560.047,15020.76,0.0,7.0,700.0,988.0,44000.0,640000.0
gross,4159.0,48468410.0,68452990.0,162.0,8497.8,8131359.8,25517500.0,333573600.0,760505800.0
num_voted_users,5043.0,83668.16,138485.3,5.0,54.26,12237.8,34359.0,681094.3,1689764.0
cast_total_facebook_likes,5043.0,9699.064,18163.8,0.0,6.42,1723.2,3090.0,63027.24,656730.0
facenumber_in_poster,5030.0,1.371173,2.013576,0.0,0.0,0.0,1.0,8.0,43.0
num_user_for_reviews,5022.0,272.7708,377.9829,1.0,1.21,81.3,156.0,2010.11,5060.0


In [30]:
# With skipna=False, a column that contains a missing value reports NaN.
movies.min(skipna=False)

  movies.min(skipna=False)


num_critic_for_reviews                                                     NaN
duration                                                                   NaN
director_facebook_likes                                                    NaN
actor_3_facebook_likes                                                     NaN
actor_1_facebook_likes                                                     NaN
gross                                                                      NaN
genres                                                                  Action
movie_title                                                           #Horror 
num_voted_users                                                              5
cast_total_facebook_likes                                                    0
facenumber_in_poster                                                       NaN
movie_imdb_link              http://www.imdb.com/title/tt0006864/?ref_=fn_t...
num_user_for_reviews                                

## Chaining DataFrame Methods

### How to do it\...

In [31]:
# Reload the data, shorten the verbose column labels, then look at the
# boolean missing-value mask for the first rows.
movies = pd.read_csv('datasets/movie.csv')


def shorten(col):
    """Return `col` with 'facebook_likes' cut to 'fb' and '_for_reviews' removed."""
    abbreviated = col.replace('facebook_likes', 'fb')
    return abbreviated.replace('_for_reviews', '')


movies = movies.rename(columns=shorten)
movies.isnull().head()

Unnamed: 0,color,director_name,num_critic,duration,director_fb,actor_3_fb,actor_2_name,actor_1_fb,gross,genres,...,num_user,language,country,content_rating,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,True,True,False,True,False,False,True,False,...,True,True,True,True,True,True,False,False,True,False


In [32]:
# Chain: missing-value mask -> per-column count of missing values -> first five.
movies.isnull().sum().head()

color             19
director_name    104
num_critic        50
duration          15
director_fb      104
dtype: int64

In [33]:
# Summing the per-column counts gives the total number of missing values.
movies.isnull().sum().sum()

2698

In [34]:
# any().any() answers whether the frame holds at least one missing value.
movies.isnull().any().any()

True

In [35]:
# `get_dtype_counts()` has been removed from pandas; tallying the values of
# `.dtypes` shows how many columns of each dtype `isnull()` produces.
movies.isnull().dtypes.value_counts()

### There\'s more\...

In [36]:
# 'color' has missing values while 'movie_title' has none; only the latter
# shows up in the recorded result. Each column is selected once.
movies[['color', 'movie_title']].max()

  movies[['color', 'movie_title', 'color']].max()


movie_title    Æon Flux 
dtype: object

In [37]:
# Only the last top-level expression of a cell is echoed automatically, and a
# `with` block is a statement, so the result is printed explicitly while the
# option context is still active. Filling missing values with '' first lets
# max() be computed for every object column.
with pd.option_context('max_colwidth', 20):
    print(movies.select_dtypes(['object']).fillna('').max())

In [38]:
# Same computation written as a vertical method chain. The result is printed
# explicitly because an expression nested inside a `with` block is not echoed
# by the notebook.
with pd.option_context('max_colwidth', 20):
    print(
        movies
        .select_dtypes(['object'])
        .fillna('')
        .max()
    )

## DataFrame Operations

In [39]:
# NOTE(review): the next cell reads the same file again with
# index_col='INSTNM' and rebinds `colleges`, so this load looks redundant —
# confirm before removing.
colleges = pd.read_csv('datasets/college.csv')

In [40]:
# Use the institution name as the index and keep only the columns whose
# label contains 'UGDS_'.
colleges = pd.read_csv('datasets/college.csv', index_col='INSTNM')
college_ugds = colleges.filter(like='UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [41]:
# .loc with a single index label returns that row as a Series.
name = 'Northwest-Shoals Community College'
college_ugds.loc[name]

UGDS_WHITE    0.7912
UGDS_BLACK    0.1250
UGDS_HISP     0.0339
UGDS_ASIAN    0.0036
UGDS_AIAN     0.0088
UGDS_NHPI     0.0006
UGDS_2MOR     0.0012
UGDS_NRA      0.0033
UGDS_UNKN     0.0324
Name: Northwest-Shoals Community College, dtype: float64

In [42]:
# Round to two decimals; note that UGDS_BLACK (0.1250) comes out as 0.12.
college_ugds.loc[name].round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.12
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [43]:
# Nudging the values up slightly before rounding turns 0.1250 into 0.13.
(college_ugds.loc[name] + .0001).round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.13
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [44]:
# Adding a scalar to a DataFrame adds it to every element.
college_ugds + .00501

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03831,0.94031,0.01051,0.00691,0.00741,0.00691,0.00501,0.01091,0.01881
University of Alabama at Birmingham,0.59721,0.26501,0.03331,0.05681,0.00721,0.00571,0.04181,0.02291,0.01501
Amridge University,0.30401,0.42421,0.01191,0.00841,0.00501,0.00501,0.00501,0.00501,0.27651
University of Alabama in Huntsville,0.70381,0.13051,0.04321,0.04261,0.01931,0.00521,0.02221,0.03821,0.04001
Alabama State University,0.02081,0.92581,0.01711,0.00691,0.00601,0.00561,0.01481,0.02931,0.01871
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [45]:
# Floor-dividing by .01 leaves whole hundredths (percentage points).
(college_ugds + .00501) // .01

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,3.0,94.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
University of Alabama at Birmingham,59.0,26.0,3.0,5.0,0.0,0.0,4.0,2.0,1.0
Amridge University,30.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,27.0
University of Alabama in Huntsville,70.0,13.0,4.0,4.0,1.0,0.0,2.0,3.0,4.0
Alabama State University,2.0,92.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [46]:
# Rounding built from operators: add just over half a hundredth, floor to
# whole hundredths, then divide by 100 to get back to a fraction.
college_ugds_op_round = (college_ugds + .00501) // .01 / 100
college_ugds_op_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.0,0.0,0.0,0.01,0.02,0.01


In [47]:
# The same rounding with the round() method; a tiny offset is added first
# (compare the 0.1250 -> 0.13 example earlier in this section).
college_ugds_round = (college_ugds + .00001).round(2)
college_ugds_round

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.00,0.00,0.0,0.00,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.00,0.0,0.04,0.02,0.01
Amridge University,0.30,0.42,0.01,0.00,0.00,0.0,0.00,0.00,0.27
University of Alabama in Huntsville,0.70,0.13,0.04,0.04,0.01,0.0,0.02,0.03,0.04
Alabama State University,0.02,0.92,0.01,0.00,0.00,0.0,0.01,0.02,0.01
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,,,,,,,
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,


In [48]:
# equals() checks that the two frames match element for element; both
# rounding approaches give the same result.
college_ugds_op_round.equals(college_ugds_round)

True

In [49]:
# Method spelling of the operator pipeline: add -> floor-divide -> divide.
college2 = college_ugds.add(.00501).floordiv(.01).div(100)
college2.equals(college_ugds_op_round)

True

## Comparing Missing Values

In [50]:
# NaN does not compare equal to anything, itself included.
np.nan == np.nan

False

In [51]:
# None, by contrast, does compare equal to itself.
None == None

True

In [52]:
# An ordering comparison with NaN on the left is False.
np.nan > 5

False

In [53]:
# The same comparison with NaN on the right is False as well.
5 > np.nan

False

In [54]:
# Inequality (!=) against NaN evaluates to True.
np.nan != 5

True

### Getting ready

In [55]:
# Load the college data indexed by institution name and keep the columns
# whose label contains 'UGDS_'.
college = pd.read_csv('datasets/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [56]:
# Comparing a DataFrame with a scalar yields a boolean DataFrame with the
# same index and columns.
college_ugds == .0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,False,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


In [57]:
# Compare the frame with itself element by element; the first rows shown
# are all True.
college_self_compare = college_ugds == college_ugds
college_self_compare.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True
Amridge University,True,True,True,True,True,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,True,True,True,True,True
Alabama State University,True,True,True,True,True,True,True,True,True


In [58]:
# Every column reports False: the missing values in each column do not
# compare equal to themselves.
college_self_compare.all()

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [59]:
# Looking for missing values with `== np.nan` finds none, because NaN never
# compares equal.
(college_ugds == np.nan).sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [60]:
# isnull() is the way to detect missing values; here it counts them per column.
college_ugds.isnull().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

In [61]:
# equals() treats missing values in the same location as equal, so the
# frame matches itself.
college_ugds.equals(college_ugds)

True

### There\'s more\...

In [62]:
# eq() is the method form of the == operator.
college_ugds.eq(.0019)    # same as college_ugds == .0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,False,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False


In [63]:
# assert_frame_equal raises an AssertionError when the frames differ and
# returns None when they match, hence the comparison with None.
from pandas.testing import assert_frame_equal
assert_frame_equal(college_ugds, college_ugds) is None

True

## Transposing the Direction of a DataFrame Operation

### How to do it\...

In [64]:
# Reload the college data indexed by institution name and keep the columns
# whose label contains 'UGDS_'.
college = pd.read_csv('datasets/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [65]:
# With the default axis, count() works down each column.
college_ugds.count()

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [66]:
# axis='columns' makes the operation run across each row instead,
# giving one result per institution.
college_ugds.count(axis='columns').head()

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [67]:
# Sum across each row; the rows shown total 1 (or very nearly 1).
college_ugds.sum(axis='columns').head()

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [68]:
# axis='index' works down the rows, giving one median per column.
college_ugds.median(axis='index')

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

### There\'s more\...

In [69]:
# axis=1 accumulates from left to right within each row, so the last column
# holds the row total.
college_ugds_cumsum = college_ugds.cumsum(axis=1)
college_ugds_cumsum.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,0.9784,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,0.9345,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,0.7285,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,0.9144,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,0.9516,0.9522,0.962,0.9863,1.0


## Determining College Campus Diversity

In [70]:
# Display the diversity-index table, using the 'School' column as the index.
pd.read_csv('datasets/college_diversity.csv', index_col='School')

Unnamed: 0_level_0,Diversity Index
School,Unnamed: 1_level_1
"Rutgers University--Newark Newark, NJ",0.76
"Andrews University Berrien Springs, MI",0.74
"Stanford University Stanford, CA",0.74
"University of Houston Houston, TX",0.74
"University of Nevada--Las Vegas Las Vegas, NV",0.74
"University of San Francisco San Francisco, CA",0.74
"San Francisco State University San Francisco, CA",0.73
"University of Illinois--Chicago Chicago, IL",0.73
"New Jersey Institute of Technology Newark, NJ",0.72
"Texas Woman's University Denton, TX",0.72


### How to do it\...

In [71]:
# Reload the college data indexed by institution name and keep the columns
# whose label contains 'UGDS_'.
college = pd.read_csv('datasets/college.csv', index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')

In [72]:
# Count the missing values in each row and list the rows with the most.
(college_ugds.isnull()
   .sum(axis='columns')
   .sort_values(ascending=False)
   .head()
)

INSTNM
Excel Learning Center-San Antonio South              9
Western State College of Law at Argosy University    9
Albany Law School                                    9
Albany Medical College                               9
A T Still University of Health Sciences              9
dtype: int64

In [73]:
# how='all' drops only the rows in which every value is missing; afterwards
# no column has a missing value left.
college_ugds = college_ugds.dropna(how='all')
college_ugds.isnull().sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [74]:
# ge(.15) flags every value that is greater than or equal to 0.15.
college_ugds.ge(.15)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,True,False,False,False,False,False,False,False
University of Alabama at Birmingham,True,True,False,False,False,False,False,False,False
Amridge University,True,True,False,False,False,False,False,False,True
University of Alabama in Huntsville,True,False,False,False,False,False,False,False,False
Alabama State University,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,True,True,True,False,False,False,False,False,False
Hollywood Institute of Beauty Careers-Casselberry,False,True,True,False,False,False,False,False,False
Coachella Valley Beauty College-Beaumont,True,False,True,False,False,False,False,False,False
Dewey University-Mayaguez,False,False,True,False,False,False,False,False,False


In [75]:
# Per row, count how many columns are at or above the 0.15 threshold.
diversity_metric = college_ugds.ge(.15).sum(axis='columns')
diversity_metric.head()

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [76]:
# How many institutions fall at each value of the metric.
diversity_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [77]:
# Institutions with the highest metric values come first.
diversity_metric.sort_values(ascending=False).head()

INSTNM
Central Texas Beauty College-Temple                               5
Regency Beauty Institute-Austin                                   5
Westwood College-O'Hare Airport                                   4
Regency Beauty Institute-Pasadena                                 4
Soma Institute-The National School of Clinical Massage Therapy    4
dtype: int64

In [78]:
# Look at the underlying values for the two institutions that scored 5.
college_ugds.loc[['Regency Beauty Institute-Austin',
                   'Central Texas Beauty College-Temple']]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515


In [79]:
# Look up the metric for a hand-picked list of institutions, written with
# the names as they appear in this dataset's index.
us_news_top = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]
diversity_metric.loc[us_news_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64

### There\'s more\...

In [80]:
# Largest single value in each row, highest first; the top ten are all 1.0.
(college_ugds
   .max(axis=1)
   .sort_values(ascending=False)
   .head(10)
)

INSTNM
Caribbean University-Ponce                                        1.0
Brighton Institute of Cosmetology                                 1.0
Mesivta Torah Vodaath Rabbinical Seminary                         1.0
Rabbinical College Telshe                                         1.0
University of Puerto Rico-Mayaguez                                1.0
Haskell Indian Nations University                                 1.0
Lake Career and Technical Center                                  1.0
Leon Studio One School of Hair Design & Career Training Center    1.0
Dewey University-Hato Rey                                         1.0
Columbia Central University-Caguas                                1.0
dtype: float64

In [81]:
# Is there any institution in which every column exceeds 0.01?
(college_ugds > .01).all(axis=1).any()

True