In [1]:
import numpy as np 
import pandas as pd

In [32]:
pd.set_option("max_columns", 4, "max_rows", 10)

In [2]:
movies = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/movie.csv')

## Selecting multiple DataFrame columns

In [6]:
movies_actor_director = movies[
    [
    "actor_1_name",
    "actor_2_name",
    "actor_3_name", 
    "director_name",
    ]
    
]

In [4]:
movies_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [7]:
type(movies["director_name"])

pandas.core.series.Series

In [8]:
type(movies[["director_name"]])

pandas.core.frame.DataFrame

In [10]:
# We can also use .loc to pull out a column by name.
type(movies.loc[:, "director_name"])

pandas.core.series.Series

In [12]:
type(movies.loc[:, ["director_name"]])

pandas.core.frame.DataFrame

In [16]:
cols = [
    
        "actor_1_name",
        "actor_2_name",
        "actor_3_name",
        "director_name",
    
]


movies_actor_director = movies[cols]


In [17]:
movies_actor_director 

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker
...,...,...,...,...
4911,Eric Mabius,Daphne Zuniga,Crystal Lowe,Scott Smith
4912,Natalie Zea,Valorie Curry,Sam Underwood,
4913,Eva Boehnke,Maxwell Moody,David Chandler,Benjamin Roberds
4914,Alan Ruck,Daniel Henney,Eliza Coupe,Daniel Hsia


In [18]:
# One of the most common exceptions raised when 
# working with pandas is KeyError.
# This error is mainly due to mistyping of a column
# or index name
cols = movies[
    
        "actor_1_name",
        "actor_2_name",
        "actor_3_name",
        "director_name",
    
]

KeyError: ('actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name')

## Selecting columns with method

In [24]:
def shorten(col):
    return (
        str(col)
        .replace("facebook_likes", "fb")
        .replace("_for_reviews", "")
    )

In [25]:
movies = movies.rename(columns=shorten)

In [26]:
# Use the .get_dtype_counts method to output the 
#number of columns with each specific data type:
movies.dtypes.value_counts()

float64    13
object     12
int64       3
dtype: int64

In [27]:
# Use the .select_dtypes method to select only 
# the integer columns:

movies.select_dtypes(include="int").head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [28]:
# you would like to select all the numeric columns,
# you may pass the string number to the include parameter:
movies.select_dtypes(include="number").head()

Unnamed: 0,num_critic,duration,director_fb,actor_3_fb,actor_1_fb,gross,num_voted_users,cast_total_fb,facenumber_in_poster,num_user,budget,title_year,actor_2_fb,imdb_score,aspect_ratio,movie_fb
0,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
1,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
2,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
3,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


In [33]:
# If we wanted integer and string columns we could
# do the following
movies.select_dtypes(include=["int", "object"]).head()

Unnamed: 0,color,director_name,...,content_rating,movie_fb
0,Color,James Cameron,...,PG-13,33000
1,Color,Gore Verbinski,...,PG-13,0
2,Color,Sam Mendes,...,PG-13,85000
3,Color,Christopher Nolan,...,PG-13,164000
4,,Doug Walker,...,,0


In [34]:
# To exclude only floating-point columns,
# do the following:
movies.select_dtypes(exclude="float").head()

Unnamed: 0,color,director_name,...,content_rating,movie_fb
0,Color,James Cameron,...,PG-13,33000
1,Color,Gore Verbinski,...,PG-13,0
2,Color,Sam Mendes,...,PG-13,85000
3,Color,Christopher Nolan,...,PG-13,164000
4,,Doug Walker,...,,0


In [36]:
# An alternative method to select columns is with 
# the .filter method. This method is flexible and
# searches column names (or index labels) based on
# which parameter is used.
movies.filter(like="fb").head()

Unnamed: 0,director_fb,actor_3_fb,...,actor_2_fb,movie_fb
0,0.0,855.0,...,936.0,33000
1,563.0,1000.0,...,5000.0,0
2,0.0,161.0,...,393.0,85000
3,22000.0,23000.0,...,23000.0,164000
4,131.0,,...,12.0,0


In [37]:
# The .filter method has more tricks (or parameters) 
# up its sleeve. If you use the items parameters,
# you can pass in a list of column names:
cols = [
    "actor_1_name",
    "actor_2_name",
    "actor_3_name",
    "director_name",
]

In [40]:
movies.filter(items=cols).head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [41]:
# The .filter method allows columns to be searched
# with regular expressions using the regex parameter
movies.filter(regex=r"\d").head()

Unnamed: 0,actor_3_fb,actor_2_name,...,actor_3_name,actor_2_fb
0,855.0,Joel David Moore,...,Wes Studi,936.0
1,1000.0,Orlando Bloom,...,Jack Davenport,5000.0
2,161.0,Rory Kinnear,...,Stephanie Sigman,393.0
3,23000.0,Christian Bale,...,Joseph Gordon-Levitt,23000.0
4,,Rob Walker,...,,12.0


In [46]:
movies.select_dtypes(include=np.float64).head()

Unnamed: 0,num_critic,duration,...,imdb_score,aspect_ratio
0,723.0,178.0,...,7.9,1.78
1,302.0,169.0,...,7.1,2.35
2,602.0,148.0,...,6.8,2.35
3,813.0,164.0,...,8.5,2.35
4,,,...,7.1,


In [47]:
movies.select_dtypes(include=np.number).head()

Unnamed: 0,num_critic,duration,...,aspect_ratio,movie_fb
0,723.0,178.0,...,1.78,33000
1,302.0,169.0,...,2.35,0
2,602.0,148.0,...,2.35,85000
3,813.0,164.0,...,2.35,164000
4,,,...,,0


In [48]:
movies.select_dtypes(include=np.object).head()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  movies.select_dtypes(include=np.object).head()


Unnamed: 0,color,director_name,...,country,content_rating
0,Color,James Cameron,...,USA,PG-13
1,Color,Gore Verbinski,...,USA,PG-13
2,Color,Sam Mendes,...,UK,PG-13
3,Color,Christopher Nolan,...,USA,PG-13
4,,Doug Walker,...,,


In [49]:
movies.select_dtypes(include="int64").head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


In [51]:
movies.select_dtypes(include="integer").head()

Unnamed: 0,num_voted_users,cast_total_fb,movie_fb
0,886204,4834,33000
1,471220,48350,0
2,275868,11700,85000
3,1144337,106759,164000
4,8,143,0


## Ordering columns names


In [53]:
def shorten(col):
    return (
        str(col)
        .replace("facebook_likes", "fb")
        .replace("_for_reviews", "")
           )

In [55]:
movies = movies.rename(columns=shorten)

In [79]:
# Output all the column names and scan for similar
# categorical and continuous columns:
movies.columns.value_counts()

color            1
director_name    1
aspect_ratio     1
imdb_score       1
actor_2_fb       1
                ..
actor_3_fb       1
director_fb      1
duration         1
num_critic       1
movie_fb         1
Length: 28, dtype: int64

In [87]:
# The columns don't appear to have any logical ordering 
# to them. Organize the names sensibly into lists
# so that the guideline from the previous section 
# is followed:

cat_core = [
   "movie_title",
   "title_year",
    "content_rating",
    "genres"
    
]


cat_people = [
    "director_name",
   "actor_1_name",
   "actor_2_name",
   "actor_3_name",
    
]

cat_other = [
   "color",
   "country",
   "language",
   "plot_keywords",
   "movie_imdb_link",
]

cont_fb = [
    "director_fb",
    "actor_1_fb",
    "actor_2_fb",
    "actor_3_fb",
    "cast_total_fb",
    "movie_fb",
]

cont_finance = [
    "budget",
    "gross"
]

cont_num_reviews = [
    "num_voted_users",
    "num_user",
    "num_critic",
]

cont_other = [
    "imdb_score",
    "duration",
    "aspect_ratio",
   "facenumber_in_poster",
]

In [88]:
# Concatenate all the lists together to get the
# final column order. Also, ensure that this
# list contains all the columns from the original:
new_col_order = (
       cat_core
     + cat_people
     + cat_other
     + cont_fb
     + cont_finance 
     + cont_num_reviews
     + cont_other
 )

In [89]:
set(movies.columns) == set(new_col_order)

True

In [90]:
# Pass the list with the new column order to the 
# indexing operator of the DataFrame to
# reorder the columns:
movies[new_col_order].head()

Unnamed: 0,movie_title,title_year,...,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,...,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,...,2.35,0.0
2,Spectre,2015.0,...,2.35,1.0
3,The Dark Knight Rises,2012.0,...,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,...,,0.0


## Summarizing a DataFrame

In [91]:
# Read in the movie dataset, and examine the basic
# descriptive properties, .shape, .size, and .ndim,
# along with running the len function:
movies = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/movie.csv')

In [92]:
movies.shape

(4916, 28)

In [94]:
movies.size

137648

In [95]:
movies.ndim

2

In [96]:
len(movies)

4916

In [98]:
# The .count method shows the number of non-missing 
# values for each column. It is an aggregation method
# as it summarizes every column in a single value. 
# The output is a Series that has the original column
# names as its index:

movies.count()

color                      4897
director_name              4814
num_critic_for_reviews     4867
duration                   4901
director_facebook_likes    4814
                           ... 
title_year                 4810
actor_2_facebook_likes     4903
imdb_score                 4916
aspect_ratio               4590
movie_facebook_likes       4916
Length: 28, dtype: int64

In [103]:
# The other methods that compute summary statistics, 
# .min, .max, .mean, .median, and .std, return Series
# that have the column names of the numeric columns
# in the index and their aggregations as the values:
movies.min()

  movies.min()


num_critic_for_reviews        1.0
duration                      7.0
director_facebook_likes       0.0
actor_3_facebook_likes        0.0
actor_1_facebook_likes        0.0
                            ...  
title_year                 1916.0
actor_2_facebook_likes        0.0
imdb_score                    1.6
aspect_ratio                 1.18
movie_facebook_likes            0
Length: 19, dtype: object

In [104]:
movies.mean()

  movies.mean()


num_critic_for_reviews      137.988905
duration                    107.090798
director_facebook_likes     691.014541
actor_3_facebook_likes      631.276313
actor_1_facebook_likes     6494.488491
                              ...     
title_year                 2002.447609
actor_2_facebook_likes     1621.923516
imdb_score                    6.437429
aspect_ratio                  2.222349
movie_facebook_likes       7348.294142
Length: 16, dtype: float64

In [105]:
movies.max()

  movies.max()


num_critic_for_reviews        813.0
duration                      511.0
director_facebook_likes     23000.0
actor_3_facebook_likes      23000.0
actor_1_facebook_likes     640000.0
                             ...   
title_year                   2016.0
actor_2_facebook_likes     137000.0
imdb_score                      9.5
aspect_ratio                   16.0
movie_facebook_likes         349000
Length: 19, dtype: object

In [107]:
# The .describe method is very powerful and calculates
# all the descriptive statistics and quartiles at once.
# The end result is a DataFrame with the descriptive 
# statistics names as its index. I like to transpose 
# the results using .T as I can usually fit more
# information on the screen that way:
movies.describe()

Unnamed: 0,num_critic_for_reviews,duration,...,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,...,4590.0,4916.0
mean,137.988905,107.090798,...,2.222349,7348.294142
std,120.239379,25.286015,...,1.40294,19206.016458
min,1.0,7.0,...,1.18,0.0
25%,49.0,93.0,...,1.85,0.0
50%,108.0,103.0,...,2.35,159.0
75%,191.0,118.0,...,2.35,2000.0
max,813.0,511.0,...,16.0,349000.0


In [108]:
movies.describe().T

Unnamed: 0,count,mean,...,75%,max
num_critic_for_reviews,4867.0,137.988905,...,191.00,813.0
duration,4901.0,107.090798,...,118.00,511.0
director_facebook_likes,4814.0,691.014541,...,189.75,23000.0
actor_3_facebook_likes,4893.0,631.276313,...,633.00,23000.0
actor_1_facebook_likes,4909.0,6494.488491,...,11000.00,640000.0
...,...,...,...,...,...
title_year,4810.0,2002.447609,...,2011.00,2016.0
actor_2_facebook_likes,4903.0,1621.923516,...,912.00,137000.0
imdb_score,4916.0,6.437429,...,7.20,9.5
aspect_ratio,4590.0,2.222349,...,2.35,16.0


In [109]:
# It is possible to specify exact quantiles in the 
# .describe method using the percentiles parameter:
movies.describe(percentiles=[0.01, 0.3, 0.99]).T

Unnamed: 0,count,mean,...,99%,max
num_critic_for_reviews,4867.0,137.988905,...,546.68,813.0
duration,4901.0,107.090798,...,189.00,511.0
director_facebook_likes,4814.0,691.014541,...,16000.00,23000.0
actor_3_facebook_likes,4893.0,631.276313,...,11000.00,23000.0
actor_1_facebook_likes,4909.0,6494.488491,...,44920.00,640000.0
...,...,...,...,...,...
title_year,4810.0,2002.447609,...,2016.00,2016.0
actor_2_facebook_likes,4903.0,1621.923516,...,17000.00,137000.0
imdb_score,4916.0,6.437429,...,8.50,9.5
aspect_ratio,4590.0,2.222349,...,4.00,16.0


In [110]:
# To see how the .skipna parameter affects the outcome, 
# we can set its value to False and rerun step 3 from 
# the preceding recipe. Only numeric columns without
# missing values will calculate a result
movies.min(skipna=False)

  movies.min(skipna=False)


num_critic_for_reviews     NaN
duration                   NaN
director_facebook_likes    NaN
actor_3_facebook_likes     NaN
actor_1_facebook_likes     NaN
                          ... 
title_year                 NaN
actor_2_facebook_likes     NaN
imdb_score                 1.6
aspect_ratio               NaN
movie_facebook_likes         0
Length: 19, dtype: object

## Chaining DataFrame methods

In [175]:
movies = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/movie.csv')

In [114]:
movies = movies.rename(columns=shorten)

In [115]:
# We will use the .isnull method to get a count of the
# missing values. This method will change every value
# to a Boolean, indicating whether it is missing
movies.isnull().head()

Unnamed: 0,color,director_name,...,aspect_ratio,movie_fb
0,False,False,...,False,False
1,False,False,...,False,False
2,False,False,...,False,False
3,False,False,...,False,False
4,True,False,...,True,False


In [125]:
# We will chain the .sum method that interprets True
# and False as 1 and 0, respectively. Because this 
# is a reduction method, it aggregates the results
# into a Series:
movies.isnull().sum().head()

color             19
director_name    102
num_critic        49
duration          15
director_fb      102
dtype: int64

In [126]:
# We can go one step further and take the sum of this
# Series and return the count of the total number of
# missing values in the entire DataFrame 
# as a scalar value
movies.isnull().sum().sum()

2654

In [132]:
# A way to determine whether there are any missing
# values in the DataFrame is to use the 
# .any method twice in succession:
movies.isnull().any().any()

True

In [134]:
# The .isnull method returns a DataFrame the same size
# as the calling DataFrame but with all values 
# transformed to Booleans. See the counts of the
# following data types to verify this
movies.isnull().dtypes.value_counts()

bool    28
dtype: int64

In [137]:
movies[["color", "movie_title", "color"]].max()

  movies[["color", "movie_title", "color"]].max()


movie_title    Æon Flux
dtype: object

In [138]:
# To force pandas to return something for each column,
# we must fill in the missing values. Here,
# we choose an empty string:
movies.select_dtypes(["object"]).fillna("").max()

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
                                         ...                        
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
Length: 12, dtype: object

In [139]:
# For purposes of readability, method chains are often
# written as one method call per line surrounded 
# by parentheses
(movies.select_dtypes(["object"]).fillna("").max())

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
                                         ...                        
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
Length: 12, dtype: object

In [178]:
(movies.select_dtypes(include="object").fillna("").max())

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
actor_1_name                                           Óscar Jaenada
                                         ...                        
plot_keywords                                    zombie|zombie spoof
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
Length: 12, dtype: object

## DataFrame operations

In [142]:
college = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/college.csv')

In [144]:
college + 5

TypeError: can only concatenate str (not "int") to str

In [145]:
colleges = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/college.csv', index_col="INSTNM")

In [146]:
college_ugds = colleges.filter(like="UGDS_")

In [148]:
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.0333,0.9353,...,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,...,0.0179,0.01
Amridge University,0.299,0.4192,...,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,...,0.0332,0.035
Alabama State University,0.0158,0.9208,...,0.0243,0.0137


In [158]:
name = "Northwest-Shoals Community College"
college_ugds.loc[name]

UGDS_WHITE    0.7912
UGDS_BLACK    0.1250
UGDS_HISP     0.0339
UGDS_ASIAN    0.0036
UGDS_AIAN     0.0088
UGDS_NHPI     0.0006
UGDS_2MOR     0.0012
UGDS_NRA      0.0033
UGDS_UNKN     0.0324
Name: Northwest-Shoals Community College, dtype: float64

In [161]:
college_ugds.loc[name].round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.12
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [162]:
# If we add .0001 before rounding, it changes to
# rounding up:
(college_ugds.loc[name] + 0.0001).round(2)

UGDS_WHITE    0.79
UGDS_BLACK    0.13
UGDS_HISP     0.03
UGDS_ASIAN    0.00
UGDS_AIAN     0.01
UGDS_NHPI     0.00
UGDS_2MOR     0.00
UGDS_NRA      0.00
UGDS_UNKN     0.03
Name: Northwest-Shoals Community College, dtype: float64

In [166]:
# Let's do this to the DataFrame. To begin our rounding 
# adventure with operators, we will first add .00501
# to each value of college_ugds:
(college_ugds + 0.00501).head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.03831,0.94031,...,0.01091,0.01881
University of Alabama at Birmingham,0.59721,0.26501,...,0.02291,0.01501
Amridge University,0.30401,0.42421,...,0.00501,0.27651
University of Alabama in Huntsville,0.70381,0.13051,...,0.03821,0.04001
Alabama State University,0.02081,0.92581,...,0.02931,0.01871


In [165]:
# Use the floor division operator, //, to round down
# to the nearest whole number percentage:
(college_ugds + 0.00501) // 0.01

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,3.0,94.0,...,1.0,1.0
University of Alabama at Birmingham,59.0,26.0,...,2.0,1.0
Amridge University,30.0,42.0,...,0.0,27.0
University of Alabama in Huntsville,70.0,13.0,...,3.0,4.0
Alabama State University,2.0,92.0,...,2.0,1.0
...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,...,,
Rasmussen College - Overland Park,,,...,,
National Personal Training Institute of Cleveland,,,...,,
Bay Area Medical Academy - San Jose Satellite Location,,,...,,


In [167]:
# To complete the rounding exercise, divide by 100:
college_ugds_op_round = (
    (college_ugds + 0.00501) // 0.01 / 100
)

In [169]:
college_ugds_op_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.03,0.94,...,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,...,0.02,0.01
Amridge University,0.3,0.42,...,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,...,0.03,0.04
Alabama State University,0.02,0.92,...,0.02,0.01


In [170]:
# Now use the round DataFrame method to do the rounding 
# automatically for us. Due to bankers rounding, 
# we add a small fraction before rounding:
college_ugds_round = (college_ugds + 0.00001).round(2)

In [172]:
college_ugds_round.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.03,0.94,...,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,...,0.02,0.01
Amridge University,0.3,0.42,...,0.0,0.27
University of Alabama in Huntsville,0.7,0.13,...,0.03,0.04
Alabama State University,0.02,0.92,...,0.02,0.01


In [173]:
college_ugds_op_round.equals(college_ugds_round)

True

In [174]:
college_ugds_op_round == college_ugds_round

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,True,True,...,True,True
University of Alabama at Birmingham,True,True,...,True,True
Amridge University,True,True,...,True,True
University of Alabama in Huntsville,True,True,...,True,True
Alabama State University,True,True,...,True,True
...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,...,False,False
Rasmussen College - Overland Park,False,False,...,False,False
National Personal Training Institute of Cleveland,False,False,...,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,...,False,False


## Comparing missing values

In [179]:
np.nan == np.nan

False

In [180]:
None == None

True

In [181]:
# All other comparisons against np.nan also return
# False, except not equal to (!=):
np.nan > 5

False

In [182]:
5 > np.nan

False

In [183]:
np.nan != 5

True

In [184]:
college = pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/college.csv', index_col="INSTNM")

In [185]:
college_ugds = college.filter(like="UGDS_")

In [186]:
# To get an idea of how the equals operator works, 
# let's compare each element to a scalar value:
college_ugds == 0.0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,False,False,...,False,False
University of Alabama at Birmingham,False,False,...,False,False
Amridge University,False,False,...,False,False
University of Alabama in Huntsville,False,False,...,False,False
Alabama State University,False,False,...,False,False
...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,...,False,False
Rasmussen College - Overland Park,False,False,...,False,False
National Personal Training Institute of Cleveland,False,False,...,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,...,False,False


In [188]:
college_self_compare = college_ugds == college_ugds

college_self_compare.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,True,True,...,True,True
University of Alabama at Birmingham,True,True,...,True,True
Amridge University,True,True,...,True,True
University of Alabama in Huntsville,True,True,...,True,True
Alabama State University,True,True,...,True,True


In [189]:
# At first glance, all the values appear to be equal,
# as you would expect. However, using the .all method
#to determine if each column contains only True values
# yields an unexpected result:
college_self_compare.all()

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [190]:
# This happens because missing values do not compare
# equally with one another. If you tried to count 
# missing values using the equal operator and summing 
# up the Boolean columns, you would get zero for 
# each one:
(college_ugds == np.nan).sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [191]:
# instead of using == to find missing numbers, use the 
# .isna method:
college_ugds.isna().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

In [192]:
# The correct way to compare two entire DataFrames 
# with one another is not with the equals operator
# (==) but with the .equals method. This method treats
# NaNs that are in the same location as equal 
# (note that the .eq method is the equivalent of ==):

college_ugds.equals(college_ugds)

True

In [193]:
# the .eq DataFrame method does element-by-element
# comparison, just like the equals (==) operator.
# The .eq method is not at all the same as the .equals
# method. The following code duplicates step 1:
college_ugds.eq(0.0019)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,False,False,...,False,False
University of Alabama at Birmingham,False,False,...,False,False
Amridge University,False,False,...,False,False
University of Alabama in Huntsville,False,False,...,False,False
Alabama State University,False,False,...,False,False
...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,...,False,False
Rasmussen College - Overland Park,False,False,...,False,False
National Personal Training Institute of Cleveland,False,False,...,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,...,False,False


In [195]:
# Inside the pandas.testing sub-package, a function 
# exists that developers should use when creating unit
# tests. The assert_frame_equal function raises an
# AssertionError if two DataFrames are not equal.
# It returns None if the two DataFrames are equal
from pandas.testing import assert_frame_equal
assert_frame_equal(college_ugds, college_ugds) is None

True

## Transposing the direction of a DataFrame operation

In [198]:
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.0333,0.9353,...,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,...,0.0179,0.01
Amridge University,0.299,0.4192,...,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,...,0.0332,0.035
Alabama State University,0.0158,0.9208,...,0.0243,0.0137


In [203]:
# The .count method returns the number of non-missing
# values. By default, its axis parameter is set to 0:
college_ugds.count()

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [204]:
# Changing the axis parameter to 'columns' changes the
# direction of the operation so that we get back a
# count of non-missing items in each row:
college_ugds.count(axis=1).head()  # or use "columns"

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [209]:
# Changing the axis parameter to 'columns' changes the
# direction of the operation so that we get back a 
# count of non-missing items in each row:

college_ugds.sum(axis="columns").head()

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [212]:
# To get an idea of the distribution of each column, 
# the .median method can be used:
college_ugds.median(axis="index")

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

In [213]:
#The .cumsum method with axis=1 accumulates the race
# percentages across each row. It gives a slightly
# different view of the data
college_ugds_cumsum = college_ugds.cumsum(axis="columns")

In [214]:
college_ugds_cumsum

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,0.0333,0.9686,...,0.9862,1.0000
University of Alabama at Birmingham,0.5922,0.8522,...,0.9899,0.9999
Amridge University,0.2990,0.7182,...,0.7285,1.0000
University of Alabama in Huntsville,0.6988,0.8243,...,0.9650,1.0000
Alabama State University,0.0158,0.9366,...,0.9863,1.0000
...,...,...,...,...,...
SAE Institute of Technology San Francisco,,,...,,
Rasmussen College - Overland Park,,,...,,
National Personal Training Institute of Cleveland,,,...,,
Bay Area Medical Academy - San Jose Satellite Location,,,...,,


## Determining college campus diversity

In [216]:
pd.read_csv('C:/Users/justine.o_kobo360/Desktop/Pandas Workbook/Pandas CookBook 1.x/Data files/college_diversity.csv', index_col="School")

Unnamed: 0_level_0,Diversity Index
School,Unnamed: 1_level_1
"Rutgers University--Newark Newark, NJ",0.76
"Andrews University Berrien Springs, MI",0.74
"Stanford University Stanford, CA",0.74
"University of Houston Houston, TX",0.74
"University of Nevada--Las Vegas Las Vegas, NV",0.74
"University of San Francisco San Francisco, CA",0.74
"San Francisco State University San Francisco, CA",0.73
"University of Illinois--Chicago Chicago, IL",0.73
"New Jersey Institute of Technology Newark, NJ",0.72
"Texas Woman's University Denton, TX",0.72


In [219]:
# Many of these colleges have missing values for all
# their race columns. We can count all the missing
# values for each row and sort the resulting Series
# from the highest to lowest.
(
    college_ugds.isnull()
    .sum(axis="columns")
    .sort_values(ascending=False)
    .head()
)

INSTNM
Excel Learning Center-San Antonio South              9
Western State College of Law at Argosy University    9
Albany Law School                                    9
Albany Medical College                               9
A T Still University of Health Sciences              9
dtype: int64

In [222]:
# use the .dropna method to drop all rows that have 
# all nine race percentages missing. We can then 
# count the remaining missing values:
college_ugds = college_ugds.dropna(how="all")
college_ugds.isnull().sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [223]:
# To get started, we will use the greater than or equal 
# DataFrame method, .ge, to return a DataFrame with a
# Boolean value for each cell
college_ugds.ge(0.15)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama A & M University,False,True,...,False,False
University of Alabama at Birmingham,True,True,...,False,False
Amridge University,True,True,...,False,True
University of Alabama in Huntsville,True,False,...,False,False
Alabama State University,False,True,...,False,False
...,...,...,...,...,...
Hollywood Institute of Beauty Careers-West Palm Beach,True,True,...,False,False
Hollywood Institute of Beauty Careers-Casselberry,False,True,...,False,False
Coachella Valley Beauty College-Beaumont,True,False,...,False,False
Dewey University-Mayaguez,False,False,...,False,False


In [225]:
# From here, we can use the .sum method to count the
# True values for each college. Notice that a Series is returned:
diversity_metric = college_ugds.ge(0.15).sum(
    axis="columns"
)

diversity_metric.head()

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [226]:
# To get an idea of the distribution, we will use the 
# .value_counts method on this Series:
diversity_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [227]:
# Amazingly, two schools have more than 15% in five 
# different race categories. Let's sort the 
# diversity_metric Series to find out which ones 
# they are:
diversity_metric.sort_values(ascending=False).head()

INSTNM
Central Texas Beauty College-Temple                               5
Regency Beauty Institute-Austin                                   5
Westwood College-O'Hare Airport                                   4
Regency Beauty Institute-Pasadena                                 4
Soma Institute-The National School of Clinical Massage Therapy    4
dtype: int64

In [228]:
# It seems a little suspicious that schools can be
# that diverse. Let's look at the raw percentages
# from these top two schools. We will use .loc to 
# select rows based on the index label:
college_ugds.loc[
    [
        "Regency Beauty Institute-Austin",
        "Central Texas Beauty College-Temple",
    ]
]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,...,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,...,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,...,0.0,0.1515


In [230]:
us_news_top = [
    "Rutgers University-Newark",
    "Andrews University",
    "Stanford University",
    "University of Houston",
    "University of Nevada-Las Vegas",
    
]


diversity_metric.loc[us_news_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64

In [231]:
# Alternatively, we can find the schools that are 
# least diverse by ordering them by their
# maximum race percentage:
(
    college_ugds.max(axis=1)
    .sort_values(ascending=False)
    .head(10)
)

INSTNM
Caribbean University-Ponce                                        1.0
Brighton Institute of Cosmetology                                 1.0
Mesivta Torah Vodaath Rabbinical Seminary                         1.0
Rabbinical College Telshe                                         1.0
University of Puerto Rico-Mayaguez                                1.0
Haskell Indian Nations University                                 1.0
Lake Career and Technical Center                                  1.0
Leon Studio One School of Hair Design & Career Training Center    1.0
Dewey University-Hato Rey                                         1.0
Columbia Central University-Caguas                                1.0
dtype: float64

In [232]:
# We can also determine if any school has all nine 
# race categories exceeding 1%:
(college_ugds > 0.01).all(axis=1).any()

True

In [2]:
import httpx

In [5]:
r = httpx.get('http://www.kobo360.com')

In [6]:
r

<Response [301 Moved Permanently]>

In [7]:
r.status_code

301

In [8]:
r.headers['content-type']

'text/html'

In [9]:
r.text

'<html>\r\n<head><title>301 Moved Permanently</title></head>\r\n<body bgcolor="white">\r\n<center><h1>301 Moved Permanently</h1></center>\r\n<hr><center>CloudFront</center>\r\n</body>\r\n</html>\r\n'