# Pandas Foundations

In [159]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn-whitegrid')

## Introduction

## Dissecting the anatomy of a DataFrame

In [160]:
movies = pd.read_csv('datasets/movie.csv')
movies.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [161]:
movies.shape

(5043, 28)

## DataFrame Attributes

### How to do it... {#how-to-do-it-1}

In [162]:
movies = pd.read_csv('datasets/movie.csv')
columns = movies.columns
index = movies.index
data = movies.values

In [163]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [164]:
index

RangeIndex(start=0, stop=5043, step=1)

In [165]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [166]:
type(index)

pandas.core.indexes.range.RangeIndex

In [167]:
type(columns)

pandas.core.indexes.base.Index

In [168]:
type(data)

numpy.ndarray

In [169]:
issubclass(pd.RangeIndex, pd.Index)

True

### There's more

In [170]:
index.values

array([   0,    1,    2, ..., 5040, 5041, 5042], dtype=int64)

In [171]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

## Understanding data types

### How to do it... {#how-to-do-it-2}

In [172]:
movies.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
m

In [173]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      5024 non-null   object 
 1   director_name              4939 non-null   object 
 2   num_critic_for_reviews     4993 non-null   float64
 3   duration                   5028 non-null   float64
 4   director_facebook_likes    4939 non-null   float64
 5   actor_3_facebook_likes     5020 non-null   float64
 6   actor_2_name               5030 non-null   object 
 7   actor_1_facebook_likes     5036 non-null   float64
 8   gross                      4159 non-null   float64
 9   genres                     5043 non-null   object 
 10  actor_1_name               5036 non-null   object 
 11  movie_title                5043 non-null   object 
 12  num_voted_users            5043 non-null   int64  
 13  cast_total_facebook_likes  5043 non-null   int64

In [207]:
movies.describe().T  # include=[np.number]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
num_critic_for_reviews,4993.0,140.1943,121.6017,1.0,50.0,110.0,195.0,813.0
duration,5028.0,107.2011,25.19744,7.0,93.0,103.0,118.0,511.0
director_facebook_likes,4939.0,686.5092,2813.329,0.0,7.0,49.0,194.5,23000.0
actor_3_facebook_likes,5020.0,645.0098,1665.042,0.0,133.0,371.5,636.0,23000.0
actor_1_facebook_likes,5036.0,6560.047,15020.76,0.0,614.0,988.0,11000.0,640000.0
gross,4159.0,48468410.0,68452990.0,162.0,5340987.5,25517500.0,62309437.5,760505800.0
num_voted_users,5043.0,83668.16,138485.3,5.0,8593.5,34359.0,96309.0,1689764.0
cast_total_facebook_likes,5043.0,9699.064,18163.8,0.0,1411.0,3090.0,13756.5,656730.0
facenumber_in_poster,5030.0,1.371173,2.013576,0.0,0.0,1.0,2.0,43.0
num_user_for_reviews,5022.0,272.7708,377.9829,1.0,65.0,156.0,326.0,5060.0


In [175]:
movies.describe(include=[np.object, pd.Categorical]).T

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  movies.describe(include=[np.object, pd.Categorical]).T


Unnamed: 0,count,unique,top,freq
color,5024,2,Color,4815
director_name,4939,2398,Steven Spielberg,26
actor_2_name,5030,3032,Morgan Freeman,20
genres,5043,914,Drama,236
actor_1_name,5036,2097,Robert De Niro,49
movie_title,5043,4917,Ben-Hur,3
actor_3_name,5020,3521,John Heard,8
plot_keywords,4890,4760,based on novel,4
movie_imdb_link,5043,4919,http://www.imdb.com/title/tt0232500/?ref_=fn_t...,3
language,5031,47,English,4704


In [176]:
movies.describe(include=[np.number],
   percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99]).T

# movies.select_dtypes(include='number').head()
# movies.select_dtypes(include=['int', 'object']).head()

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
num_critic_for_reviews,4993.0,140.1943,121.6017,1.0,2.0,9.0,17.2,50.0,110.0,195.0,299.8,387.0,548.08,813.0
duration,5028.0,107.2011,25.19744,7.0,43.0,81.0,86.0,93.0,103.0,118.0,134.0,146.0,189.0,511.0
director_facebook_likes,4939.0,686.5092,2813.329,0.0,0.0,0.0,0.0,7.0,49.0,194.5,545.0,973.0,16000.0,23000.0
actor_3_facebook_likes,5020.0,645.0098,1665.042,0.0,0.0,10.0,34.0,133.0,371.5,636.0,897.0,1000.0,11000.0,23000.0
actor_1_facebook_likes,5036.0,6560.047,15020.76,0.0,7.0,95.5,248.5,614.0,988.0,11000.0,18500.0,24000.0,44000.0,640000.0
gross,4159.0,48468410.0,68452990.0,162.0,8497.8,99034.0,382601.8,5340987.5,25517500.0,62309437.5,125025200.0,180029700.0,333573600.0,760505800.0
num_voted_users,5043.0,83668.16,138485.3,5.0,54.26,514.6,1643.4,8593.5,34359.0,96309.0,216562.0,332254.9,681094.3,1689764.0
cast_total_facebook_likes,5043.0,9699.064,18163.8,0.0,6.42,179.0,525.2,1411.0,3090.0,13756.5,25914.4,36927.7,63027.24,656730.0
facenumber_in_poster,5030.0,1.371173,2.013576,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,5.0,8.0,43.0
num_user_for_reviews,5022.0,272.7708,377.9829,1.0,1.21,10.0,21.1,65.0,156.0,326.0,630.0,907.8,2010.11,5060.0


## Reducing memory by changing data types

### How to do it...

In [177]:
college = pd.read_csv('datasets/college.csv')
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER',
   'INSTNM', 'STABBR']
col2 = college.loc[:, different_cols]
col2.head()

Unnamed: 0,RELAFFIL,SATMTMID,CURROPER,INSTNM,STABBR
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [178]:
col2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [179]:
original_mem = col2.memory_usage(deep=True)
original_mem

Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660216
STABBR      444565
dtype: int64

In [180]:
col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)    

In [181]:
col2.dtypes

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [182]:
college[different_cols].memory_usage(deep=True)

Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660216
STABBR      444565
dtype: int64

In [183]:
col2.select_dtypes(include=['object']).nunique()

INSTNM    7535
STABBR      59
dtype: int64

In [184]:
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

In [185]:
new_mem = col2.memory_usage(deep=True)
new_mem

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660641
STABBR       13120
dtype: int64

In [186]:
new_mem / original_mem

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000644
STABBR      0.029512
dtype: float64

### NaN value

In [187]:
pd.Series(['Paul', np.nan, 'George']).info()

<class 'pandas.core.series.Series'>
RangeIndex: 3 entries, 0 to 2
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
2 non-null      object
dtypes: object(1)
memory usage: 152.0+ bytes


### There's more...

In [188]:
college.loc[0, 'CURROPER'] = 10000000
college.loc[0, 'INSTNM'] = college.loc[0, 'INSTNM'] + 'a'
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

Index          128
CURROPER     60280
INSTNM      660642
dtype: int64

In [189]:
college['MENONLY'].dtype

dtype('float64')

In [190]:
#college['MENONLY'].astype('int8')
college['MENONLY'].astype('float16')

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
7530    NaN
7531    NaN
7532    NaN
7533    NaN
7534    NaN
Name: MENONLY, Length: 7535, dtype: float16

In [191]:
college.assign(MENONLY=college['MENONLY'].astype('float16'),
    RELAFFIL=college['RELAFFIL'].astype('int8'))

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M Universitya,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0000,0.0059,0.0138,0.0656,10000000,0.7356,0.8284,0.1049,30300,33888
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.0100,0.2607,1,0.3460,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0000,0.0000,0.2715,0.4536,1,0.6801,0.7795,0.8540,40100,23370
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.0350,0.2146,1,0.3072,0.4596,0.2640,45500,24097
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.1270,26600,33118.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7530,SAE Institute of Technology San Francisco,Emeryville,CA,,,,1,,,,...,,,,,1,,,,,9500
7531,Rasmussen College - Overland Park,Overland Park,KS,,,,1,,,,...,,,,,1,,,,,21163
7532,National Personal Training Institute of Cleveland,Highland Heights,OH,,,,1,,,,...,,,,,1,,,,,6333
7533,Bay Area Medical Academy - San Jose Satellite ...,San Jose,CA,,,,1,,,,...,,,,,1,,,,,PrivacySuppressed


In [192]:
college.index = pd.Int64Index(college.index)
college.index.memory_usage() # previously was just 80

  college.index = pd.Int64Index(college.index)


60280

## Selecting the smallest of the largest

### How to do it...

In [193]:
movie = pd.read_csv('datasets/movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'budget']]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens ...,7.1,


In [194]:
movie2.nlargest(100, 'imdb_score').head()

Unnamed: 0,movie_title,imdb_score,budget
2765,Towering Inferno,9.5,
1937,The Shawshank Redemption,9.3,25000000.0
3466,The Godfather,9.2,6000000.0
2824,Dekalog,9.1,
3207,Dekalog,9.1,


In [195]:
(movie2
  .nlargest(100, 'imdb_score')
  .nsmallest(5, 'budget')
)

Unnamed: 0,movie_title,imdb_score,budget
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0
2242,Psycho,8.5,806947.0


## Selecting the largest of each group by sorting

### How to do it...

In [196]:
movie = pd.read_csv('datasets/movie.csv')
movie[['movie_title', 'title_year', 'imdb_score']]

Unnamed: 0,movie_title,title_year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates of the Caribbean: At World's End,2007.0,7.1
2,Spectre,2015.0,6.8
3,The Dark Knight Rises,2012.0,8.5
4,Star Wars: Episode VII - The Force Awakens ...,,7.1
...,...,...,...
5038,Signed Sealed Delivered,2013.0,7.7
5039,The Following,,7.5
5040,A Plague So Pleasant,2013.0,6.3
5041,Shanghai Calling,2012.0,6.3


In [197]:
(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .sort_values('title_year', ascending=False)
)

Unnamed: 0,movie_title,title_year,imdb_score
2366,Fight Valley,2016.0,5.0
3817,Yoga Hosers,2016.0,4.8
1367,The 5th Wave,2016.0,5.2
1742,The Boss,2016.0,5.3
519,The Secret Life of Pets,2016.0,6.8
...,...,...,...
4798,Heroes,,7.7
4803,Home Movies,,8.2
4819,Revolution,,6.7
4869,Happy Valley,,8.5


In [198]:
(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .sort_values(['title_year','imdb_score'],
               ascending=False)
)

Unnamed: 0,movie_title,title_year,imdb_score
4409,Kickboxer: Vengeance,2016.0,9.1
4372,A Beginner's Guide to Snuff,2016.0,8.7
3870,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2
...,...,...,...
1404,Rush Hour,,5.8
4113,Creature,,5.0
2191,Meet the Browns,,3.5
3303,The Bold and the Beautiful,,3.5


In [199]:
(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .sort_values(['title_year','imdb_score'],
               ascending=False)
  .drop_duplicates(subset='title_year')
)

Unnamed: 0,movie_title,title_year,imdb_score
4409,Kickboxer: Vengeance,2016.0,9.1
3816,Running Forever,2015.0,8.6
4468,Queen of the Mountains,2014.0,8.7
4017,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5
...,...,...,...
2734,Metropolis,1927.0,8.3
4885,The Big Parade,1925.0,8.3
4958,Over the Hill to the Poorhouse,1920.0,4.8
4810,Intolerance: Love's Struggle Throughout the Ages,1916.0,8.0


## There's more...

In [200]:
(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .groupby('title_year', as_index=False)
  .apply(lambda df: df.sort_values('imdb_score',
         ascending=False).head(1))
  .sort_values('title_year', ascending=False)
)

Unnamed: 0,Unnamed: 1,movie_title,title_year,imdb_score
90,4409,Kickboxer: Vengeance,2016.0,9.1
89,3816,Running Forever,2015.0,8.6
88,4468,Queen of the Mountains,2014.0,8.7
87,4017,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
86,3,The Dark Knight Rises,2012.0,8.5
...,...,...,...,...
4,4664,Pandora's Box,1929.0,8.0
3,2734,Metropolis,1927.0,8.3
2,4885,The Big Parade,1925.0,8.3
1,4958,Over the Hill to the Poorhouse,1920.0,4.8


In [201]:
(movie
  [['movie_title', 'title_year',
    'content_rating', 'budget']]
   .sort_values(['title_year',
       'content_rating', 'budget'],
       ascending=[False, False, True])
   .drop_duplicates(subset=['title_year',
        'content_rating'])
)

Unnamed: 0,movie_title,title_year,content_rating,budget
4108,Compadres,2016.0,R,3000000.0
4772,Fight to the Finish,2016.0,PG-13,150000.0
4775,Rodeo Girl,2016.0,PG,500000.0
3309,The Wailing,2016.0,Not Rated,
4773,Alleluia! The Devil's Carnival,2016.0,,500000.0
...,...,...,...,...
2594,Lilyhammer,,TV-MA,34000000.0
816,"Sabrina, the Teenage Witch",,TV-G,3000000.0
857,Stargate SG-1,,TV-14,1400000.0
2466,Carlos,,Not Rated,


## Replicating nlargest with sort_values

### How to do it...

In [202]:
movie = pd.read_csv('datasets/movie.csv')
(movie
   [['movie_title', 'imdb_score', 'budget']]
   .nlargest(100, 'imdb_score') 
   .nsmallest(5, 'budget')
)

Unnamed: 0,movie_title,imdb_score,budget
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0
2242,Psycho,8.5,806947.0


In [203]:
(movie
   [['movie_title', 'imdb_score', 'budget']]
   .sort_values('imdb_score', ascending=False)
   .head(100)
)

Unnamed: 0,movie_title,imdb_score,budget
2765,Towering Inferno,9.5,
1937,The Shawshank Redemption,9.3,25000000.0
3466,The Godfather,9.2,6000000.0
4409,Kickboxer: Vengeance,9.1,17000000.0
2824,Dekalog,9.1,
...,...,...,...
3849,Requiem for a Dream,8.4,4500000.0
3623,Veronica Mars,8.4,
3685,Rang De Basanti,8.4,
2362,The Shining,8.4,19000000.0


In [204]:
(movie
   [['movie_title', 'imdb_score', 'budget']]
   .sort_values('imdb_score', ascending=False)
   .head(100) 
   .sort_values('budget')
   .head(5)
)

Unnamed: 0,movie_title,imdb_score,budget
4937,A Charlie Brown Christmas,8.4,150000.0
4924,Butterfly Girl,8.7,180000.0
4921,Children of Heaven,8.5,180000.0
4822,12 Angry Men,8.9,350000.0
4659,A Separation,8.4,500000.0


### How it works...

In [205]:
(movie
   [['movie_title', 'imdb_score', 'budget']]
   .nlargest(100, 'imdb_score')
   .tail()
)

Unnamed: 0,movie_title,imdb_score,budget
4017,"Batman: The Dark Knight Returns, Part 2",8.4,3500000.0
4105,Oldboy,8.4,3000000.0
4253,To Kill a Mockingbird,8.4,2000000.0
4496,Reservoir Dogs,8.4,1200000.0
4659,A Separation,8.4,500000.0


In [206]:
(movie
   [['movie_title', 'imdb_score', 'budget']]
   .sort_values('imdb_score', ascending=False) 
   .head(100)
   .tail()
)

Unnamed: 0,movie_title,imdb_score,budget
3849,Requiem for a Dream,8.4,4500000.0
3623,Veronica Mars,8.4,
3685,Rang De Basanti,8.4,
2362,The Shining,8.4,19000000.0
3807,Psych,8.4,


## Selecting a Column

### How to do it... {#how-to-do-it-3}

In [48]:
movies['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
5038          Scott Smith
5039                  NaN
5040     Benjamin Roberds
5041          Daniel Hsia
5042             Jon Gunn
Name: director_name, Length: 5043, dtype: object

In [49]:
movies.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
5038          Scott Smith
5039                  NaN
5040     Benjamin Roberds
5041          Daniel Hsia
5042             Jon Gunn
Name: director_name, Length: 5043, dtype: object

In [50]:
movies.loc[:, 'director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
5038          Scott Smith
5039                  NaN
5040     Benjamin Roberds
5041          Daniel Hsia
5042             Jon Gunn
Name: director_name, Length: 5043, dtype: object

In [51]:
movies.iloc[:, 1]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
5038          Scott Smith
5039                  NaN
5040     Benjamin Roberds
5041          Daniel Hsia
5042             Jon Gunn
Name: director_name, Length: 5043, dtype: object

In [52]:
movies['director_name'].index

RangeIndex(start=0, stop=5043, step=1)

In [53]:
movies['director_name'].dtype

dtype('O')

In [54]:
movies['director_name'].size

5043

In [55]:
movies['director_name'].name

'director_name'

In [56]:
type(movies['director_name'])

pandas.core.series.Series

In [57]:
movies['director_name'].apply(type).unique()

array([<class 'str'>, <class 'float'>], dtype=object)

In [58]:
idx = movies['duration'].idxmax()
movies.iloc[idx,:10]



color                                         Color
director_name                                   NaN
num_critic_for_reviews                         16.0
duration                                      511.0
director_facebook_likes                         NaN
actor_3_facebook_likes                         51.0
actor_2_name               Ingvar Eggert Sigurðsson
actor_1_facebook_likes                        147.0
gross                                           NaN
genres                         Crime|Drama|Thriller
Name: 1710, dtype: object

## Calling Series Methods

In [59]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

422

In [60]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

434

In [61]:
#len(s_attr_methods & df_attr_methods)
len(s_attr_methods.intersection(df_attr_methods))

367

### How to do it... {#how-to-do-it-4}

In [62]:
movies = pd.read_csv('datasets/movie.csv')
director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']

In [63]:
print(director.dtype)
print(fb_likes.dtype)

object
float64


In [64]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [65]:
director.tail()

5038         Scott Smith
5039                 NaN
5040    Benjamin Roberds
5041         Daniel Hsia
5042            Jon Gunn
Name: director_name, dtype: object

In [66]:
director.sample(n=5, random_state=42)

4943              Sai Varadan
1919          Craig Gillespie
1049            James Mangold
4697    Jennifer Wynne Farmer
3312            Dario Argento
Name: director_name, dtype: object

In [67]:
fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [68]:
director.value_counts()

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
Ridley Scott        17
                    ..
John Crowley         1
Rob Pritts           1
David S. Ward        1
R.J. Cutler          1
Daniel Hsia          1
Name: director_name, Length: 2398, dtype: int64

In [69]:
fb_likes.value_counts()

1000.0     449
11000.0    211
2000.0     197
3000.0     155
12000.0    135
          ... 
274.0        1
175.0        1
961.0        1
230.0        1
291.0        1
Name: actor_1_facebook_likes, Length: 878, dtype: int64

In [70]:
director.size

5043

In [71]:
director.shape

(5043,)

In [72]:
len(director)

5043

In [73]:
director.unique()

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'], dtype=object)

In [74]:
director.count()
# ?pd.DataFrame.count

4939

In [75]:
fb_likes.count()

5036

In [76]:
import numpy as np
q = np.arange(0,1.05,0.05)
fb_likes.quantile(q)

# fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.00         0.0
0.05        95.5
0.10       248.5
0.15       403.0
0.20       522.0
0.25       614.0
0.30       700.0
0.35       786.0
0.40       862.0
0.45       931.0
0.50       988.0
0.55      1000.0
0.60      2000.0
0.65      3000.0
0.70      8000.0
0.75     11000.0
0.80     13000.0
0.85     15000.0
0.90     18500.0
0.95     24000.0
1.00    640000.0
Name: actor_1_facebook_likes, dtype: float64

In [77]:
fb_likes.min()

0.0

In [78]:
fb_likes.max()

640000.0

In [79]:
fb_likes.mean()

6560.04706115965

In [80]:
fb_likes.median()

988.0

In [81]:
fb_likes.std()

15020.759119984092

In [82]:
fb_likes.describe()

count      5036.000000
mean       6560.047061
std       15020.759120
min           0.000000
25%         614.000000
50%         988.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [83]:
director.describe()

count                 4939
unique                2398
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [84]:
director.isna()

0       False
1       False
2       False
3       False
4       False
        ...  
5038    False
5039     True
5040    False
5041    False
5042    False
Name: director_name, Length: 5043, dtype: bool

In [85]:
print(f"is not nan counts = {fb_likes.count()}")

fb_likes_filled = fb_likes.fillna(0)
fb_likes_filled.count()

is not nan counts = 5036


5043

In [86]:
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

5036

### There's more...

In [87]:
director.value_counts(normalize=True)

Steven Spielberg    0.005264
Woody Allen         0.004454
Clint Eastwood      0.004049
Martin Scorsese     0.004049
Ridley Scott        0.003442
                      ...   
John Crowley        0.000202
Rob Pritts          0.000202
David S. Ward       0.000202
R.J. Cutler         0.000202
Daniel Hsia         0.000202
Name: director_name, Length: 2398, dtype: float64

In [88]:
director.hasnans

True

In [89]:
director.notna()

0        True
1        True
2        True
3        True
4        True
        ...  
5038     True
5039    False
5040     True
5041     True
5042     True
Name: director_name, Length: 5043, dtype: bool

## Series Operations

In [90]:
5 + 9    # plus operator example. Adds 5 and 9

14

### How to do it... {#how-to-do-it-5}

In [91]:
movies = pd.read_csv('datasets/movie.csv')
imdb_score = movies['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
5038    7.7
5039    7.5
5040    6.3
5041    6.3
5042    6.6
Name: imdb_score, Length: 5043, dtype: float64

In [92]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
5038    8.7
5039    8.5
5040    7.3
5041    7.3
5042    7.6
Name: imdb_score, Length: 5043, dtype: float64

In [93]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
5038    19.25
5039    18.75
5040    15.75
5041    15.75
5042    16.50
Name: imdb_score, Length: 5043, dtype: float64

In [94]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
5038    1.0
5039    1.0
5040    0.0
5041    0.0
5042    0.0
Name: imdb_score, Length: 5043, dtype: float64

In [95]:
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
5038     True
5039     True
5040    False
5041    False
5042    False
Name: imdb_score, Length: 5043, dtype: bool

In [96]:
director = movies['director_name']
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
5038    False
5039    False
5040    False
5041    False
5042    False
Name: director_name, Length: 5043, dtype: bool

### There's more...

In [97]:
imdb_score.add(1)   # imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
5038    8.7
5039    8.5
5040    7.3
5041    7.3
5042    7.6
Name: imdb_score, Length: 5043, dtype: float64

In [98]:
imdb_score.gt(7)   # imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
5038     True
5039     True
5040    False
5041    False
5042    False
Name: imdb_score, Length: 5043, dtype: bool

## Chaining Series Methods

### How to do it... {#how-to-do-it-6}

In [99]:
movies = pd.read_csv('datasets/movie.csv')
fb_likes = movies['actor_1_facebook_likes']
director = movies['director_name']

In [100]:
director.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Name: director_name, dtype: int64

In [101]:
fb_likes.isna().sum()

7

In [102]:
fb_likes.dtype

dtype('float64')

In [103]:
(fb_likes.fillna(0)
         .astype(int)
         .head()
)

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

### How it works...

### There's more...

In [104]:
(fb_likes.fillna(0)
         #.astype(int)
         #.head()
)

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
5038      637.0
5039      841.0
5040        0.0
5041      946.0
5042       86.0
Name: actor_1_facebook_likes, Length: 5043, dtype: float64

In [105]:
(fb_likes.fillna(0)
         .astype(int)
         #.head()
)

0        1000
1       40000
2       11000
3       27000
4         131
        ...  
5038      637
5039      841
5040        0
5041      946
5042       86
Name: actor_1_facebook_likes, Length: 5043, dtype: int32

In [106]:
fb_likes.isna().mean()

0.001388062661114416

In [107]:
fb_likes.fillna(0) \
        .astype(int) \
        .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

In [108]:
def debug_df(df):
    print("BEFORE")
    print(df)
    print("AFTER")
    return df

In [109]:
(fb_likes.fillna(0)
         .pipe(debug_df)
         .astype(int) 
         .head()
)

BEFORE
0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
5038      637.0
5039      841.0
5040        0.0
5041      946.0
5042       86.0
Name: actor_1_facebook_likes, Length: 5043, dtype: float64
AFTER


0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

In [110]:
intermediate = None
def get_intermediate(df):
    global intermediate
    intermediate = df
    return df

In [111]:
res = (fb_likes.fillna(0)
         .pipe(get_intermediate)
         .astype(int) 
         .head()
)

In [112]:
intermediate

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
5038      637.0
5039      841.0
5040        0.0
5041      946.0
5042       86.0
Name: actor_1_facebook_likes, Length: 5043, dtype: float64

## Renaming Column Names

### How to do it...

In [113]:
movies = pd.read_csv('datasets/movie.csv')

In [114]:
col_map = {'director_name':'Director Name', 
             'num_critic_for_reviews': 'Critical Reviews'} 

In [115]:
movies.rename(columns=col_map).head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


### How it works... {#how-it-works-8}

### There's more {#theres-more-7}

In [116]:
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}
(movies
   .set_index('movie_title')
   .rename(index=idx_map, columns=col_map)
   .head(3)
)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [117]:
movies = pd.read_csv('datasets/movie.csv', index_col='movie_title')
ids = movies.index.tolist()
columns = movies.columns.tolist()

# rename the row and column labels with list assignments

In [118]:
ids[0] = 'Ratava'
ids[1] = 'POC'
ids[2] = 'Ertceps'
columns[1] = 'director'
columns[-2] = 'aspect'
columns[-1] = 'fblikes'
movies.index = ids
movies.columns = columns

In [119]:
movies.head(3)

Unnamed: 0,color,director,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [120]:
def to_clean(val):
    return val.strip().lower().replace(' ', '_')

In [121]:
movies.rename(columns=to_clean).head(3)

Unnamed: 0,color,director,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


In [122]:
cols = [col.strip().lower().replace(' ', '_')
        for col in movies.columns]
movies.columns = cols
movies.head(3)

Unnamed: 0,color,director,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


## Creating and Deleting columns

### How to do it... {#how-to-do-it-9}

In [123]:
movies = pd.read_csv('datasets/movie.csv')
movies['has_seen'] = 0

In [124]:
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}
(movies
   .rename(index=idx_map, columns=col_map)
   .assign(has_seen=0)
)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes,has_seen
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,12.0,7.1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,English,Canada,,,2013.0,470.0,7.7,,84,0
5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,English,USA,TV-14,,,593.0,7.5,16.00,32000,0
5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,English,USA,,1400.0,2013.0,0.0,6.3,,16,0
5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660,0


In [125]:
total = (movies['actor_1_facebook_likes'] +
         movies['actor_2_facebook_likes'] + 
         movies['actor_3_facebook_likes'] + 
         movies['director_facebook_likes'])

In [126]:
total.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4        NaN
dtype: float64

In [127]:
cols = ['actor_1_facebook_likes','actor_2_facebook_likes',
    'actor_3_facebook_likes','director_facebook_likes']
sum_col = movies[cols].sum(axis='columns')
sum_col.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4      274.0
dtype: float64

In [128]:
movies.assign(total_likes=sum_col).head(5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,total_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,2791.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,46563.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,11554.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,95000.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,274.0


In [129]:
def sum_likes(df):
   return df[[c for c in df.columns
              if 'like' in c]].sum(axis=1)

In [130]:
movies.assign(total_likes=sum_likes).head(5)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,has_seen,total_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,0,40625.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,0,94913.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,0,108254.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,0,365759.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,0,417.0


In [131]:
(movies
   .assign(total_likes=sum_col)
   ['total_likes']
   .isna()
   .sum()
)

0

In [132]:
(movies
   .assign(total_likes=total)
   ['total_likes']
   .isna()
   .sum()
)

124

In [133]:
(movies
   .assign(total_likes=total.fillna(0))
   ['total_likes']
   .isna()
   .sum()
)

0

In [134]:
def cast_like_gt_actor_director(df):
    return df['cast_total_facebook_likes'] >= \
           df['total_likes']

In [135]:
df2 = (movies
   .assign(total_likes=total,
           is_cast_likes_more = cast_like_gt_actor_director)
)

In [136]:
df2['is_cast_likes_more'].all()

False

In [137]:
df2 = df2.drop(columns='total_likes')

In [138]:
actor_sum = (movies
   [[c for c in movies.columns if 'actor_' in c and '_likes' in c]]
   .sum(axis='columns')
)

In [139]:
actor_sum.head(5)

0     2791.0
1    46000.0
2    11554.0
3    73000.0
4      143.0
dtype: float64

In [140]:
movies['cast_total_facebook_likes'] >= actor_sum

0       True
1       True
2       True
3       True
4       True
        ... 
5038    True
5039    True
5040    True
5041    True
5042    True
Length: 5043, dtype: bool

In [141]:
movies['cast_total_facebook_likes'].ge(actor_sum)

0       True
1       True
2       True
3       True
4       True
        ... 
5038    True
5039    True
5040    True
5041    True
5042    True
Length: 5043, dtype: bool

In [142]:
movies['cast_total_facebook_likes'].ge(actor_sum).all()

True

In [143]:
pct_like = (actor_sum
    .div(movies['cast_total_facebook_likes'])
)

In [144]:
pct_like.describe()

count    5010.000000
mean        0.833868
std         0.140133
min         0.300767
25%         0.735823
50%         0.870054
75%         0.954726
max         1.000000
dtype: float64

In [145]:
pd.Series(pct_like.values,
    index=movies['movie_title'].values).head()

Avatar                                                     0.577369
Pirates of the Caribbean: At World's End                   0.951396
Spectre                                                    0.987521
The Dark Knight Rises                                      0.683783
Star Wars: Episode VII - The Force Awakens                 1.000000
dtype: float64

### There's more... {#theres-more-8}

In [146]:
profit_index = movies.columns.get_loc('gross') + 1
profit_index

9

In [147]:
movies.insert(loc=profit_index,
              column='profit',
              value=movies['gross'] - movies['budget'])

In [148]:
del movies['director_name']

## Reading from Zip File

In [149]:
# zf = zipfile.ZipFile("datasets/flights.csv.zip")
# fp = zf.extract(zf.filelist[0].filename, path='datasets/')
# df = pd.read_csv(fp, parse_dates=["FL_DATE"]).rename(columns=str.lower)

# df.info()