In [4]:
import pandas as pd
import numpy as np
from IPython.display import display

# Recipes
* [Dissecting the anatomy of a DataFrame](#Dissecting-the-anatomy-of-a-DataFrame)
* [Accessing the main DataFrame components](#Accessing-the-main-DataFrame-components)
* [Understanding data types](#Understanding-data-types)
* [Selecting a single Series](#Selecting-a-single-Series)
* [Calling Series methods](#Calling-Series-methods)
* [Working with operators on a Series](#Working-with-operators-on-a-Series)
* [Chaining Series methods together](#Chaining-Series-methods-together)
* [Making a meaningful index](#Making-a-meaningful-index)
* [Renaming row and column labels](#Renaming-row-and-column-labels)
* [Creating and deleting columns](#Creating-and-deleting-columns)

## Change options for each recipe

In [74]:
# Limit how much of a DataFrame/Series is rendered in each recipe.
# The fully qualified 'display.*' names are used on purpose: short patterns
# such as 'max_rows' are resolved by a regex search over all registered
# options and raise OptionError as soon as more than one option matches
# (newer pandas releases also register 'styler.render.max_rows' and
# 'styler.render.max_columns').
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 10)

# Dissecting the anatomy of a DataFrame

In [5]:
movie = pd.read_csv('data/movie.csv')
movie.head(20)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


![dataframe anatomy](./images/ch01_dataframe_anatomy.png)

# Accessing the main DataFrame components

In [9]:
columns = movie.columns
index = movie.index
data = movie.values

In [10]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [79]:
index

RangeIndex(start=0, stop=4916, step=1)

In [10]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [11]:
issubclass(pd.RangeIndex, pd.Index)

True

## There's more

In [12]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915])

In [13]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

# Understanding data types

In [11]:
movie = pd.read_csv('data/movie.csv')

In [12]:
movie.dtypes

color                         object
director_name                 object
num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_2_name                  object
actor_1_facebook_likes       float64
gross                        float64
genres                        object
actor_1_name                  object
movie_title                   object
num_voted_users                int64
cast_total_facebook_likes      int64
actor_3_name                  object
facenumber_in_poster         float64
plot_keywords                 object
movie_imdb_link               object
num_user_for_reviews         float64
language                      object
country                       object
content_rating                object
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object

In [16]:
# Tally how many columns there are of each data type.
# DataFrame.get_dtype_counts() was deprecated and later removed from pandas;
# counting the values of .dtypes gives the same tally on every version
# (rows are ordered by count instead of by dtype name).
movie.dtypes.value_counts()

float64    13
int64       3
object     12
dtype: int64

# Selecting a single Series

In [17]:
movie = pd.read_csv('data/movie.csv')

In [18]:
movie['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [19]:
type(movie['director_name'])

pandas.core.series.Series

## There's more

In [20]:
director = movie['director_name'] # save Series to variable
director.name

'director_name'

In [21]:
director.to_frame().head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


# Calling Series methods

## Getting ready...

In [22]:
# Gather the name of every attribute and method exposed by the Series class
# into a set so it can be compared with DataFrame's later on.
s_attr_methods = {attr_name for attr_name in dir(pd.Series)}
len(s_attr_methods)

463

In [23]:
# Same collection for the DataFrame class: one set entry per attribute or
# method name reported by dir().
df_attr_methods = {attr_name for attr_name in dir(pd.DataFrame)}
len(df_attr_methods)

459

In [24]:
len(s_attr_methods & df_attr_methods)

398

## How to do it...

In [25]:
movie = pd.read_csv('data/movie.csv')

In [26]:
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [27]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [28]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [29]:
# Temporarily show at most 8 rows while displaying the director frequencies.
# The fully qualified option name is required: the short pattern 'max_rows'
# is resolved by regex search and raises OptionError when more than one
# registered option matches it (e.g. 'styler.render.max_rows' in newer pandas).
with pd.option_context('display.max_rows', 8):
    display(director.value_counts())

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
                    ..
Kevin Hamedani       1
Daryl Wein           1
John Guillermin      1
James O'Brien        1
Name: director_name, Length: 2397, dtype: int64

In [30]:
# Show at most 8 rows from here on. Use the full option name; the short
# pattern 'max_rows' is ambiguous (and raises OptionError) on pandas
# versions that register more than one option containing it.
pd.set_option('display.max_rows', 8)
actor_1_fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
          ... 
216.0        1
859.0        1
225.0        1
334.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [31]:
director.size

4916

In [32]:
director.shape

(4916,)

In [33]:
len(director)

4916

In [34]:
director.count()

4814

In [35]:
actor_1_fb_likes.count()

4909

In [36]:
actor_1_fb_likes.quantile()

982.0

In [37]:
# Basic summary statistics of the lead actor's Facebook likes, returned as a
# single tuple: (min, max, mean, median, std, sum).
(
    actor_1_fb_likes.min(),
    actor_1_fb_likes.max(),
    actor_1_fb_likes.mean(),
    actor_1_fb_likes.median(),
    actor_1_fb_likes.std(),
    actor_1_fb_likes.sum(),
)

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848185, 31881444.0)

In [38]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [39]:
actor_1_fb_likes.quantile(.2)

510.0

In [40]:
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
        ...   
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, Length: 9, dtype: float64

In [41]:
director.isnull()

0       False
1       False
2       False
3       False
        ...  
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [42]:
actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)
actor_1_fb_likes_filled.count()

4916

In [43]:
actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()
actor_1_fb_likes_dropped.size

4909

## There's more...

In [44]:
director.value_counts(normalize=True)

Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
                      ...   
Kevin Hamedani      0.000208
Daryl Wein          0.000208
John Guillermin     0.000208
James O'Brien       0.000208
Name: director_name, Length: 2397, dtype: float64

In [45]:
director.hasnans

True

In [46]:
director.notnull()

0        True
1        True
2        True
3        True
        ...  
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

In [47]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

# Working with operators on a Series

In [48]:
pd.options.display.max_rows = 6

In [49]:
5 + 9    # plus operator example. Adds 5 and 9

14

In [50]:
4 ** 2   # exponentiation operator. Raises 4 to the second power

16

In [51]:
a = 10   # assignment operator.

In [52]:
5 <= 9   # less than or equal to operator

True

In [53]:
'abcde' + 'fg'    # plus operator for strings. Concatenates the two strings

'abcdefg'

In [54]:
not (5 <= 9)      # not is an operator that is a reserved keyword; it reverses a boolean

False

In [55]:
7 in [1, 2, 6]    # in operator checks for membership of a list

False

In [56]:
set([1,2,3]) & set([2,3,4])

{2, 3}

In [57]:
# Lists do not support arithmetic with an integer. The TypeError is the point
# of this example, so it is caught and shown rather than left uncaught, which
# would abort a "Restart & Run All" of the notebook at this cell.
try:
    [1, 2, 3] - 3
except TypeError as err:
    list_minus_int_error = 'TypeError: ' + str(err)
    print(list_minus_int_error)

TypeError: unsupported operand type(s) for -: 'list' and 'int'

In [None]:
a = {1, 2, 3}
# The indexing operator does not work with sets. The resulting TypeError is
# the demonstration, so catch and show it instead of letting it stop the
# notebook when every cell is run in order.
try:
    a[0]
except TypeError as err:
    set_index_error = 'TypeError: ' + str(err)
    print(set_index_error)

## Getting ready...

In [None]:
movie = pd.read_csv('data/movie.csv')
imdb_score = movie['imdb_score']
imdb_score

In [None]:
imdb_score + 1

In [None]:
imdb_score * 2.5

In [None]:
imdb_score // 7

In [None]:
imdb_score > 7

In [None]:
director = movie['director_name']

In [None]:
director == 'James Cameron'

## There's more...

In [None]:
imdb_score.add(1)              # imdb_score + 1

In [None]:
imdb_score.mul(2.5)            # imdb_score * 2.5

In [None]:
imdb_score.floordiv(7)         # imdb_score // 7

In [None]:
imdb_score.gt(7)               # imdb_score > 7

In [None]:
director.eq('James Cameron')   # director == 'James Cameron'

In [None]:
imdb_score.astype(int).mod(5)

In [None]:
a = type(1)

In [None]:
type(a)

In [None]:
a = type(imdb_score)

In [None]:
a([1,2,3])

# Chaining Series methods together

In [None]:
movie = pd.read_csv('data/movie.csv')
actor_1_fb_likes = movie['actor_1_facebook_likes']
director = movie['director_name']

In [None]:
director.value_counts().head(3)

In [None]:
actor_1_fb_likes.isnull().sum()

In [None]:
actor_1_fb_likes.dtype

In [None]:
actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

## There's more...

In [None]:
actor_1_fb_likes.isnull().mean()

In [None]:
(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

# Making a meaningful index

In [None]:
movie = pd.read_csv('data/movie.csv')

In [None]:
movie.shape

In [None]:
movie2 = movie.set_index('movie_title')
movie2

In [None]:
pd.read_csv('data/movie.csv', index_col='movie_title')

## There's more...

In [None]:
movie2.reset_index()

# Renaming row and column labels

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')

In [None]:
# Old label -> new label mappings passed to DataFrame.rename:
# one for row (index) labels and one for column labels.
indexes_renamed = dict(Avatar='Ratava', Spectre='Ertceps')
columns_renamed = dict(
    director_name='Director Name',
    num_critic_for_reviews='Critical Reviews',
)

In [None]:
movie.rename(index=indexes_renamed, columns=columns_renamed).head()

## There's more

In [None]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
index = movie.index
columns = movie.columns

index_list = index.tolist()
column_list = columns.tolist()

index_list[0] = 'Ratava'
index_list[2] = 'Ertceps'
column_list[1] = 'Director Name'
column_list[2] = 'Critical Reviews'

In [None]:
print(index_list[:5])

In [None]:
print(column_list)

In [None]:
movie.index = index_list
movie.columns = column_list

In [None]:
movie.head()

# Creating and deleting columns

In [None]:
movie = pd.read_csv('data/movie.csv')

In [None]:
movie['has_seen'] = 0

In [None]:
movie.columns

In [58]:
movie['actor_director_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                              movie['actor_2_facebook_likes'] + 
                                              movie['actor_3_facebook_likes'] + 
                                              movie['director_facebook_likes'])

In [59]:
movie['actor_director_facebook_likes'].isnull().sum()

122

In [60]:
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [61]:
movie['is_cast_likes_more'] = (movie['cast_total_facebook_likes'] >= 
                                  movie['actor_director_facebook_likes'])

In [62]:
movie['is_cast_likes_more'].all()

False

In [63]:
movie = movie.drop('actor_director_facebook_likes', axis='columns')

In [64]:
movie['actor_total_facebook_likes'] = (movie['actor_1_facebook_likes'] + 
                                       movie['actor_2_facebook_likes'] + 
                                       movie['actor_3_facebook_likes'])

movie['actor_total_facebook_likes'] = movie['actor_total_facebook_likes'].fillna(0)

In [65]:
# Flag the rows where the cast total is at least the sum of the three
# individual actor totals, then check whether that holds for every row.
movie['is_cast_likes_more'] = (
    movie['cast_total_facebook_likes'] >= movie['actor_total_facebook_likes']
)

movie['is_cast_likes_more'].all()

True

In [66]:
movie['pct_actor_cast_like'] = (movie['actor_total_facebook_likes'] / 
                                movie['cast_total_facebook_likes'])

In [67]:
movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max() 

(0.0, 1.0)

## There's more...

In [68]:
profit_index = movie.columns.get_loc('gross') + 1

In [69]:
# Insert 'profit' (gross minus budget) directly after the 'gross' column.
# Use the computed profit_index rather than a hard-coded 8: 8 is the position
# of 'gross' itself, so inserting there would put 'profit' *before* 'gross'
# and would silently go stale if the column order ever changed.
# The membership guard keeps the cell re-runnable; DataFrame.insert raises
# ValueError if the column already exists.
if 'profit' not in movie.columns:
    movie.insert(loc=profit_index, column='profit',
                 value=movie['gross'] - movie['budget'])

In [70]:
movie

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,movie_facebook_likes,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,...,33000,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,...,85000,True,11554.0,0.987521
...,...,...,...,...,...,...,...,...,...
4913,Color,Benjamin Roberds,13.0,76.0,...,16,True,0.0,
4914,Color,Daniel Hsia,14.0,100.0,...,660,True,2154.0,0.902766
4915,Color,Jon Gunn,43.0,90.0,...,456,True,125.0,0.766871


In [71]:
movie

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,movie_facebook_likes,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
0,Color,James Cameron,723.0,178.0,...,33000,True,2791.0,0.577369
1,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
2,Color,Sam Mendes,602.0,148.0,...,85000,True,11554.0,0.987521
...,...,...,...,...,...,...,...,...,...
4913,Color,Benjamin Roberds,13.0,76.0,...,16,True,0.0,
4914,Color,Daniel Hsia,14.0,100.0,...,660,True,2154.0,0.902766
4915,Color,Jon Gunn,43.0,90.0,...,456,True,125.0,0.766871
