In [1]:
import pandas as pd
from pandas import Series, DataFrame
# We can explicitly import Series and DataFrame, why might we do this?

###  Series Review


#### Series from `list`

In [2]:
scores_list = [54, 22, 19, 73, 80]
scores_series = Series(scores_list)
scores_series

# What is the terminology for:  0, 1, 2, ... ??       A:  index
# What is the terminology for:  54, 22, 19, .... ??   A:  value

0    54
1    22
2    19
3    73
4    80
dtype: int64

#### Selecting certain scores.
What are all the scores `> 50`?

In [3]:
scores_series > 50

0     True
1    False
2    False
3     True
4     True
dtype: bool

**Answer:** Boolean indexing. Try the following...

In [4]:
scores_series[[True, True, False, False, True]] # often called a "mask"

0    54
1    22
4    80
dtype: int64

We are really writing a "mask" for our data.

In [5]:
scores_series[scores_series > 50]

0    54
3    73
4    80
dtype: int64

#### Series from `dict`

In [6]:
# Imagine we hire students and track their weekly hours
week1 = Series({"Rita": 5, "Therese": 3, "Janice": 6})
week2 = Series({"Rita": 3, "Therese": 7, "Janice": 4})
week3 = Series({"Therese": 5, "Janice": 5, "Rita": 8}) # Wrong order! Will this matter?
print(week1)
print(week2)
print(week3)

Rita       5
Therese    3
Janice     6
dtype: int64
Rita       3
Therese    7
Janice     4
dtype: int64
Therese    5
Janice     5
Rita       8
dtype: int64


####  For everyone in Week 1, add 3 to their hours 

In [7]:
week1 = week1 + 3
week1

Rita       8
Therese    6
Janice     9
dtype: int64

#### Total up everyone's hours

In [8]:
total_hours = week1 + week2 + week3
total_hours

Janice     18
Rita       19
Therese    18
dtype: int64

#### What is week1 / week3 ?

In [9]:
week1 / week3
# Notice that we didn't have to worry about the order of indices

Janice     1.8
Rita       1.0
Therese    1.2
dtype: float64

#### What type of values are stored in  week1 > week2?

In [10]:
print(week1)
print(week2)
week1 > week2 
# Notice that indices are ordered the same

Rita       8
Therese    6
Janice     9
dtype: int64
Rita       3
Therese    7
Janice     4
dtype: int64


Rita        True
Therese    False
Janice      True
dtype: bool

####  What is week1 > week3?

In [11]:
print(week1)
print(week3)
# week1 > week3 # Does not work (ValueError) because indices are not in same order

# How can we fix this?
week1.sort_index() > week3.sort_index()

Rita       8
Therese    6
Janice     9
dtype: int64
Therese    5
Janice     5
Rita       8
dtype: int64


Janice      True
Rita       False
Therese     True
dtype: bool


# Lecture 28:  Pandas 2 - DataFrames


Learning Objectives:
- Create a DataFrame from 
 - a dictionary of Series, lists, or dicts
 - a list of Series, lists, dicts
- Select a column, row, cell, or rectangular region of a DataFrame
- Convert CSV files into DataFrames and DataFrames into CSV Files
- Access the head or tail of a DataFrame

**Big Idea**: A DataFrame stores 2-dimensional data in a table! It is a collection of Series.

## You can create a DataFrame in a variety of ways!

- dictionary of Series
- dictionary of lists
- dictionary of dictionaries
- list of dictionaries
- list of lists

### From a dictionary of Series

In [12]:
names = Series(["Alice", "Bob", "Cindy", "Dan"])
scores = Series([6, 7, 8, 9])

# to make a dictionary of Series, need to write column names for the keys
DataFrame({
    "Player name": names,
    "Score": scores
})

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### From a dictionary of lists

In [13]:
name_list = ["Alice", "Bob", "Cindy", "Dan"]
score_list = [6, 7, 8, 9]

# this is the same as above, reminding us that Series act like lists
DataFrame({
    "Player name": name_list,
    "Score": score_list
})

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### From a dictionary of dictionaries
We need to make up keys to match the things in each column

In [14]:
data = {
    "Player name": {0: "Alice", 1: "Bob", 2: "Cindy", 3: "Dan"},
    "Score": {0: 6, 1: 7, 2: 8, 3: 9}
}
DataFrame(data)

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### From a list of dicts

In [15]:
data = [
    {"Player name": "Alice", "Score": 6},
    {"Player name": "Bob", "Score": 7},
    {"Player name": "Cindy", "Score": 8},
    {"Player name": "Dan", "Score": 9}
]
DataFrame(data)

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### From a list of lists

In [16]:
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
DataFrame(data)

Unnamed: 0,0,1
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### Explicitly naming the columns
We have to add the column names, we do this with `columns = [name1, name2, ....]` 

In [17]:
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
DataFrame(data, columns=["Player name", "Score"])

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### Explicitly naming the indices
We can use `index = [name1, name2, ...]` to rename the index of each row

In [18]:
data = [
    {"Player name": "Alice", "Score": 6},
    {"Player name": "Bob", "Score": 7},
    {"Player name": "Cindy", "Score": 8},
    {"Player name": "Dan", "Score": 9}
]
DataFrame(data, index = ["A", "B", "C", "D"]) # must have a name for each row

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


### Explicitly naming both the columns and the indices

In [19]:
# You try: 
# Make a DataFrame of 4 people you know with different ages
# Give names to both the columns and rows
ages = [
    ["Hope", 10],
    ["Peace", 7],
    ["Joy", 4],
    ["Love", 11]
]
DataFrame(ages, index = ["A", "B", "C", "D"], columns = ["Name", "Age"])

# Share how you did this with your neighbor
# If you both did it the same way, try it a different way.

Unnamed: 0,Name,Age
A,Hope,10
B,Peace,7
C,Joy,4
D,Love,11


## Select a column, row, cell, or rectangular region of a DataFrame
### Data lookup: Series
- `s.loc[X]`   <- lookup by pandas index
- `s.iloc[X]`  <- lookup by integer position

In [20]:
hours = Series({"Alice":6, "Bob":7, "Cindy":8, "Dan":9})
hours

Alice    6
Bob      7
Cindy    8
Dan      9
dtype: int64

In [21]:
# Lookup Bob's hours by pandas index.
hours.loc["Bob"]

7

In [22]:
# Lookup Cindy's hours by integer position (Cindy is at position 2; Bob would be position 1).
hours.iloc[2]

8

In [23]:
# Lookup Cindy's hours by pandas index.
hours.loc["Cindy"]

8

###  Data lookup: DataFrame


- `d.loc[r]`     lookup ROW by pandas ROW index
- `d.iloc[r]`    lookup ROW by ROW integer position
- `d[c]`         lookup COL by pandas COL index
- `d.loc[r, c]`  lookup by pandas ROW index and pandas COL index
- `d.iloc[r, c]`  lookup by ROW integer position and COL integer position

In [24]:
# We often call the object that we make df
data = [
    ["Hope", 10],
    ["Peace", 7],
    ["Joy", 4],
    ["Love", 11]
]
df = DataFrame(data, index = ["H", "P", "J", "L"], columns = ["Player name", "Score"])
df

Unnamed: 0,Player name,Score
H,Hope,10
P,Peace,7
J,Joy,4
L,Love,11


### What are 3 different ways of accessing row L? 

In [25]:
#df["L"] # Nope!
print(df.loc["L"])
print(df.iloc[3])
print(df.iloc[-1])

Player name    Love
Score            11
Name: L, dtype: object
Player name    Love
Score            11
Name: L, dtype: object
Player name    Love
Score            11
Name: L, dtype: object


### How about accessing a column?

In [26]:
df

Unnamed: 0,Player name,Score
H,Hope,10
P,Peace,7
J,Joy,4
L,Love,11


In [27]:
print(df["Player name"])
#df[0] # Doesn't work!

H     Hope
P    Peace
J      Joy
L     Love
Name: Player name, dtype: object


### What are 3 different ways to access a single cell?

In [28]:
df

Unnamed: 0,Player name,Score
H,Hope,10
P,Peace,7
J,Joy,4
L,Love,11


In [29]:
# How to access Joy (the "Player name" cell in row "J")?
#print(df["J", "Player name"]) # Nope!
# 1. .loc with a row label and a column label
print(df.loc["J", "Player name"])
# 2. select the column first, then look up the row label in that Series
print(df["Player name"].loc["J"])
# 3. .iloc with a row integer position and a column integer position
print(df.iloc[2, 0])

Joy
Joy
Joy


## How to set values for a specific entry?

- `d.loc[r, c] = new_val`
- `d.iloc[r, c] = new_val`

In [30]:
# change the name of the player in row "L"
df.loc["L", "Player name"] = "Luisa"
df

Unnamed: 0,Player name,Score
H,Hope,10
P,Peace,7
J,Joy,4
L,Luisa,11


In [31]:
# then add 3 to that player's score using .loc
df.loc["L","Score"] += 3
df

Unnamed: 0,Player name,Score
H,Hope,10
P,Peace,7
J,Joy,4
L,Luisa,14


In [32]:
# add 7 to a different player's score using .iloc
df.iloc[0, 1] += 7
df

Unnamed: 0,Player name,Score
H,Hope,17
P,Peace,7
J,Joy,4
L,Luisa,14


### Find the max score and the mean score

In [33]:
# find the max and mean of the "Score" column
print(df["Score"].max(), df["Score"].mean())

17 10.5


### Find the highest scoring player

In [34]:
df

Unnamed: 0,Player name,Score
H,Hope,17
P,Peace,7
J,Joy,4
L,Luisa,14


In [35]:
highest_scorer = df["Score"].idxmax()
df["Player name"].loc[highest_scorer]

'Hope'

##  Slicing a DataFrame

- `df.iloc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using integer positions
- `df.loc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using index

In [36]:
df.iloc[1:3, 0:2]

Unnamed: 0,Player name,Score
P,Peace,7
J,Joy,4


In [37]:
df.loc["P":"J", "Player name":"Score"] # notice that this way is inclusive of endpoints

Unnamed: 0,Player name,Score
P,Peace,7
J,Joy,4


## Set values for sliced DataFrame

- `d.loc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW INDEX and COL INDEX
- `d.iloc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW Integer position and COL Integer position

In [38]:
df

Unnamed: 0,Player name,Score
H,Hope,17
P,Peace,7
J,Joy,4
L,Luisa,14


In [39]:
df.loc["P":"J", "Score"] += 5
df

Unnamed: 0,Player name,Score
H,Hope,17
P,Peace,12
J,Joy,9
L,Luisa,14


### Pandas allows selecting non-contiguous rows (or columns) with a list of labels

In [40]:
# just get Player name for index labels "P" and "L"
df.loc[["P", "L"],"Player name"]

P    Peace
L    Luisa
Name: Player name, dtype: object

In [41]:
# add 2 to the scores of the people in rows "P" and "L"
df.loc[["P", "L"],"Score"] += 2
df

Unnamed: 0,Player name,Score
H,Hope,17
P,Peace,14
J,Joy,9
L,Luisa,16


## Boolean indexing on a DataFrame

- `d[BOOL SERIES]`  <- makes a new DF containing only the rows whose matching Boolean value is True

In [42]:
df

Unnamed: 0,Player name,Score
H,Hope,17
P,Peace,14
J,Joy,9
L,Luisa,16


### Make a Series of Booleans based on Score >= 15

In [43]:
b = df["Score"] >= 15
b

H     True
P    False
J    False
L     True
Name: Score, dtype: bool

### use b to slice the DataFrame
if b is true, include this row in the new df

In [44]:
df[b]

Unnamed: 0,Player name,Score
H,Hope,17
L,Luisa,16


### do the last two things in a single step

In [45]:
df[df["Score"] >= 15]

Unnamed: 0,Player name,Score
H,Hope,17
L,Luisa,16


## Creating DataFrame from csv

In [46]:
# it's that easy!  
df = pd.read_csv("IMDB-Movie-Data.csv")
df

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...,...
1063,1063,Guardians of the Galaxy Vol. 2,"Action, Adventure, Comedy",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",2017,136,7.6,389.81
1064,1064,Baby Driver,"Action, Crime, Drama",Edgar Wright,"Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...",2017,113,7.6,107.83
1065,1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34
1066,1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58


###   View the first few lines of the DataFrame
- `.head(n)` gets the first n lines, 5 is the default

In [47]:
df.head()

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02


### get the first 2 rows

In [48]:
df.head(2)

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M


###   View the last few lines of the DataFrame
- `.tail(n)` gets the last n lines, 5 is the default

In [49]:
df.tail()

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
1063,1063,Guardians of the Galaxy Vol. 2,"Action, Adventure, Comedy",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",2017,136,7.6,389.81
1064,1064,Baby Driver,"Action, Crime, Drama",Edgar Wright,"Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...",2017,113,7.6,107.83
1065,1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34
1066,1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58
1067,1067,A Star Is Born,"Drama, Music, Romance",Bradley Cooper,"Lady Gaga, Bradley Cooper, Sam Elliott, Greg G...",2018,136,7.6,215.29


In [50]:
df.tail(3)

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
1065,1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34
1066,1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58
1067,1067,A Star Is Born,"Drama, Music, Romance",Bradley Cooper,"Lady Gaga, Bradley Cooper, Sam Elliott, Greg G...",2018,136,7.6,215.29


### What are the first and the last years in our dataset?

In [51]:
print("First year: {}, Last year: {}".format(df["Year"].min(), df["Year"].max()))

First year: 2006, Last year: 2020


In [52]:
### What are the rows that correspond to movies whose title contains "Harry" ? 
df[df["Title"].str.contains("Harry")]

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
114,114,Harry Potter and the Deathly Hallows: Part 2,"Adventure,Drama,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, M...",2011,130,8.1,380.96
314,314,Harry Potter and the Order of the Phoenix,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, B...",2007,138,7.5,292.0
417,417,Harry Potter and the Deathly Hallows: Part 1,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, B...",2010,146,7.7,294.98
472,472,Harry Potter and the Half-Blood Prince,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, M...",2009,153,7.5,301.96


### What is the movie at index 6 ? 

In [53]:
df.iloc[6]

Index                                                       6
Title                                              La La Land
Genre                                      Comedy,Drama,Music
Director                                      Damien Chazelle
Cast        Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....
Year                                                     2016
Runtime                                                   128
Rating                                                    8.3
Revenue                                               151.06M
Name: 6, dtype: object

In [54]:
df

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...,...
1063,1063,Guardians of the Galaxy Vol. 2,"Action, Adventure, Comedy",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",2017,136,7.6,389.81
1064,1064,Baby Driver,"Action, Crime, Drama",Edgar Wright,"Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...",2017,113,7.6,107.83
1065,1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34
1066,1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58


## Notice that there are two index columns
- That happened because when you write a csv from pandas to a file, it writes a new index column
- So if the DataFrame already contains an index, you are going to get two index columns
- Let's fix that problem

### How can you use slicing to get just columns with Title and Year?

In [55]:
df2 = df[["Title", "Year"]]
df2
# notice that this does not have the 'index' column

Unnamed: 0,Title,Year
0,Guardians of the Galaxy,2014
1,Prometheus,2012
2,Split,2016
3,Sing,2016
4,Suicide Squad,2016
...,...,...
1063,Guardians of the Galaxy Vol. 2,2017
1064,Baby Driver,2017
1065,Only the Brave,2017
1066,Incredibles 2,2018


### How can you use slicing to get rid of the first column?

In [56]:
# Keep all the rows and every column except column 0.
# Caution: this reassigns `df`, so re-running this cell removes one more
# column each time it is executed.
df = df.iloc[:, 1:] #all the rows, not column 0
df

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...
1063,Guardians of the Galaxy Vol. 2,"Action, Adventure, Comedy",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",2017,136,7.6,389.81
1064,Baby Driver,"Action, Crime, Drama",Edgar Wright,"Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...",2017,113,7.6,107.83
1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34
1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58


### Write a df to a csv file

In [57]:
df.to_csv("better_movies.csv", index = False)

## Practice on your own.....Data Analysis with Data Frames


### What are all the movies that have above average run time (long movies)? 

In [58]:
long_movies = df [df["Runtime"] > df["Runtime"].mean()]
long_movies

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
6,La La Land,"Comedy,Drama,Music",Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,151.06M
...,...,...,...,...,...,...,...,...
1060,Just Mercy,"Biography, Crime, Drama",Destin Daniel Cretton,"Michael B. Jordan, Jamie Foxx, Brie Larson, Ch...",2019,137,7.6,50.4
1063,Guardians of the Galaxy Vol. 2,"Action, Adventure, Comedy",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",2017,136,7.6,389.81
1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34
1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58


### Which long movie has the lowest rating?

In [59]:
min_rating = long_movies["Rating"].min()
min_rating

3.2

In [60]:
# Which movies had this min rating?
long_movies[long_movies["Rating"] == min_rating]

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
646,Tall Men,"Fantasy,Horror,Thriller",Jonathan Holbrook,"Dan Crisafulli, Kay Whitney, Richard Garcia, P...",2016,133,3.2,0


### What are all long movies with someone in the cast named "Emma" ? 

In [61]:
long_movies[long_movies["Cast"].str.contains("Emma")]

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
6,La La Land,"Comedy,Drama,Music",Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,151.06M
92,The Help,Drama,Tate Taylor,"Emma Stone, Viola Davis, Octavia Spencer, Bryc...",2011,146,8.1,169.71M
114,Harry Potter and the Deathly Hallows: Part 2,"Adventure,Drama,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, M...",2011,130,8.1,380.96
157,"Crazy, Stupid, Love.","Comedy,Drama,Romance",Glenn Ficarra,"Steve Carell, Ryan Gosling, Julianne Moore, Em...",2011,118,7.4,84.24
253,The Amazing Spider-Man 2,"Action,Adventure,Sci-Fi",Marc Webb,"Andrew Garfield, Emma Stone, Jamie Foxx, Paul ...",2014,142,6.7,202.85
314,Harry Potter and the Order of the Phoenix,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, B...",2007,138,7.5,292
367,The Amazing Spider-Man,"Action,Adventure",Marc Webb,"Andrew Garfield, Emma Stone, Rhys Ifans, Irrfa...",2012,136,7.0,262.03
417,Harry Potter and the Deathly Hallows: Part 1,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, B...",2010,146,7.7,294.98
472,Harry Potter and the Half-Blood Prince,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, M...",2009,153,7.5,301.96
609,Beautiful Creatures,"Drama,Fantasy,Romance",Richard LaGravenese,"Alice Englert, Viola Davis, Emma Thompson,Alde...",2013,124,6.2,19.45


### What is the title of the shortest movie?

In [62]:
df[df["Runtime"] == df["Runtime"].min()]["Title"]

792    Ma vie de Courgette
Name: Title, dtype: object

### What movie had the highest revenue?

In [63]:
# What movie had the highest revenue?
# df["Revenue"].max() did not work: the column holds strings such as
# "333.13" and "126.46M" rather than numbers,
# so we need to clean our data first.

def format_revenue(revenue):
    """Convert one raw "Revenue" entry into a float.

    String entries are parsed as a number and multiplied by 1e6; an
    optional trailing "M" (as in "126.46M") and surrounding whitespace
    are ignored.

    Parameters
    ----------
    revenue : str or float
        A raw revenue string, or a float that has already been converted.

    Returns
    -------
    float
        The parsed number times 1e6 for string input; the input itself,
        unchanged, for float input.

    Raises
    ------
    ValueError
        If the string (without the optional "M") is not a valid number.
    """
    # Floats pass through untouched, so applying the function to already
    # converted values is harmless. isinstance (rather than type(...) == float)
    # also accepts float subclasses such as numpy.float64.
    if isinstance(revenue, float):
        return revenue
    text = revenue.strip()
    if text.endswith("M"): # some have an "M" at the end
        text = text[:-1]
    return float(text) * 1e6

In [64]:
# What movie had the highest revenue?
revenue = df["Revenue"].apply(format_revenue) # apply a function to a column
print(revenue.head())
max_revenue = revenue.max()

# make a copy of our df
rev_df = df.copy()
rev_df["Revenue (float)"] = revenue
rev_df

0    333130000.0
1    126460000.0
2    138120000.0
3    270320000.0
4    325020000.0
Name: Revenue, dtype: float64


Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue,Revenue (float)
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13,333130000.0
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M,126460000.0
2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M,138120000.0
3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32,270320000.0
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02,325020000.0
...,...,...,...,...,...,...,...,...,...
1063,Guardians of the Galaxy Vol. 2,"Action, Adventure, Comedy",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",2017,136,7.6,389.81,389810000.0
1064,Baby Driver,"Action, Crime, Drama",Edgar Wright,"Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...",2017,113,7.6,107.83,107830000.0
1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34,18340000.0
1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58,608580000.0


In [65]:
# Now we can answer the question!
rev_df[rev_df["Revenue (float)"] == max_revenue]

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue,Revenue (float)
50,Star Wars: Episode VII - The Force Awakens,"Action,Adventure,Fantasy",J.J. Abrams,"Daisy Ridley, John Boyega, Oscar Isaac, Domhna...",2015,136,8.1,936.63,936630000.0


In [66]:
# Or more generally...
rev_df.sort_values(by="Revenue (float)", ascending=False)

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue,Revenue (float)
50,Star Wars: Episode VII - The Force Awakens,"Action,Adventure,Fantasy",J.J. Abrams,"Daisy Ridley, John Boyega, Oscar Isaac, Domhna...",2015,136,8.1,936.63,936630000.0
1006,Avengers: Endgame,"Action, Adventure, Drama",Anthony Russo,"Joe Russo, Robert Downey Jr., Chris Evans, Mar...",2019,181,8.4,858.37,858370000.0
87,Avatar,"Action,Adventure,Fantasy",James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",2009,162,7.8,760.51,760510000.0
1007,Avengers: Infinity War,"Action, Adventure, Sci-Fi",Anthony Russo,"Joe Russo, Robert Downey Jr., Chris Hemsworth,...",2018,149,8.4,678.82,678820000.0
85,Jurassic World,"Action,Adventure,Sci-Fi",Colin Trevorrow,"Chris Pratt, Bryce Dallas Howard, Ty Simpkins,...",2015,124,7.0,652.18,652180000.0
...,...,...,...,...,...,...,...,...,...
974,Dark Places,"Drama,Mystery,Thriller",Gilles Paquet-Brenner,"Charlize Theron, Nicholas Hoult, Christina Hen...",2015,113,6.2,0,0.0
183,Realive,Sci-Fi,Mateo Gil,"Tom Hughes, Charlotte Le Bon, Oona Chaplin, Ba...",2016,112,5.9,0,0.0
218,A Dark Song,"Drama,Horror",Liam Gavin,"Mark Huberman, Susan Loughnane, Steve Oram,Cat...",2016,100,6.1,0,0.0
397,Absolutely Anything,"Comedy,Sci-Fi",Terry Jones,"Simon Pegg, Kate Beckinsale, Sanjeev Bhaskar, ...",2015,85,6.0,0,0.0


In [67]:
df

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...
1063,Guardians of the Galaxy Vol. 2,"Action, Adventure, Comedy",James Gunn,"Chris Pratt, Zoe Saldana, Dave Bautista, Vin D...",2017,136,7.6,389.81
1064,Baby Driver,"Action, Crime, Drama",Edgar Wright,"Ansel Elgort, Jon Bernthal, Jon Hamm, Eiza Gon...",2017,113,7.6,107.83
1065,Only the Brave,"Action, Biography, Drama",Joseph Kosinski,"Josh Brolin, Miles Teller, Jeff Bridges, Jenni...",2017,134,7.6,18.34
1066,Incredibles 2,"Animation, Action, Adventure",Brad Bird,"Craig T. Nelson, Holly Hunter, Sarah Vowell, H...",2018,118,7.6,608.58


### What is the average runtime for movies by "Francis Lawrence"?

In [68]:
fl_movies = df[df["Director"] == "Francis Lawrence"]
fl_movies["Runtime"].mean()

126.75

### Which director had the highest average rating? 

In [69]:
# One way is to make a plain Python dict of director -> list of ratings.
director_dict = dict()

# Walk the "Director" and "Rating" columns in parallel.
# (Looking rows up with df.loc[i, ...] for i in range(len(df)) would only
# work while the index labels are exactly 0, 1, 2, ..., because .loc is
# label-based, not position-based.)
for director, rating in zip(df["Director"], df["Rating"]):
    # setdefault creates the empty list the first time a director is seen
    director_dict.setdefault(director, []).append(rating)

# Make a ratings dict: key is director, value is the average rating.
# Only include directors with > 4 movies.
ratings_dict = {k: sum(v) / len(v) for (k, v) in director_dict.items() if len(v) > 4}

# Sort the dict by its values (the averages), highest first.
dict(sorted(ratings_dict.items(), key = lambda t:t[-1], reverse = True))

{'Christopher Nolan': 8.533333333333333,
 'Martin Scorsese': 7.916666666666667,
 'Quentin Tarantino': 7.840000000000001,
 'David Fincher': 7.8199999999999985,
 'Denis Villeneuve': 7.8,
 'J.J. Abrams': 7.58,
 'Guy Ritchie': 7.5,
 'David Yates': 7.433333333333334,
 'Danny Boyle': 7.42,
 'Antoine Fuqua': 7.040000000000001,
 'Zack Snyder': 7.040000000000001,
 'Woody Allen': 7.019999999999999,
 'Peter Berg': 6.860000000000001,
 'Ridley Scott': 6.85,
 'Justin Lin': 6.82,
 'Michael Bay': 6.483333333333334,
 'Paul W.S. Anderson': 5.766666666666666,
 'M. Night Shyamalan': 5.533333333333332}

In [70]:
# FOR DEMONSTRATION PURPOSES ONLY
# We haven't learnt about "groupby"
# Pandas has many operations which will be helpful!

# Consider what you already know, and what Pandas can solve
# when formulating your solutions.
rating_groups = df.groupby("Director")["Rating"]
rating_groups.mean()[rating_groups.count() > 4].sort_values(ascending=False)

Director
Christopher Nolan     8.533333
Martin Scorsese       7.916667
Quentin Tarantino     7.840000
David Fincher         7.820000
Denis Villeneuve      7.800000
J.J. Abrams           7.580000
Guy Ritchie           7.500000
David Yates           7.433333
Danny Boyle           7.420000
Antoine Fuqua         7.040000
Zack Snyder           7.040000
Woody Allen           7.020000
Peter Berg            6.860000
Ridley Scott          6.850000
Justin Lin            6.820000
Michael Bay           6.483333
Paul W.S. Anderson    5.766667
M. Night Shyamalan    5.533333
Name: Rating, dtype: float64

In [None]:
# Extra Practice: Make up some of your own questions about the movies