# Monday, Nov 15
## Pandas 2 - DataFrames

Core ideas:
 - DataFrames are tables, 
     - meant to be similar to Spreadsheets
     - each column in the table is a Series


## A. Series Review
## Data alignment  (element-wise operation: series op series)

In [2]:
import pandas as pd
from pandas import Series, DataFrame

# importing both ways lets us write either pd.Series(...) or just Series(...)

## Volunteer Hours

In [3]:
# We can make a Series from a Python dict
week1= Series({"Rita":5, "Therese":3, "Janice": 6})
week2 = Series({"Rita":3, "Therese":7, "Janice": 4})
week3 = Series({"Therese":5, "Janice":5, "Rita": 8})   # wrong order!
print(week1)
print(week2)
print(week3)

Rita       5
Therese    3
Janice     6
dtype: int64
Rita       3
Therese    7
Janice     4
dtype: int64
Therese    5
Janice     5
Rita       8
dtype: int64


## Give everyone 3 more hours in Week 1

In [4]:
week1 = week1 + 3
week1

Rita       8
Therese    6
Janice     9
dtype: int64

## What is week1 +  week2 + week3?

In [5]:
week1 + week2 + week3

Janice     18
Rita       19
Therese    18
dtype: int64

## What is week1 / week2 ?

In [6]:
week1 / week2
# notice that we didn't have to worry about the order of indices

Rita       2.666667
Therese    0.857143
Janice     2.250000
dtype: float64

## What is week1 > week2?

In [7]:
print(week1)
print(week2)
week1 > week2 # indices are ordered the same

Rita       8
Therese    6
Janice     9
dtype: int64
Rita       3
Therese    7
Janice     4
dtype: int64


Rita        True
Therese    False
Janice      True
dtype: bool

## What is week1 > week3?

In [8]:
print(week1)
print(week3)
# week1 > week3 fails because the indices are not in the same order.
# Catch the error and show its message so the rest of the notebook
# still runs under "Restart & Run All".
try:
    week1 > week3
except ValueError as err:
    print(f"ValueError: {err}")

Rita       8
Therese    6
Janice     9
dtype: int64
Therese    5
Janice     5
Rita       8
dtype: int64


ValueError: Can only compare identically-labeled Series objects

## There is a method called .gt, for Greater Than

In [170]:
week1
week3
week1.gt(week3)

Janice      True
Rita       False
Therese     True
dtype: bool

## eq (==), ne (!=), ge (>=), le (<=), and lt (<) are the other options

In [9]:
print(week1)
print(week3)
print(week1.eq(week3))
print(week1.ne(week3))
print(week1.ge(week3))
print(week1.le(week3))

Rita       8
Therese    6
Janice     9
dtype: int64
Therese    5
Janice     5
Rita       8
dtype: int64
Janice     False
Rita        True
Therese    False
dtype: bool
Janice      True
Rita       False
Therese     True
dtype: bool
Janice     True
Rita       True
Therese    True
dtype: bool
Janice     False
Rita        True
Therese    False
dtype: bool



# Data Frames store 2-dimensional data in tables

## B. A DataFrame can be created from:
1. dict of Series
2. dict of lists
3. list of lists
4. dict of dicts
5. list of dicts

### DataFrame from dictionary of Series

In [10]:
col1 = Series(["Alice", "Bob", "Cindy", "Dan"])
col2 = Series([6, 7, 8, 9])
# to make a dictionary of Series, need to write column names for the keys
DataFrame({
    "Player name": col1,
    "Score": col2
})

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from dictionary of lists

In [11]:
name_list = ["Alice", "Bob", "Cindy", "Dan"]
score_list = [6, 7, 8, 9]
# this is the same as above, reminding us that Series act like lists
DataFrame({
    "Player name": name_list,
    "Score": score_list
})

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from list of lists

In [12]:
# each inner list becomes one row of the table
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
data
DataFrame(data)
# notice we gave no column names, so the columns are just labeled 0 and 1

Unnamed: 0,0,1
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


In [13]:
# Reminder:  A series can be made from a dict
Series({0: "Alice", 1: "Bob", 2: "Cindy", 3: "Dan"})

0    Alice
1      Bob
2    Cindy
3      Dan
dtype: object

### DataFrame from dictionary of dicts

In [14]:
# do you see how this is just like the previous examples?

data = {
    "Player name": {0: "Alice", 1: "Bob", 2: "Cindy", 3: "Dan"},
    "Score": {0: 6, 1: 7, 2: 8, 3: 9}
}
DataFrame(data)

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from list of dicts

In [15]:
data = [
    {"Player name": "Alice", "Score": 6},
    {"Player name": "Bob", "Score": 7},
    {"Player name": "Cindy", "Score": 8},
    {"Player name": "Dan", "Score": 9}
]
data
DataFrame(data)

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


In [16]:
# We can use keyword arguments to rename the index of each row
data = [
    {"Player name": "Alice", "Score": 6},
    {"Player name": "Bob", "Score": 7},
    {"Player name": "Cindy", "Score": 8},
    {"Player name": "Dan", "Score": 9}
]
data
DataFrame(data, index=["A", "B", "C", "D"]) # must have a name for each row

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


### Naming the Columns

In [17]:
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
DataFrame(data, columns=["Player name", "Score"])


Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


In [18]:
# The same list of lists once more; this cell only displays the raw data.
# Passing both index= and columns= to DataFrame(...) names the rows AND the columns.
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
data

[['Alice', 6], ['Bob', 7], ['Cindy', 8], ['Dan', 9]]

## 3. Rules of Data Lookup:
## Data lookup: Series
- s.loc[X]   <- lookup by pandas index
- s.iloc[X]  <- lookup by integer position
- s[X]       <- depends (first try index, use integer position if necessary; recent pandas versions deprecate the integer fallback, so prefer s.iloc[X] for positions)

In [67]:
col1 = Series({"Alice":6, "Bob":7, "Cindy":8, "Dan":9})
col1

Alice    6
Bob      7
Cindy    8
Dan      9
dtype: int64

In [68]:
col1.loc["Bob"] #Series index

7

In [69]:
col1.iloc[2] #Series integer position

8

In [70]:
col1["Cindy"] #Series index

8

In [71]:
col1[1] #Series integer position (1 is not an index label, so pandas falls back to position)
#No conflict between index and integer position in this example!
# NOTE: recent pandas versions deprecate this positional fallback; col1.iloc[1] is the explicit way

7

## Data lookup

### Series
- s.loc[X]   <- lookup by pandas index
- s.iloc[X]  <- lookup by integer position
- s[X]       <- depends (first try index, use integer position if necessary; recent pandas versions deprecate the integer fallback, so prefer s.iloc[X] for positions)

### DataFrame

- `d.loc[r]`     lookup ROW by pandas ROW index
- `d.iloc[r]`    lookup ROW by ROW integer position
- `d[c]`         lookup COL by COL index
- `d.loc[r, c]`  lookup by ROW index and COL index
- `d.iloc[r, c]`  lookup by ROW integer position and COL integer position

In [75]:
# we often call the object that we make df
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
df = DataFrame(data, index=["A", "B", "C", "D"], columns = ["Player name", "Score"])
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


### What are all the different ways of accessing row D?

In [97]:
#df["D"] # Nope!
print(df.loc["D"])
print(df.iloc[3])
print(df.iloc[-1])

Player name    Dan
Score            9
Name: D, dtype: object
Player name    Dan
Score            9
Name: D, dtype: object
Player name    Dan
Score            9
Name: D, dtype: object


In [86]:
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


### What is the simplest way to access a column?

In [98]:
#df[0] # Nope!
print(df["Player name"])

A    Alice
B      Bob
C    Cindy
D      Dan
Name: Player name, dtype: object


### What are the ways to access a single cell?

In [87]:
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


In [94]:
# How to access Cindy?
#print(df["C", "Player name"]) # Nope!
print(df.loc["C", "Player name"])
print(df.iloc[2, 0])
print(df.iloc[-2, 0])

Cindy
Cindy
Cindy


## How to set values for a specific entry?

- d.loc[r, c] = new_val
- d.iloc[r, c] = new_val 

In [102]:
df.loc["D", "Player name"] = "Bianca"
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Bianca,9


In [105]:
df.loc["B","Score"] += 3
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,10
C,Cindy,8
D,Bianca,9


In [106]:
df.iloc[-1,1] += 2
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,10
C,Cindy,8
D,Bianca,11


## How to compute the max (or mean) score of a column?

In [113]:
print(df["Score"].max(), df["Score"].mean())


11 8.75


## Slicing DataFrame

- df.iloc[ROW_SLICE, COL_SLICE] <- take a rectangular slice from the DataFrame using integer positions
- df.loc[ROW_SLICE, COL_SLICE] <- take a rectangular slice from the DataFrame using index

In [114]:
df.iloc[1:3, 0:2]

Unnamed: 0,Player name,Score
B,Bob,10
C,Cindy,8


In [117]:
df.loc["B":"C", "Player name":"Score"] # notice that .loc slices are inclusive: both end labels are kept

Unnamed: 0,Player name,Score
B,Bob,10
C,Cindy,8


## How to set values for sliced DataFrame?

- d.loc[ROW_SLICE, COL_SLICE] = new_val <- set value by ROW INDEX and COL INDEX
- d.iloc[ROW_SLICE, COL_SLICE] = new_val <- set value by ROW Integer position and COL Integer position

In [118]:
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,10
C,Cindy,8
D,Bianca,11


In [120]:

df.loc["B":"C", "Score"] += 5
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,15
C,Cindy,13
D,Bianca,11


## Instead of a slice, you could use a list of indexes or integer positions.

In [121]:

df.loc[["B", "D"],"Player name"]

B       Bob
D    Bianca
Name: Player name, dtype: object

In [123]:

df.loc[["B", "D"],"Score"] += 2

## Boolean indexing

### Series
- s[BOOL SERIES]  <- gets all s values lined up with True

### DataFrame
- d[BOOL SERIES]  <- pulls out rows lined up with True

In [126]:
df

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,17
C,Cindy,13
D,Bianca,13


In [127]:
b = df["Score"] >= 15
b


A    False
B     True
C    False
D    False
Name: Score, dtype: bool

In [128]:
df[b]

Unnamed: 0,Player name,Score
B,Bob,17


In [129]:
df[df["Score"] >= 15]

Unnamed: 0,Player name,Score
B,Bob,17


## 4. Creating DataFrame from csv

In [19]:
# It's that easy! read_csv loads the whole CSV file into a DataFrame
df = pd.read_csv("IMDB-Movie-Data.csv")
df

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...,...
993,993,Secret in Their Eyes,"Crime,Drama,Mystery",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,0
994,994,Hostel: Part II,Horror,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,17.54
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0


## How to see first few lines of the DataFrame?

In [131]:
df.head()

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02


In [132]:
df.head(2)

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M


## How to see last few lines of the DataFrame?

In [133]:
df.tail()

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
993,993,Secret in Their Eyes,"Crime,Drama,Mystery",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,0.0
994,994,Hostel: Part II,Horror,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,17.54
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0.0
997,997,Nine Lives,"Comedy,Family,Fantasy",Barry Sonnenfeld,"Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...",2016,87,5.3,19.64


In [135]:
df.tail(3)

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0.0
997,997,Nine Lives,"Comedy,Family,Fantasy",Barry Sonnenfeld,"Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...",2016,87,5.3,19.64


In [141]:
print(df["Year"].min(), df["Year"].max(), sep="\t")

2006	2016


In [148]:
df[df["Title"] == "La La Land" ]

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
6,6,La La Land,"Comedy,Drama,Music",Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,151.06M


In [20]:
df[df["Title"] == "The Imitation Game" ]

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
198,198,The Imitation Game,"Biography,Drama,Thriller",Morten Tyldum,"Benedict Cumberbatch, Keira Knightley, Matthew...",2014,114,8.1,91.12


In [151]:
df.iloc[6]

Index                                                       6
Title                                              La La Land
Genre                                      Comedy,Drama,Music
Director                                      Damien Chazelle
Cast        Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....
Year                                                     2016
Runtime                                                   128
Rating                                                    8.3
Revenue                                               151.06M
Name: 6, dtype: object

In [152]:
df

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...,...
993,993,Secret in Their Eyes,"Crime,Drama,Mystery",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,0
994,994,Hostel: Part II,Horror,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,17.54
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0


## Notice that there are two index columns
- That happened because `to_csv` writes the DataFrame's index to the file as an extra column by default
- So if the DataFrame already contains an index column, you are going to get two index columns when you read the file back
- Let's fix that problem

### How can you use slicing to get rid of the first column?

In [21]:
df2 =  df.iloc[:, 1:] #all the rows, not column 0
df2

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...
993,Secret in Their Eyes,"Crime,Drama,Mystery",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,0
994,Hostel: Part II,Horror,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,17.54
995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0


### Wrong way to write a df to a csv file

In [156]:
df2.to_csv("wrong_movies.csv")

### Correct way to write a df to a csv file

In [157]:
df2.to_csv("better_movies.csv", index = False)

## 5. Data Analysis with Data Frames


In [24]:
##What is the highest rated movie that had an above average runtime?
long_movies = df[df["Runtime"] > df["Runtime"].mean() ]
long_movies.head(10)


Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
6,6,La La Land,"Comedy,Drama,Music",Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,151.06M
8,8,The Lost City of Z,"Action,Adventure,Biography",James Gray,"Charlie Hunnam, Robert Pattinson, Sienna Mille...",2016,141,7.1,8.01
9,9,Passengers,"Adventure,Drama,Romance",Morten Tyldum,"Jennifer Lawrence, Chris Pratt, Michael Sheen,...",2016,116,7.0,100.01M
10,10,Fantastic Beasts and Where to Find Them,"Adventure,Family,Fantasy",David Yates,"Eddie Redmayne, Katherine Waterston, Alison Su...",2016,133,7.5,234.02
11,11,Hidden Figures,"Biography,Drama,History",Theodore Melfi,"Taraji P. Henson, Octavia Spencer, Janelle Mon...",2016,127,7.8,169.27M
12,12,Rogue One,"Action,Adventure,Sci-Fi",Gareth Edwards,"Felicity Jones, Diego Luna, Alan Tudyk, Donnie...",2016,133,7.9,532.17


In [25]:
highest_rating = long_movies["Rating"].max()
highest_rating

9.0

In [164]:
long_movies[long_movies["Rating"] == highest_rating]

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
54,The Dark Knight,"Action,Crime,Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart,Mi...",2008,152,9.0,533.32


In [165]:
long_movies.loc[54, "Cast"]

'Christian Bale, Heath Ledger, Aaron Eckhart,Michael Caine'

In [168]:
long_movies[long_movies["Cast"].str.contains("Emma")]

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
6,La La Land,"Comedy,Drama,Music",Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,151.06M
92,The Help,Drama,Tate Taylor,"Emma Stone, Viola Davis, Octavia Spencer, Bryc...",2011,146,8.1,169.71M
114,Harry Potter and the Deathly Hallows: Part 2,"Adventure,Drama,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, M...",2011,130,8.1,380.96
157,"Crazy, Stupid, Love.","Comedy,Drama,Romance",Glenn Ficarra,"Steve Carell, Ryan Gosling, Julianne Moore, Em...",2011,118,7.4,84.24
253,The Amazing Spider-Man 2,"Action,Adventure,Sci-Fi",Marc Webb,"Andrew Garfield, Emma Stone, Jamie Foxx, Paul ...",2014,142,6.7,202.85
314,Harry Potter and the Order of the Phoenix,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, B...",2007,138,7.5,292
367,The Amazing Spider-Man,"Action,Adventure",Marc Webb,"Andrew Garfield, Emma Stone, Rhys Ifans, Irrfa...",2012,136,7.0,262.03
417,Harry Potter and the Deathly Hallows: Part 1,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, B...",2010,146,7.7,294.98
472,Harry Potter and the Half-Blood Prince,"Adventure,Family,Fantasy",David Yates,"Daniel Radcliffe, Emma Watson, Rupert Grint, M...",2009,153,7.5,301.96
609,Beautiful Creatures,"Drama,Fantasy,Romance",Richard LaGravenese,"Alice Englert, Viola Davis, Emma Thompson,Alde...",2013,124,6.2,19.45


In [169]:
long_movies[long_movies["Title"].str.contains("Hunger")]

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
151,The Hunger Games,"Adventure,Sci-Fi,Thriller",Gary Ross,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsw...",2012,142,7.2,408.0
407,The Hunger Games: Mockingjay - Part 2,"Action,Adventure,Sci-Fi",Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsw...",2015,137,6.6,281.67
577,The Hunger Games: Catching Fire,"Action,Adventure,Mystery",Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsw...",2013,146,7.6,424.65
679,The Hunger Games: Mockingjay - Part 1,"Action,Adventure,Sci-Fi",Francis Lawrence,"Jennifer Lawrence, Josh Hutcherson, Liam Hemsw...",2014,123,6.7,337.1


In [27]:
# short movies
short_movies = df[df["Runtime"] < df["Runtime"].mean() ]
short_movies

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
5,5,The Great Wall,"Action,Adventure,Fantasy",Yimou Zhang,"Matt Damon, Tian Jing, Willem Dafoe, Andy Lau",2016,103,6.1,45.13
7,7,Mindhorn,Comedy,Sean Foley,"Essie Davis, Andrea Riseborough, Julian Barrat...",2016,89,6.4,0
13,13,Moana,"Animation,Adventure,Comedy",Ron Clements,"Auli'i Cravalho, Dwayne Johnson, Rachel House,...",2016,107,7.7,248.75
14,14,Colossal,"Action,Comedy,Drama",Nacho Vigalondo,"Anne Hathaway, Jason Sudeikis, Austin Stowell,...",2016,109,6.4,2.87
...,...,...,...,...,...,...,...,...,...
993,993,Secret in Their Eyes,"Crime,Drama,Mystery",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,0
994,994,Hostel: Part II,Horror,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,17.54
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0


In [29]:
highest_rating = short_movies["Rating"].max()
highest_rating

8.6

In [33]:
short_movies[short_movies["Rating"] == highest_rating]

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
96,96,Kimi no na wa,"Animation,Drama,Fantasy",Makoto Shinkai,"Ryûnosuke Kamiki, Mone Kamishiraishi, Ryô Nari...",2016,106,8.6,4.68
249,249,The Intouchables,"Biography,Comedy,Drama",Olivier Nakache,"François Cluzet, Omar Sy, Anne Le Ny, Audrey F...",2011,112,8.6,13.18


In [35]:
# how many movies are "Romance"? 
len(df[df["Genre"].str.contains("Romance")])

141

In [73]:
# which director had the highest average rating? 

# one way is to make a python dict of director, list of ratings
director_dict = dict()

# make the dictionary: key is director, value is list of ratings
# walking the two columns together avoids a slow df.loc[i, ...] lookup per row
# and does not assume the row labels are exactly 0..len(df)-1
for director, rating in zip(df["Director"], df["Rating"]):
    director_dict.setdefault(director, []).append(rating)

# make a ratings dict: key is director, value is average rating
# only include directors with > 4 movies
ratings_dict = {k: sum(v) / len(v) for (k, v) in director_dict.items() if len(v) > 4}

# sort a dict by values (highest average first)
dict(sorted(ratings_dict.items(), key=lambda t: t[-1], reverse=True))
    

{'Christopher Nolan': 8.680000000000001,
 'Martin Scorsese': 7.92,
 'David Fincher': 7.8199999999999985,
 'Denis Villeneuve': 7.76,
 'J.J. Abrams': 7.58,
 'David Yates': 7.433333333333334,
 'Danny Boyle': 7.42,
 'Antoine Fuqua': 7.040000000000001,
 'Zack Snyder': 7.040000000000001,
 'Woody Allen': 7.019999999999999,
 'Peter Berg': 6.860000000000001,
 'Ridley Scott': 6.85,
 'Justin Lin': 6.82,
 'Michael Bay': 6.483333333333334,
 'Paul W.S. Anderson': 5.766666666666666,
 'M. Night Shyamalan': 5.533333333333332}

In [94]:
#Question:  Can we clean the revenue data? 
#  all revenue is a string, some have "M" at the end
# we can just remove the M and convert it to a float

In [95]:
def format_revenue(revenue):
    """Convert one raw "Revenue" value to a float.

    Revenue values are stored as text and some end with an "M" suffix
    (e.g. "126.46M"); the suffix is dropped before converting.

    Parameters:
        revenue: a string such as "333.13" or "126.46M". Surrounding
            whitespace is ignored. A value that is already a number is
            also accepted and simply converted to float.

    Returns:
        The revenue as a float.

    Raises:
        ValueError: if the text (minus any trailing "M") is not a number.
    """
    if not isinstance(revenue, str):
        # already numeric, nothing to strip
        return float(revenue)
    text = revenue.strip()
    if text.endswith("M"):
        text = text[:-1]
    return float(text)

In [97]:
# let's test our format_revenue on the first 10 rows of data
for i in range(10):
    revenue = df.loc[i, "Revenue" ]
    # check the type of the value itself; the old test, type(revenue[-1] == str),
    # took the type of a comparison result and was therefore always true
    if isinstance(revenue, str):
        print(revenue, format_revenue(revenue))

333.13 333.13
126.46M 126.46
138.12M 138.12
270.32 270.32
325.02 325.02
45.13 45.13
151.06M 151.06
0 0.0
8.01 8.01
100.01M 100.01


In [99]:
# Is rating correlated with revenue?
rating = df["Rating"]
# .apply calls format_revenue once per value and collects the results in a new Series
revenue = df["Revenue"].apply(format_revenue) # apply a function to a column
type(revenue)
# Series.corr gives one correlation coefficient for the two Series
print(rating.corr(revenue))

pandas.core.series.Series

## DataFrame.describe()
- provides a lot of useful stats
- by default, summarizes only the columns with numbers as values

In [159]:
stats = df.describe()
stats

Unnamed: 0,Year,Runtime,Rating
count,998.0,998.0,998.0
mean,2012.779559,113.170341,6.723447
std,3.207549,18.828877,0.945682
min,2006.0,66.0,1.9
25%,2010.0,100.0,6.2
50%,2014.0,111.0,6.8
75%,2016.0,123.0,7.4
max,2016.0,191.0,9.0


### How to get median runtime of all the movies?

In [161]:
stats.loc["50%", "Runtime"]

111.0

In [102]:
df.sort_values(["Rating", "Runtime"], ascending=False)

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
54,54,The Dark Knight,"Action,Crime,Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart,Mi...",2008,152,9.0,533.32
117,117,Dangal,"Action,Biography,Drama",Nitesh Tiwari,"Aamir Khan, Sakshi Tanwar, Fatima Sana Shaikh,...",2016,161,8.8,11.15
80,80,Inception,"Action,Adventure,Sci-Fi",Christopher Nolan,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen...",2010,148,8.8,292.57
36,36,Interstellar,"Adventure,Drama,Sci-Fi",Christopher Nolan,"Matthew McConaughey, Anne Hathaway, Jessica Ch...",2014,169,8.6,187.99M
249,249,The Intouchables,"Biography,Comedy,Drama",Olivier Nakache,"François Cluzet, Omar Sy, Anne Le Ny, Audrey F...",2011,112,8.6,13.18
...,...,...,...,...,...,...,...,...,...
966,966,Wrecker,"Action,Horror,Thriller",Micheal Bafaro,"Anna Hutchison, Andrea Whitburn, Jennifer Koen...",2015,83,3.5,0
646,646,Tall Men,"Fantasy,Horror,Thriller",Jonathan Holbrook,"Dan Crisafulli, Kay Whitney, Richard Garcia, P...",2016,133,3.2,0
870,870,Dragonball Evolution,"Action,Adventure,Fantasy",James Wong,"Justin Chatwin, James Marsters, Yun-Fat Chow, ...",2009,85,2.7,9.35
42,42,Don't Fuck in the Woods,Horror,Shawn Burkett,"Brittany Blanton, Ayse Howard, Roman Jossart,N...",2016,73,2.7,0
