In [1]:
# Widen the notebook container to the full browser width.
# display/HTML are imported from the public IPython.display module; the
# IPython.core.display path for `display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Data alignment review (element-wise operation: series op series)

In [2]:
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Three small labelled Series. s1 and s2 hold the same label -> value
# pairs, but s2 lists its labels in the opposite order.
x = Series([10, 100], index=["A", "B"])
s1 = Series([2, 3], index=["A", "B"])
s2 = Series([3, 2], index=["B", "A"])
print(x, s1, s2, sep="\n")

A     10
B    100
dtype: int64
A    2
B    3
dtype: int64
B    3
A    2
dtype: int64


## What is x * s1?

In [4]:
print(x)
print(s1)
x * s1

A     10
B    100
dtype: int64
A    2
B    3
dtype: int64


A     20
B    300
dtype: int64

## What is x * s2?

In [5]:
print(x)
print(s2)
x * s2

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A     20
B    300
dtype: int64

## What is x < s1?

In [6]:
print(x)
print(s1)
x < s1

A     10
B    100
dtype: int64
A    2
B    3
dtype: int64


A    False
B    False
dtype: bool

## What is x < s2?

In [7]:
print(x)
print(s2)
# The comparison operators do not re-align labels: x and s2 carry the same
# labels in a different order, so `<` refuses to compare them.
# Catch the error and show it, so the cell demonstrates the failure while
# the notebook still runs top to bottom.
try:
    x < s2
except ValueError as err:
    print("ValueError:", err)

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


## Oops, let's try series.lt(series)

In [8]:
print(x)
print(s2)
x.lt(s2) # lt is less than

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    False
B    False
dtype: bool

## How would you apply greater than?

In [9]:
print(x)
print(s2)
x.gt(s2) # greater than

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    True
B    True
dtype: bool

## What about equal comparison?

In [10]:
print(x)
print(s2)
x.eq(s2) # equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    False
B    False
dtype: bool

## ge (>=), le (<=), and ne (!=) are the other options

In [11]:
print(x)
print(s2)
x.ge(s2) # greater than or equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    True
B    True
dtype: bool

In [12]:
print(x)
print(s2)
x.le(s2) # less than or equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    False
B    False
dtype: bool

In [13]:
print(x)
print(s2)
x.ne(s2) # not equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    True
B    True
dtype: bool

## Can we fix the ordering of s2 series?
### Try series.sort_index()

In [14]:
print(s2)
s2_sorted = s2.sort_index()
s2_sorted

B    3
A    2
dtype: int64


A    2
B    3
dtype: int64

## DataFrame can be created from:
1. dict of Series
2. dict of lists
3. list of lists
4. dict of dicts
5. list of dicts

### DataFrame from dictionary of Series

In [15]:
col1 = Series(["Alice", "Bob", "Cindy", "Dan"])
col2 = Series([6, 7, 8, 9])
DataFrame({
    "Player name": col1,
    "Score": col2
})

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from dictionary of lists

In [16]:
DataFrame({
    "Name": ["Alice", "Bob", "Cindy", "Dan"],
    "Score": [6, 7, 8, 9]
})

Unnamed: 0,Name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from list of lists

In [17]:
# Each inner list is one row. No column names are supplied, so the
# resulting DataFrame's columns are labelled 0 and 1.
data = [
    [name, score]
    for name, score in zip(["Alice", "Bob", "Cindy", "Dan"], [6, 7, 8, 9])
]
DataFrame(data)

Unnamed: 0,0,1
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from dictionary of dicts

In [18]:
# Outer keys become column names; the inner keys (0..3, produced by
# enumerate) become the row index.
data = {
    "Name": dict(enumerate(["Alice", "Bob", "Cindy", "Dan"])),
    "Score": dict(enumerate([6, 7, 8, 9])),
}

DataFrame(data)

Unnamed: 0,Name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


In [19]:
# Same shape as the previous dict of dicts, but the inner keys are the
# letters A-D, so the rows are labelled A-D.
data = {
    "Name": dict(zip("ABCD", ["Alice", "Bob", "Cindy", "Dan"])),
    "Score": dict(zip("ABCD", [6, 7, 8, 9])),
}

DataFrame(data)

Unnamed: 0,Name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


### DataFrame from list of dicts

In [20]:
# One dict per row; the dict keys become the column names.
data = [
    {"Player name": name, "Scores": score}
    for name, score in zip(["Alice", "Bob", "Cindy", "Dan"], [6, 7, 8, 9])
]
DataFrame(data)

Unnamed: 0,Player name,Scores
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### Renaming the row index

In [21]:
# Player/score table whose rows are labelled A-D through the `index`
# argument, rather than with the default integer labels.
df = DataFrame(
    {
        "Player name": ["Alice", "Bob", "Cindy", "Dan"],
        "Scores": [6, 7, 8, 9],
    },
    index=list("ABCD"),
)
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


## Data lookup

### Series
- s.loc[X]   <- lookup by pandas index
- s.iloc[X]  <- lookup by integer position
- s[X]       <- depends (first try index, use integer position if necessary; this positional fallback is deprecated in recent pandas, so prefer s.iloc[X] for positions)

In [22]:
# Scores labelled by player name (the names form the Series index).
col1 = Series([6, 7, 8, 9], index=["Alice", "Bob", "Cindy", "Dan"])
col1

Alice    6
Bob      7
Cindy    8
Dan      9
dtype: int64

In [23]:
col1.loc["Bob"] #Series index

7

In [24]:
col1.iloc[2] #Series integer position

8

In [25]:
col1["Cindy"] #Series index

8

In [26]:
col1[1] #Series integer position 
#No conflict between index and integer position in this example!
# (col1's index holds the strings "Alice".."Dan", so the integer 1 cannot be
#  mistaken for a label and s[X] falls back to position.)
# NOTE(review): recent pandas releases deprecate this positional fallback of
# s[X] -- confirm against the installed version; col1.iloc[1] is the explicit,
# unambiguous way to ask for "the second value".

7

## Data lookup

### Series
- s.loc[X]   <- lookup by pandas index
- s.iloc[X]  <- lookup by integer position
- s[X]       <- depends (first try index, use integer position if necessary; this positional fallback is deprecated in recent pandas, so prefer s.iloc[X] for positions)

### DataFrame
- d.loc[X]    <- lookup ROW by pandas ROW index
- d.iloc[X]   <- lookup ROW by ROW integer position
- d[X]        <- lookup COL by COL index
- d.loc[X, Y] <- lookup by ROW index and COL index
- d.iloc[X, Y] <- lookup by ROW integer position and COL integer position

In [27]:
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


In [28]:
df.loc["A"] #ROW INDEX

Player name    Alice
Scores             6
Name: A, dtype: object

In [29]:
type(df.loc["A"])

pandas.core.series.Series

In [30]:
df.iloc[1] #ROW INTEGER POSITION

Player name    Bob
Scores           7
Name: B, dtype: object

In [31]:
df.iloc[-1] #ROW INTEGER POSITION

Player name    Dan
Scores           9
Name: D, dtype: object

In [33]:
df["Scores"] #COLUMN INDEX

A    6
B    7
C    8
D    9
Name: Scores, dtype: int64

In [34]:
#df["A"]  #KeyError: df[X] looks up a COLUMN, and "A" is a row index, not a column

In [35]:
df.loc["B", "Player name"] # ROW INDEX, COL INDEX

'Bob'

In [36]:
df.loc["B", "Scores"] # ROW INDEX, COL INDEX

7

In [37]:
df.iloc[-1, 0] # ROW Integer position, COL Integer position

'Dan'

## How to set values for a specific entry?

- d.loc[X, Y] = new_val <- set value by ROW INDEX and COL INDEX
- d.iloc[X, Y] = new_val <- set value by ROW Integer position and COL Integer position

In [38]:
df.loc["B","Scores"] = 12
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,12
C,Cindy,8
D,Dan,9


In [39]:
df.loc["B","Scores"] += 3
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,15
C,Cindy,8
D,Dan,9


In [40]:
df.iloc[-1,1] += 2
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,15
C,Cindy,8
D,Dan,11


## How to compute max score?

In [41]:
df["Scores"].max()

15

## How to compute mean score?

In [42]:
df["Scores"].mean()

10.0

## Slicing DataFrame

- df.iloc[ROW_SLICE, COL_SLICE] <- take a rectangular slice from the DataFrame using integer positions
- df.loc[ROW_SLICE, COL_SLICE] <- take a rectangular slice from the DataFrame using index

In [43]:
df.iloc[1:3, 1:]

Unnamed: 0,Scores
B,15
C,8


In [44]:
df.iloc[1:3, :]

Unnamed: 0,Player name,Scores
B,Bob,15
C,Cindy,8


In [45]:
df.loc["B":"C", :]

Unnamed: 0,Player name,Scores
B,Bob,15
C,Cindy,8


## How to set values for sliced DataFrame?

- d.loc[ROW_SLICE, COL_SLICE] = new_val <- set value by ROW INDEX and COL INDEX
- d.iloc[ROW_SLICE, COL_SLICE] = new_val <- set value by ROW Integer position and COL Integer position

In [46]:
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,15
C,Cindy,8
D,Dan,11


In [47]:
df.loc["B":"C", "Scores"] += 5
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,20
C,Cindy,13
D,Dan,11


## Instead of a slice, you could use a list of indexes or integer positions.

In [48]:
df.loc[["B", "D"],"Player name"]

B    Bob
D    Dan
Name: Player name, dtype: object

In [49]:
df.loc[["B", "D"],"Scores"] += 2

## Boolean indexing

### Series
- s[BOOL SERIES]  <- gets all s values lined up with True

### DataFrame
- d[BOOL SERIES]  <- pulls out rows lined up with True

In [50]:
df

Unnamed: 0,Player name,Scores
A,Alice,6
B,Bob,22
C,Cindy,13
D,Dan,13


In [51]:
df["Scores"]

A     6
B    22
C    13
D    13
Name: Scores, dtype: int64

In [52]:
b = df["Scores"] >= 15
b

A    False
B     True
C    False
D    False
Name: Scores, dtype: bool

In [53]:
df[b]

Unnamed: 0,Player name,Scores
B,Bob,22


## Creating DataFrame from csv

In [54]:
df = pd.read_csv("IMDB-Movie-Data.csv")
df

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
...,...,...,...,...,...,...,...,...,...
993,993,Secret in Their Eyes,"Crime,Drama,Mystery",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,0
994,994,Hostel: Part II,Horror,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,17.54
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0


## How to see first few lines of the DataFrame?

In [55]:
df.head()

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02


In [56]:
df.head(2)

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M


## How to see last few lines of the DataFrame?

In [57]:
df.tail()

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
993,993,Secret in Their Eyes,"Crime,Drama,Mystery",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,0.0
994,994,Hostel: Part II,Horror,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,17.54
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0.0
997,997,Nine Lives,"Comedy,Family,Fantasy",Barry Sonnenfeld,"Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...",2016,87,5.3,19.64


In [58]:
df.tail(3)

Unnamed: 0,Index,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
995,995,Step Up 2: The Streets,"Drama,Music,Romance",Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,58.01
996,996,Search Party,"Adventure,Comedy",Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,0.0
997,997,Nine Lives,"Comedy,Family,Fantasy",Barry Sonnenfeld,"Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...",2016,87,5.3,19.64


In [59]:
df["Year"]

0      2014
1      2012
2      2016
3      2016
4      2016
       ... 
993    2015
994    2007
995    2008
996    2014
997    2016
Name: Year, Length: 998, dtype: int64

In [60]:
df["Rating"]

0      8.1
1      7.0
2      7.3
3      7.2
4      6.2
      ... 
993    6.2
994    5.5
995    6.2
996    5.6
997    5.3
Name: Rating, Length: 998, dtype: float64

## Notice that there are two index columns
- That happened because when pandas writes a DataFrame to a csv file, it also writes the row index as an extra column by default
- So if the DataFrame already contains an index column, reading the file back gives you two index columns
- Let's fix that problem

### How can you use slicing to get rid of the first column?

In [61]:
better = df.iloc[:, 1:]
better.head()

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
3,Sing,"Animation,Comedy,Family",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,270.32
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02


### Wrong way to write a df to a csv file

In [62]:
better.to_csv("wrong_movies.csv")

### Correct way to write a df to a csv file

In [63]:
better.to_csv("better_movies.csv", index = False)

## What is the highest rated movie that had an above average runtime?

In [64]:
df = better
df["Runtime"]

0      121
1      124
2      117
3      108
4      123
      ... 
993    111
994     94
995     98
996     93
997     87
Name: Runtime, Length: 998, dtype: int64

In [65]:
df["Runtime"].mean()

113.17034068136273

In [66]:
b = df["Runtime"] > df["Runtime"].mean()
b

0       True
1       True
2       True
3      False
4       True
       ...  
993    False
994    False
995    False
996    False
997    False
Name: Runtime, Length: 998, dtype: bool

In [67]:
long_movies = df[b]
long_movies.head()

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,333.13
1,Prometheus,"Adventure,Mystery,Sci-Fi",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael ...",2012,124,7.0,126.46M
2,Split,"Horror,Thriller",M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,138.12M
4,Suicide Squad,"Action,Adventure,Fantasy",David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,325.02
6,La La Land,"Comedy,Drama,Music",Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,151.06M


In [68]:
long_movies["Rating"]

0      8.1
1      7.0
2      7.3
4      6.2
6      8.3
      ... 
977    7.6
979    5.3
980    7.4
987    7.5
989    8.5
Name: Rating, Length: 432, dtype: float64

In [69]:
long_movies["Rating"].max()

9.0

In [70]:
b = long_movies["Rating"] == long_movies["Rating"].max()
b

0      False
1      False
2      False
4      False
6      False
       ...  
977    False
979    False
980    False
987    False
989    False
Name: Rating, Length: 432, dtype: bool

In [71]:
long_movies[long_movies["Rating"] == long_movies["Rating"].max()]

Unnamed: 0,Title,Genre,Director,Cast,Year,Runtime,Rating,Revenue
54,The Dark Knight,"Action,Crime,Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart,Mi...",2008,152,9.0,533.32


## DataFrame.describe()
- provides a lot of useful stats
- by default, summarizes only the columns with numbers as values

In [72]:
stats = df.describe()
stats

Unnamed: 0,Year,Runtime,Rating
count,998.0,998.0,998.0
mean,2012.779559,113.170341,6.723447
std,3.207549,18.828877,0.945682
min,2006.0,66.0,1.9
25%,2010.0,100.0,6.2
50%,2014.0,111.0,6.8
75%,2016.0,123.0,7.4
max,2016.0,191.0,9.0


### How to get median runtime of all the movies?

In [73]:
stats.loc["50%", "Runtime"]

111.0