In [1]:
# Stretch the notebook container to the full browser width so that wide
# Series/DataFrame output is not clipped.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Data alignment review (element-wise operation: series op series)

In [2]:
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Three small labelled Series. s1 and s2 hold the same label/value pairs,
# but s2 lists its labels in the opposite order (B before A).
x = Series({"A": 10, "B": 100})
s1 = Series({"A": 2, "B": 3})
s2 = Series({"B": 3, "A": 2})
# One print call, one Series per line block.
print(x, s1, s2, sep="\n")

A     10
B    100
dtype: int64
A    2
B    3
dtype: int64
B    3
A    2
dtype: int64


## What is x * s1?

In [4]:
# Show both operands, then multiply them element-wise.
# x and s1 list their labels in the same order (A, B).
print(x, s1, sep="\n")
x * s1

A     10
B    100
dtype: int64
A    2
B    3
dtype: int64


A     20
B    300
dtype: int64

## What is x * s2?

In [5]:
# s2 lists its labels as B, A. The product is matched up by label rather than
# by position, so the result is the same as x * s1.
print(x, s2, sep="\n")
x * s2

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A     20
B    300
dtype: int64

## What is x < s1?

In [6]:
# The `<` operator works here because x and s1 carry identical labels in the
# same order; the result is a boolean Series with one entry per label.
print(x, s1, sep="\n")
x < s1

A     10
B    100
dtype: int64
A    2
B    3
dtype: int64


A    False
B    False
dtype: bool

## What is x < s2?

In [7]:
print(x)
print(s2)
# Unlike arithmetic operators, comparison operators do not re-align the
# operands: both Series must be identically labelled, in the same order.
# x and s2 hold the same labels in a different order, so `x < s2` raises
# ValueError. The error is caught and printed so the demonstration is visible
# and the notebook still runs from top to bottom.
try:
    x < s2
except ValueError as err:
    print(f"ValueError: {err}")

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


## Oops, let's try series.lt(series)

In [8]:
# The method form of "less than" copes with s2's different label order.
print(x, s2, sep="\n")
x.lt(s2)  # lt = less than

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    False
B    False
dtype: bool

## How would you apply greater than?

In [9]:
# Method form of "greater than": every value in x exceeds its s2 partner.
print(x, s2, sep="\n")
x.gt(s2)  # gt = greater than

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    True
B    True
dtype: bool

## What about equal comparison?

In [10]:
# Method form of "==": no value in x equals its s2 partner.
print(x, s2, sep="\n")
x.eq(s2)  # eq = equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    False
B    False
dtype: bool

## ge (>=), le (<=), and ne (!=) are the other options

In [11]:
# Method form of ">=".
print(x, s2, sep="\n")
x.ge(s2)  # ge = greater than or equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    True
B    True
dtype: bool

In [12]:
# Method form of "<=".
print(x, s2, sep="\n")
x.le(s2)  # le = less than or equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    False
B    False
dtype: bool

In [13]:
# Method form of "!=".
print(x, s2, sep="\n")
x.ne(s2)  # ne = not equal

A     10
B    100
dtype: int64
B    3
A    2
dtype: int64


A    True
B    True
dtype: bool

## Can we fix the ordering of s2 series?
### Try series.sort_index()

In [14]:
# Show s2 in its original label order (B, A) for comparison.
print(s2)
# sort_index() hands back a Series ordered by label (A, B); it is stored under
# a new name and displayed as the cell's result.
s2_sorted = s2.sort_index()
s2_sorted

B    3
A    2
dtype: int64


A    2
B    3
dtype: int64

## DataFrame can be created from:
1. dict of Series
2. dict of lists
3. list of lists
4. dict of dicts
5. list of dicts

### DataFrame from dictionary of Series

In [15]:
# Two Series built from plain lists, so both carry the default 0..3 index.
col1 = Series(["Alice", "Bob", "Cindy", "Dan"])
col2 = Series([6, 7, 8, 9])
# Each dict key becomes a column label; the Series supply the column values
# and are lined up row by row on their shared index.
DataFrame({"Player name": col1, "Score": col2})

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from dictionary of lists

In [16]:
# Each key is a column label and each list holds that column's values from top
# to bottom; the rows receive the default integer index 0..3.
DataFrame({
    "Player name": ["Alice", "Bob", "Cindy", "Dan"],
    "Score": [6, 7, 8, 9]
})

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from list of lists

In [17]:
# Each inner list is one row: [name, score].
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
# No column labels are supplied, so the columns are labelled 0 and 1.
DataFrame(data)

Unnamed: 0,0,1
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### DataFrame from dictionary of dicts

In [18]:
# Outer keys are the column labels; the inner keys (0..3) become the row index.
data = {
    "Name": {0: "Alice", 1: "Bob", 2: "Cindy", 3: "Dan"},
    "Score": {0: 6, 1: 7, 2: 8, 3: 9}
}

DataFrame(data)

Unnamed: 0,Name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


In [19]:
# Same layout as the previous cell, but the inner keys are strings, so the
# rows are labelled A..D instead of 0..3.
data = {
    "Name": {"A": "Alice", "B": "Bob", "C": "Cindy", "D": "Dan"},
    "Score": {"A": 6, "B": 7, "C": 8, "D": 9}
}

DataFrame(data)

Unnamed: 0,Name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


### DataFrame from list of dicts

In [20]:
# Each dict is one row; its keys name the columns its values belong to.
data = [
    {"Player name":"Alice", "Score":6},
    {"Player name":"Bob", "Score":7},
    {"Player name":"Cindy", "Score":8},
    {"Player name":"Dan", "Score":9}
]
# Rows get the default integer index 0..3.
DataFrame(data)

Unnamed: 0,Player name,Score
0,Alice,6
1,Bob,7
2,Cindy,8
3,Dan,9


### Renaming the row index

In [21]:
# Row data only: labels for rows and columns are passed separately below.
data = [
    ["Alice", 6],
    ["Bob", 7],
    ["Cindy", 8],
    ["Dan", 9]
]
# index= labels the rows (A..D) and columns= labels the columns.
# The resulting frame is displayed but not bound to a name.
DataFrame(data, index = ["A","B","C","D"], columns = ["Player name", "Score"])

Unnamed: 0,Player name,Score
A,Alice,6
B,Bob,7
C,Cindy,8
D,Dan,9


## Data lookup

### Series
- s.loc[X]   <- lookup by pandas index
- s.iloc[X]  <- lookup by integer position
- s[X]       <- lookup by pandas index (the fallback to integer position is deprecated in recent pandas; use s.iloc[X] for positions)

In [22]:
# The dict keys become the Series index labels and the dict values its data.
# Note: this rebinds col1, which earlier held the Series of player names.
col1 = Series({"Alice":6, "Bob":7, "Cindy":8, "Dan":9})
col1

Alice    6
Bob      7
Cindy    8
Dan      9
dtype: int64

In [23]:
col1.loc["Bob"] #Series index

7

In [24]:
col1.iloc[2] #Series integer position

8

In [25]:
col1["Cindy"] #Series index

8

In [26]:
# Positional lookup: position 1 is the second entry ("Bob").
# Plain col1[1] only works through the integer-position fallback of s[X],
# which is deprecated in recent pandas releases when the index holds
# non-integer labels, so the position is requested explicitly with .iloc.
col1.iloc[1]

7

## Data lookup

### Series
- s.loc[X]   <- lookup by pandas index
- s.iloc[X]  <- lookup by integer position
- s[X]       <- lookup by pandas index (the fallback to integer position is deprecated in recent pandas; use s.iloc[X] for positions)

### DataFrame
- d.loc[X]    <- lookup ROW by pandas ROW index
- d.iloc[X]   <- lookup ROW by ROW integer position
- d[X]        <- lookup COL by COL index
- d.loc[X, Y] <- lookup by ROW index and COL index
- d.iloc[X, Y] <- lookup by ROW integer position and COL integer position

In [27]:
# Build the small players frame that the lookup, assignment, slicing and
# boolean-indexing cells below operate on. Rows are labelled A..D; the columns
# are "Player name" and "Scores", the labels those cells refer to.
df = DataFrame(
    [
        ["Alice", 6],
        ["Bob", 7],
        ["Cindy", 8],
        ["Dan", 9],
    ],
    index=["A", "B", "C", "D"],
    columns=["Player name", "Scores"],
)
df

In [None]:
df.loc["A"] #ROW INDEX

In [None]:
type(df.loc["A"])

In [None]:
df.iloc[1] #ROW INTEGER POSITION

In [None]:
df.iloc[-1] #ROW INTEGER POSITION

In [None]:
df["Scores"] #COLUMN INDEX

In [None]:
# df[X] looks X up among the COLUMN labels. "A" is a row label, not a column
# label, so the lookup raises KeyError. The error is caught and printed so the
# demonstration is visible and the notebook still runs from top to bottom.
try:
    df["A"]
except KeyError as err:
    print(f"KeyError: {err}")

In [None]:
df.loc["B", "Player name"] # ROW INDEX, COL INDEX

In [None]:
df.loc["B", "Scores"] # ROW INDEX, COL INDEX

In [None]:
df.iloc[-1, 0] # ROW Integer position, COL Integer position

## How to set values for a specific entry?

- d.loc[X, Y] = new_val <- set value by ROW INDEX and COL INDEX
- d.iloc[X, Y] = new_val <- set value by ROW Integer position and COL Integer position

In [None]:
df.loc["B","Scores"] = 12
df

In [None]:
df.loc["B","Scores"] += 3
df

In [None]:
df.iloc[-1,1] += 2
df

## How to compute max score?

In [None]:
df["Scores"].max()

## How to compute mean score?

In [None]:
df["Scores"].mean()

## Slicing DataFrame

- df.iloc[ROW_SLICE, COL_SLICE] <- take a rectangular slice from the DataFrame using integer positions
- df.loc[ROW_SLICE, COL_SLICE] <- take a rectangular slice from the DataFrame using index

In [None]:
df.iloc[1:3, 1:]

In [None]:
df.iloc[1:3, :]

In [None]:
df.loc["B":"C", :]

## How to set values for sliced DataFrame?

- d.loc[ROW_SLICE, COL_SLICE] = new_val <- set value by ROW INDEX and COL INDEX
- d.iloc[ROW_SLICE, COL_SLICE] = new_val <- set value by ROW Integer position and COL Integer position

In [None]:
df

In [None]:
# Add 5 to the "Scores" value of every row selected by the label slice
# "B":"C", then show the updated frame.
# Re-running this cell adds another 5 each time.
df.loc["B":"C", "Scores"] += 5
df

## Instead of a slice, you could use a list of indexes or integer positions.

In [None]:
df.loc[["B", "D"],"Player name"]

In [None]:
# Add 2 to the "Scores" value of rows B and D, selected with a list of labels
# instead of a slice, then show the updated frame as the other update cells do.
# Re-running this cell adds another 2 each time.
df.loc[["B", "D"],"Scores"] += 2
df

## Boolean indexing

### Series
- s[BOOL SERIES]  <- gets all s values lined up with True

### DataFrame
- d[BOOL SERIES]  <- pulls out rows lined up with True

In [None]:
df

In [None]:
df["Scores"]

In [None]:
# Compare every score with 15: the result is a boolean Series, one True/False
# per row, which the next cell passes to df[...] as a row mask.
b = df["Scores"] >= 15
b

In [None]:
df[b]

## Creating DataFrame from csv

In [None]:
# Load the movie table from a CSV file expected in the working directory.
# Note: this rebinds df, replacing the small players frame used above.
df = pd.read_csv("IMDB-Movie-Data.csv")
df

## How to see first few lines of the DataFrame?

In [None]:
df.head()

In [None]:
df.head(2)

## How to see last few lines of the DataFrame?

In [None]:
df.tail()

In [None]:
df.tail(3)

In [None]:
df["Year"]

In [None]:
df["Rating"]

## Notice that there are two index columns
- That happens because, by default, pandas writes the DataFrame's row index as an extra column when it saves a DataFrame to a csv file
- So if the DataFrame already contains an index column, the file ends up with two index columns
- Let's fix that problem

### How can you use slicing to get rid of the first column?

In [None]:
# Keep every row (":") and every column from integer position 1 onward,
# i.e. leave out the first column (position 0).
better = df.iloc[:, 1:]
better.head()

### Wrong way to write a df to a csv file

In [None]:
# Without index = False the row index is written to the file as an extra
# column, which is how the duplicated index column arises on re-reading.
better.to_csv("wrong_movies.csv")

### Correct way to write a df to a csv file

In [None]:
# index = False leaves the row index out of the file, so only the real
# columns are written.
better.to_csv("better_movies.csv", index = False)

## What is the highest rated movie that had an above average runtime?

In [None]:
# From here on df refers to the frame without the extra first column.
# This is a plain rebinding: df and better name the same object.
df = better
df["Runtime"]

In [None]:
df["Runtime"].mean()

In [None]:
# Boolean Series marking the movies whose runtime is above the mean runtime.
# Note: this rebinds b, which earlier held the players' score mask.
b = df["Runtime"] > df["Runtime"].mean()
b

In [None]:
# Keep only the rows where the runtime mask b is True.
long_movies = df[b]
long_movies.head()

In [None]:
long_movies["Rating"]

In [None]:
long_movies["Rating"].max()

In [None]:
# Boolean Series marking the long movies whose rating equals the highest
# rating among them (more than one row can be True if there is a tie).
# Note: b is rebound once more; it no longer holds the runtime mask.
b = long_movies["Rating"] == long_movies["Rating"].max()
b

In [None]:
# Answer to the question: select the long movies with the top rating.
# The mask is the same expression that the previous cell stored in b.
long_movies[long_movies["Rating"] == long_movies["Rating"].max()]

## DataFrame.describe()
- provides a lot of useful stats
- by default, summarizes only the columns with numbers as values

In [None]:
# describe() returns a DataFrame of summary statistics: one row per statistic
# and one column per summarized column of df.
stats = df.describe()
stats

### How to get median runtime of all the movies?

In [None]:
# The "50%" row holds the 50th percentile, i.e. the median, of each column.
stats.loc["50%", "Runtime"]