In [None]:
import pandas as pd
from pandas import Series, DataFrame
# We can explicitly import Series and DataFrame. Why might we do this?

###  Series Review


#### Series from `list`

In [None]:
# Start from an ordinary Python list and wrap it in a pandas Series.
scores_list = [54, 22, 19, 73, 80]
scores_series = Series(data=scores_list)
scores_series

# what name do we call the  0, 1, 2, ... ??       A:
# what name do we call the  54, 22, 19, .... ??   A:

#### Selecting certain scores.
What are all the scores `> 50`?

**Answer:** Boolean indexing. Try the following...

In [None]:
# A hand-written Boolean list: True keeps the value at that position, False drops it.
# This one keeps exactly the scores > 50 (54, 73 and 80).
scores_series[[True, False, False, True, True]] # often called a "mask"

We are really writing a "mask" for our data.

In [None]:
scores_series > 50

#### Series from `dict`

In [None]:
# Weekly hours for the students we hired: one Series per week, keyed by student name.
week1 = Series({"Rita": 5, "Therese": 3, "Janice": 6})
week2 = Series({"Rita": 3, "Therese": 7, "Janice": 4})
# week3 lists the same students in a different order -- will this matter?
week3 = Series({"Therese": 5, "Janice": 5, "Rita": 8})

for week in (week1, week2, week3):
    print(week)

####  For everyone in Week 1, add 3 to their hours 

In [None]:
# Adding a scalar to a Series adds it to every value in the Series.
# NOTE: this reassigns week1, so each re-run of this cell adds another 3.
week1 = week1 + 3
week1

#### Total up everyone's hours

In [None]:
# Series addition matches values by index label, not by position,
# so the different ordering of week3 is handled automatically.
total_hours = week1 + week2 + week3
total_hours

#### What is week1 / week3 ?

In [None]:
week1 / week3
# Notice that we didn't have to worry about the order of indices

#### What type of values are stored in  week1 > week2?

In [None]:
print(week1)
print(week2)
week1 > week2 # indices are ordered the same

####  What is week1 > week3?

In [None]:
print(week1)
print(week3)

# Unlike arithmetic, comparison operators do NOT line the two indices up:
# pandas refuses to compare Series whose labels are not in the same order.
# Catch the error so it is shown without stopping the notebook.
try:
    week1 > week3 # indices not in same order
except ValueError as error:
    print("ValueError:", error)

# proper way: put both indices in the same order first
week1.sort_index() > week3.sort_index()


# Lecture 28:  Pandas 2 - DataFrames


Learning Objectives:
- Create a DataFrame from 
 - a dictionary of Series, lists, or dicts
 - a list of Series, lists, or dicts
- Select a column, row, cell, or rectangular region of a DataFrame
- Convert CSV files into DataFrames and DataFrames into CSV Files
- Access the head or tail of a DataFrame

**Big Idea**: DataFrames store 2-dimensional data in tables! A DataFrame is a collection of Series.

## You can create a DataFrame in a variety of ways!
### From a dictionary of Series

In [None]:
# Two Series that share the default integer index; each one becomes a column.
names = Series(["Alice", "Bob", "Cindy", "Dan"])
scores = Series([6, 7, 8, 9])

# The dictionary keys supply the column names of the DataFrame.
column_series = {"Player name": names, "Score": scores}
DataFrame(column_series)

### From a dictionary of lists

In [None]:
name_list = ["Alice", "Bob", "Cindy", "Dan"]
score_list = [6, 7, 8, 9]

# Plain lists work as columns just like Series do (Series act like lists);
# the dictionary keys again supply the column names.
DataFrame({"Player name": name_list, "Score": score_list})

### From a dictionary of dictionaries
We need to make up keys to match the things in each column

In [None]:
# Each column is itself a dictionary mapping a row label to that row's value,
# so the inner keys (0..3) become the row index.
player_names = {0: "Alice", 1: "Bob", 2: "Cindy", 3: "Dan"}
player_scores = {0: 6, 1: 7, 2: 8, 3: 9}
data = {"Player name": player_names, "Score": player_scores}
DataFrame(data)

### From a list of lists
We have to add the column names ourselves; we do this with `columns = [name1, name2, ....]`

In [None]:
# Each inner list is one row. A list of lists carries no column names,
# so they are passed separately through `columns`.
data = [["Alice", 6], ["Bob", 7], ["Cindy", 8], ["Dan", 9]]
column_names = ["Player name", "Score"]
DataFrame(data, columns=column_names)

### From a list of dicts

In [None]:
# Each dictionary is one row; its keys name the columns that row's values belong to.
data = [
    {"Player name": player, "Score": points}
    for player, points in [("Alice", 6), ("Bob", 7), ("Cindy", 8), ("Dan", 9)]
]
DataFrame(data)

### Explicitly naming the indices
We can use `index = [name1, name2, ...]` to rename the index of each row

In [None]:
# 
# One label per row is required: four rows, so four index labels.
row_labels = ["A", "B", "C", "D"]
data = [
    {"Player name": name, "Score": score}
    for name, score in zip(["Alice", "Bob", "Cindy", "Dan"], [6, 7, 8, 9])
]
DataFrame(data, index=row_labels)

### Explicitly naming the columns

In [None]:
# The rows hold only values, so the column names are given through `columns`.
column_names = ["Player name", "Score"]
data = [["Alice", 6], ["Bob", 7], ["Cindy", 8], ["Dan", 9]]
DataFrame(data=data, columns=column_names)


In [None]:
# You try: 
# Make a DataFrame of 4 people you know with different ages
# Give names to both the columns and rows

# Share how you did this with your neighbor
# If you both did it the same way, try it a different way.

## Select a column, row, cell, or rectangular region of a DataFrame
### Data lookup: Series
- `s.loc[X]`   <- lookup by pandas index
- `s.iloc[X]`  <- lookup by integer position

In [None]:
# The names are the pandas index and the numbers are the values;
# this builds the same Series as passing a {name: hours} dictionary.
hours = Series([6, 7, 8, 9], index=["Alice", "Bob", "Cindy", "Dan"])
hours

In [None]:
# Lookup Bob's hours by pandas index.
hours.loc["Bob"]

In [None]:
# Lookup Bob's hours by integer position.
# Positions count from 0: Alice is 0, Bob is 1, Cindy is 2, Dan is 3.
hours.iloc[1]

In [None]:
# Lookup Cindy's hours by pandas index.


###  Data lookup: DataFrame


- `d.loc[r]`     lookup ROW by pandas ROW index
- `d.iloc[r]`    lookup ROW by ROW integer position
- `d[c]`         lookup COL by pandas COL index
- `d.loc[r, c]`  lookup by pandas ROW index and pandas COL index
- `d.iloc[r, c]`  lookup by ROW integer position and COL integer position

In [None]:
# `df` is the conventional name for a DataFrame object.
# Rows are labelled A-D and both columns are named explicitly.
data = [["Alice", 6], ["Bob", 7], ["Cindy", 8], ["Dan", 9]]
df = DataFrame(
    data,
    columns=["Player name", "Score"],
    index=["A", "B", "C", "D"],
)
df

### What are 3 different ways of accessing row D? 

### How about accessing a column?

### What are 3 different ways to access a single cell?

## How to set values for a specific entry?

- `d.loc[r, c] = new_val`
- `d.iloc[r, c] = new_val`

In [None]:
#change player D's name
df.loc["D", "Player name"] = "Bianca"
df

In [None]:
# then add 3 to that player's score using .loc


In [None]:
# add 7 to a different player's score using .iloc


### Find the max score and the mean score

In [None]:
# Pull the "Score" column out once, then report its max and its mean.
score_column = df["Score"]
print(score_column.max(), score_column.mean())

### Find the highest scoring player

##  Slicing a DataFrame

- `df.iloc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using integer positions
- `df.loc[ROW_SLICE, COL_SLICE]` <- make a rectangular slice from the DataFrame using index

In [None]:
df.iloc[1:3, 0:2]

In [None]:
df.loc["B":"C", "Player name":"Score"] # notice that this way is inclusive of endpoints

## Set values for sliced DataFrame

- `d.loc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW INDEX and COL INDEX
- `d.iloc[ROW_SLICE, COL_SLICE] = new_val` <- set value by ROW Integer position and COL Integer position

In [None]:
df

In [None]:
df.loc["B":"C", "Score"] += 5
df

### Pandas allows selecting non-contiguous rows and columns

In [None]:
# just get Player name for Index B and D
df.loc[["B", "D"],"Player name"]

In [None]:
# add 2 to the scores of the players in rows B and D
# (a list of row labels selects rows that need not be next to each other)
# NOTE: this changes df in place, so re-running this cell adds another 2.
df.loc[["B", "D"],"Score"] += 2
df

## Boolean indexing on a DataFrame

- `d[BOOL SERIES]`  <- makes a new DataFrame containing only the rows where the Boolean Series is True

In [None]:
df

### Make a Series of Booleans based on Score >= 15

In [None]:
b = df["Score"] >= 15
b

### use b to slice the DataFrame
if b is true, include this row in the new df

In [None]:
df[b]

### do the last two things in a single step

In [None]:
df[df["Score"] >= 15]

## Creating DataFrame from csv

In [None]:
# it's that easy!  
df = pd.read_csv("IMDB-Movie-Data.csv")
df

###   View the first few lines of the DataFrame
- `.head(n)` gets the first n lines, 5 is the default

In [None]:
df.head()

### get the first 2 rows

In [None]:
df.head(2)

###   View the last few lines of the DataFrame
- `.tail(n)` gets the last n lines, 5 is the default

In [None]:
df.tail()

In [None]:
df.tail(3)

### What is the last year in our DataFrame?

In [None]:
df["Year"].max()

### What are the rows that correspond to movies whose title contains "Harry" ? 


### What is the movie at index 6 ? 

## Notice that there are two index columns
- That happened because when you write a csv from pandas to a file, it writes a new index column
- So if the DataFrame already contains an index column, you are going to get two index columns
- Let's fix that problem

### How can you use slicing to get just columns with Title and Year?

In [None]:
df2 = ???
df2
# notice that this does not have the 'index' column

### How can you use slicing to get rid of the first column?

In [None]:
# `:` keeps every row; `1:` keeps the columns from integer position 1 onward,
# which leaves out the column at position 0.
# NOTE: this reassigns df, so re-running this cell drops one more column each time.
df = df.iloc[:, 1:] #all the rows, not column 0
df

### Write a df to a csv file

In [None]:
df.to_csv("better_movies.csv", index = False)

## Practice on your own.....Data Analysis with Data Frames


In [None]:
# What are all the movies that have above average run time? 
long_movies = ???
long_movies

In [None]:
# of these movies, what was the min rating? 
min_rating = ???
min_rating

In [None]:
# Which movies had this min rating?
???

### What are all long_movies with someone in the cast named "Emma" ? 

In [None]:
???

### What is the title of the shortest movie?

In [None]:
???

### What movie had the highest revenue?

In [None]:
# df["Revenue"].max() did not work
# we need to clean our data

def format_revenue(revenue):
    """Convert one raw revenue entry into a float.

    String entries are treated as a number of millions, written either
    with a trailing "M" (e.g. "2.5M") or as a bare number (e.g. "2.5");
    both are multiplied by 1e6.

    Parameters:
        revenue: a string such as "2.5M" or "2.5", or a float
            (for example a value that was already converted).

    Returns:
        The converted value as a float. Float inputs are returned unchanged.

    Raises:
        ValueError: if a string entry cannot be parsed as a number.
    """
    # Floats pass through untouched, so applying this function more than once
    # is safe. isinstance also accepts float subclasses (such as numpy.float64),
    # which an exact `type(revenue) == float` check would reject.
    if isinstance(revenue, float):
        return revenue
    if revenue.endswith("M"): # some have an "M" at the end
        return float(revenue[:-1]) * 1e6
    return float(revenue) * 1e6

In [None]:
# What movie had the highest revenue?
# Run format_revenue on every entry of the "Revenue" column.
revenue = df["Revenue"].apply(format_revenue)
print(revenue.head())
max_revenue = revenue.max()

# assign() returns a new DataFrame with the extra column; df itself is not modified
rev_df = df.assign(**{"Rev as fl": revenue})
rev_df

In [None]:
# Now we can answer the question!
???

In [None]:
# Or more generally...
rev_df.sort_values(by="Rev as fl", ascending=False)

In [None]:
df

In [None]:
# What is the average runtime for movies by "Francis Lawrence"?
???

### More complicated questions...

In [None]:
# Which director had the highest average rating?

# one way is to make a python dict of director, list of ratings
director_dict = dict()

# make the dictionary: key is director, value is list of ratings
# Walk the two columns in parallel rather than looking rows up with df.loc[i, ...]:
# .loc is a label lookup, so counting i from 0 only works while the row labels
# happen to be exactly 0, 1, ..., len(df) - 1.
for director, rating in zip(df["Director"], df["Rating"]):
    director_dict.setdefault(director, []).append(rating)

# make a ratings dict: key is director, value is average rating
# only include directors with > 4 movies
ratings_dict = {k: sum(v) / len(v) for (k, v) in director_dict.items() if len(v) > 4}

# sort a dict by values, highest average first
dict(sorted(ratings_dict.items(), key=lambda t: t[-1], reverse=True))
    

In [None]:
# FOR DEMONSTRATION PURPOSES ONLY
# We haven't (and will not) learn about "groupby"
# Pandas has many operations which will be helpful!

# Consider what you already know, and what Pandas can solve
# when formulating your solutions.
# Group the "Rating" values by director, keep only the directors with more
# than 4 ratings, and list their mean ratings from highest to lowest.
rating_groups = df.groupby("Director")["Rating"]
mean_by_director = rating_groups.mean()
has_enough_movies = rating_groups.count() > 4
mean_by_director[has_enough_movies].sort_values(ascending=False)