In [None]:
# Warmup -1
# Did you hardcode the slashes in P10 rather than using os.path.join?
# There's no quicker way to get a 0 on your project! It won't run on the grader.
# Check your code and check the autograder ASAP.

In [None]:
# Warmup 0: We'll need these modules!
import requests
import json
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Warmup 1: Read the data from "new_movie_data.csv" into a pandas DataFrame called "movies"


In [None]:
# Warmup 2: What years does this new movie dataset cover?


In [None]:
# Warmup 3a: What does this function do?
def format_revenue(revenue):
    """Convert a revenue entry into a float.

    Parameters:
        revenue: either a float (treated as already converted and returned
            unchanged) or a string holding a number of millions, with or
            without a trailing "M" (e.g. "10M" or "2.5").

    Returns:
        The float passed in, or the parsed string multiplied by 1e6.

    Raises:
        ValueError: if a string cannot be parsed as a number.
    """
    # isinstance (rather than type(...) == float) also accepts float
    # subclasses such as numpy.float64, so re-running the conversion on an
    # already-converted column does not crash on revenue[-1].
    if isinstance(revenue, float):
        return revenue
    text = revenue.strip()  # tolerate stray whitespace such as "10M "
    if text.endswith('M'):  # some have an "M" at the end
        return float(text[:-1]) * 1e6
    # otherwise, assume the number is already expressed in millions
    return float(text) * 1e6

In [None]:
# Warmup 3b: Using the above function, create a new column called
#            "CountableRevenue" with the revenue as a float.


In [None]:
# Warmup 4: What are the top 10 highest-revenue movies?


# Web 1 - How to get data from the 'Net

Core Ideas:
 - Network structure
     - Client/Server
     - Request/Response
 - HTTP protocol
     - URL
     - Headers
     - Status Codes
 - The requests module

## HTTP Status Codes overview
- 1XX : Informational
- 2XX : Successful
- 3XX : Redirection
- 4XX : Client Error
- 5XX : Server Error

https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

## requests.get : Simple string example
- URL: https://www.msyamkumar.com/hello.txt

In [None]:
# Send one HTTP GET request and look at the two most useful parts of the response.
url = "https://www.msyamkumar.com/hello.txt"
r = requests.get(url) # r is the response
print(r.status_code)  # the numeric HTTP status code (see the overview above)
print(r.text)         # the body of the response as a string

In [None]:
# Q: What if the file we ask for does not exist on the server?
#    (Same host as before; only the end of the URL is misspelled.)
typo_url = "https://www.msyamkumar.com/hello.txttttt"
r = requests.get(typo_url)
print(r.status_code)  # compare this code with the status code overview above
print(r.text)         # a response can still carry a body even when the status is not 200

# A: 

In [None]:
# We can check for a status_code error by using an assert
typo_url = "https://www.msyamkumar.com/hello.txttttt"
r = requests.get(typo_url)
# Stops the cell with an AssertionError unless the status code is exactly 200.
assert r.status_code == 200
# The two prints below only run when the assert passes.
print(r.status_code)
print(r.text)


In [None]:
# Instead of using an assert, we often use raise_for_status()
# NOTE: typo_url is defined in an earlier cell; run that cell first.
r = requests.get(typo_url)
# Raises an exception when the status code is 4XX or 5XX. This is similar to,
# but not the same as, asserting r.status_code == 200: other 2XX codes pass.
r.raise_for_status()
r.text

# Note the error you get.... We will use this in the next cell

In [None]:
# Let's try to catch that error
# NOTE: typo_url is defined in an earlier cell; run that cell first.

try:
    r = requests.get(typo_url)
    r.raise_for_status()  # raises for the misspelled URL if the server answers 4XX/5XX
    print(r.text)
except HTTPError as e:
    # Deliberate first attempt: only the requests module was imported, so the
    # bare name HTTPError is not defined here. When raise_for_status() raises,
    # Python evaluates the name in the except clause and fails with a
    # NameError. That NameError is the error the following cell asks you to fix.
    print("oops!!", e)
    

In [None]:
# we often need to prepend the names of exceptions with the name of the module
# fix the error from above
# NOTE: typo_url is defined in an earlier cell; run that cell first.

try:
    r = requests.get(typo_url)
    r.raise_for_status()  # raises requests.HTTPError on a 4XX/5XX response
    print(r.text)
except requests.HTTPError as e:
    # Catch only the HTTP error, bound to e so its message can be printed;
    # any other kind of exception still stops the cell.
    print("oops!!", e)
    


## requests.get : JSON file example
- URL: https://www.msyamkumar.com/scores.json
- `json.load` (FILE_OBJECT)
- `json.loads` (STRING)

In [None]:
# GETting a JSON file, the long way
url = "https://www.msyamkumar.com/scores.json"
r = requests.get(url)
r.raise_for_status()      # stop here if the server answered with an error status
urltext = r.text          # the raw response body, still just a string
print(urltext)
d = json.loads(urltext)   # json.loads parses a STRING into Python objects
print(type(d), d)

In [None]:
# GETting a JSON file, the shortcut way
url = "https://www.msyamkumar.com/scores.json"
r = requests.get(url)
r.raise_for_status()      # stop here if the server answered with an error status
# Shortcut: r.json() parses the response body for us, so json.loads() is not needed
d2 = r.json()
print(type(d2), d2)

## Good GET Etiquette

Don't make a lot of requests to the same server all at once.
 - Requests use up the server's time
 - Major websites will often ban users who make too many requests
 - Too many requests can even take a server down, much like a denial-of-service (DoS) attack (DON'T DO THIS)
 
In CS220 we will usually give you a link to a copy of the file, so that we avoid overloading the original site.


## DEMO: Course Enrollment

Explore the API!

https://coletnelson.us/cs220-api/classes

https://coletnelson.us/cs220-api/classes_as_txt

https://coletnelson.us/cs220-api/classes/MATH_221

https://coletnelson.us/cs220-api/classes/COMPSCI_200

... etc

https://coletnelson.us/cs220-api/all_data

### Get the list of classes.

#### When the data is `json`

In [None]:
# Ask the API for the classes it knows about and parse the JSON response.
url = "https://coletnelson.us/cs220-api/classes"
r = requests.get(url)
r.raise_for_status()       # stop here if the server answered with an error status
classes_list = r.json()    # parsed JSON; the print below shows which Python type we got
print(type(classes_list))
print(classes_list)

#### When the data is `text`

In [None]:
# Same kind of request, but this endpoint is read as plain text, not parsed as JSON.
url = "https://coletnelson.us/cs220-api/classes_as_txt"
r = requests.get(url)
r.raise_for_status()       # stop here if the server answered with an error status
classes_txt = r.text       # the whole response body as one string
print(type(classes_txt))
print(classes_txt)

In [None]:
# Turn the single text blob into a list of strings, one element per line.
# NOTE(review): assumes classes_txt holds one class per line -- confirm
# against the text printed by the previous cell.
classes_txt_as_list = classes_txt.splitlines()

### Get data for a specific class

In [None]:
# Request the data for one specific class by putting its name at the end of the URL.
url = "https://coletnelson.us/cs220-api/classes/COMPSCI_200"
r = requests.get(url)
r.raise_for_status()       # stop here if the server answered with an error status
cs200_data = r.json()      # parsed JSON for this one class
print(type(cs200_data))
print(cs200_data) # Too much data? Try print(cs200_data.keys())

In [None]:
# Show just the top-level keys instead of the whole response.
# (assumes cs200_data is a dict -- check the type printed by the previous cell)
cs200_data.keys()

In [None]:
# Get the number of credits the course is worth


In [None]:
# Get the list of keywords for the course


In [None]:
# Get the official course name


In [None]:
# Get the number of sections offered.


In [None]:
# Collect all the class data in a list called 'all_class_data'
all_class_data = []
for class_num in classes_list:
    # One request per class: the class name is appended to the base URL.
    url = "https://coletnelson.us/cs220-api/classes/" + class_num
    r = requests.get(url)
    r.raise_for_status()  # stop at the first class whose request fails
    class_data = r.json()
    # Keep the parsed data for this class; the list ends up in the same
    # order as classes_list.
    all_class_data.append(class_data)

print(all_class_data) # Too much data? Try print(len(all_class_data))

In [None]:
# How many entries did we collect in all_class_data?
print(len(all_class_data))

In [None]:
# Print the number of credits, course number, and name for each class.


In [None]:
# What is the average number of credits per course?


In [None]:
# What are the unique subjects?


In [None]:
# Besides PSYCH 202, what are the course numbers of the courses
# with the most sections offered (not including subsections)?
high_courses = []    # presumably the answer: the courses tied for the most sections
high_sections = 0    # presumably the largest section count seen so far
for spec_class in all_class_data:
    pass  # TODO: compare this class's section count with high_sections and update both variables
high_courses

### Can we make a Pandas dataframe? Yes!

In [None]:
# Build a DataFrame from the list collected above.
# (assumes every element of all_class_data is a dict, so that keys become
#  columns and each class becomes one row -- confirm from the output)
all_course_frame = DataFrame(all_class_data)
all_course_frame

### We may want to do some "plumbing" with our data.

In [None]:
# Remove the 'sections' and 'requisites' columns.
# The label slice keeps every column from "credits" through "number".
# NOTE(review): this relies on the column order of all_course_frame, i.e. on
# 'sections' and 'requisites' lying outside that range -- confirm.
# .copy() makes new_course_frame an independent frame, so adding a column
# below neither triggers SettingWithCopyWarning nor touches all_course_frame.
new_course_frame = all_course_frame.loc[:, "credits":"number"].copy()
new_course_frame["subject"] = all_course_frame.loc[:, "subject"]
new_course_frame

In [None]:
# Convert the 'keywords' column to pandas strings, then strip the list
# punctuation left over from the conversion: '[', ']' and the single quotes.
# regex=False makes every replacement a plain, literal text substitution.
new_course_frame["keywords"] = (
    new_course_frame["keywords"]
    .astype("string")
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
)
new_course_frame

### Pandas Operations

In [None]:
# What is the largest number of credits that any course offers?


In [None]:
# What is the smallest number of credits that any course offers?


In [None]:
# What is the info for that course?


In [None]:
# What courses contain the keyword "programming"?


In [None]:
# Which course has the longest description?


### Write it out to a CSV file on your drive
You now have your own copy!

In [None]:
# Write it all out to a single CSV file
# The relative file name means the file is created in the current working directory.
# index=False leaves the DataFrame's row index out of the file.
new_course_frame.to_csv("my_course_data.csv", index=False)

### Other Cool APIs

- City of Madison Transit: http://transitdata.cityofmadison.com/
- Reddit: https://reddit.com/r/UWMadison.json
- Lord of the Rings: https://the-one-api.dev/
- Pokemon: https://pokeapi.co/

Remember: Be judicious when making requests; don't overwhelm the server! :)

## Next Time
What other documents can we get via the Web? HTML is very popular! We'll explore this.