# Module 9

## Creating Web APIs with Flask

In [1]:
import requests
import json

In [58]:
### Sending a POST request

# Set the API endpoint URL
url = 'http://localhost:8001/api/scrape_wikipedia'

# Set the request headers
headers = {
    'Content-Type': 'application/json'
}

# Set the request data: the topic the endpoint should scrape
data = {
    "topic": "Data_engineering"
}

# Send the POST request to the API endpoint.
# The timeout makes the cell fail fast instead of hanging indefinitely when the
# local API server is not running or stops responding.
response = requests.post(url, headers=headers, json=data, timeout=30)
# Equivalent curl command:
# curl -X POST -H "Content-Type: application/json" -d '{"topic": "Data_engineering"}' http://localhost:8001/api/scrape_wikipedia

# Print the response status code and the decoded JSON body
print('Response Status Code:', response.status_code)
print('Response Content:', response.json())

Response Status Code: 200
Response Content: {'content': 'Data engineering refers to the building of systems to enable the collection and usage of data. This ', 'num_links': 362, 'title': 'Data engineering - Wikipedia'}


In [38]:
### Sending a DELETE request

# Set the API endpoint URL
url = 'http://localhost:8001/api/delete_topic'

# Set the request headers
headers = {
    'Content-Type': 'application/json'
}

# Set the request data: the topic to remove
data = {
    "topic": "Data_engineering"
}

# Send the DELETE request to the API endpoint.
# The timeout makes the cell fail fast instead of hanging indefinitely when the
# local API server is not running or stops responding.
response = requests.delete(url, headers=headers, json=data, timeout=30)
# Equivalent curl command:
# curl -X DELETE -H "Content-Type: application/json" -d '{"topic": "Data_engineering"}' http://localhost:8001/api/delete_topic

# Print the response status code and the decoded JSON body
print('Response Status Code:', response.status_code)
print('Response Content:', response.json())

Response Status Code: 404
Response Content: {'error': 'Topic not found'}


In [59]:
# Set the API endpoint URL
url = 'http://localhost:8001/api/add_data'

# Set the request headers
headers = {
    'Content-Type': 'application/json'
}

# Send the POST request to the API endpoint (no request body is sent).
# The timeout makes the cell fail fast instead of hanging indefinitely when the
# local API server is not running or stops responding.
response = requests.post(url, headers=headers, timeout=30)
# Equivalent curl command for this request:
# curl -X POST -H "Content-Type: application/json" http://localhost:8001/api/add_data
# NOTE(review): an earlier example for this endpoint sent a JSON body:
# curl -X POST -H "Content-Type: application/json" -d '{"Name": "John Doe","Age": 35,"City": "New York"}' http://localhost:8001/api/add_data
# This cell sends none -- confirm whether the endpoint expects one.

# Print the response status code and the decoded JSON body
print('Response Status Code:', response.status_code)
print('Response Content:', response.json())

Response Status Code: 200
Response Content: {'message': 'Data loaded successfully'}


In [60]:
### Sending a GET request

# Set the API endpoint URL
url = 'http://localhost:8001/api/get_data'

# Send the GET request to the API endpoint.
# The timeout makes the cell fail fast instead of hanging indefinitely when the
# local API server is not running or stops responding.
response = requests.get(url, timeout=30)

# Print the response status code and the decoded JSON body
print('Response Status Code:', response.status_code)
print(response.json())

Response Status Code: 200
{'Data_engineering': {'content': 'Data engineering refers to the building of systems to enable the collection and usage of data. This ', 'num_links': 362, 'title': 'Data engineering - Wikipedia'}}


## Web Scraping with Beautiful Soup 

In [5]:
import requests
from bs4 import BeautifulSoup

# Send a GET request to the website.
# The timeout keeps the cell from hanging indefinitely if the remote site is
# slow or unreachable.
url = 'https://en.wikipedia.org/wiki/Web_scraping'
response = requests.get(url, timeout=30)

In [6]:
# Display the HTTP status code of the response currently stored in `response`
response.status_code

200

In [7]:
# Preview the first 500 bytes of the raw response body
response.content[:500]

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-'

In [8]:
# Build a parse tree from the raw response bytes with the 'html.parser' backend
soup = BeautifulSoup(response.content, 'html.parser')

# Report the text of the page's <title> element
title = soup.find('title').text
print(f"Page title: {title}")

# Collect the href of every <a> tag (None for anchors without an href attribute)
links = [anchor.get('href') for anchor in soup.find_all('a')]
print(f"Number of links on the page: {len(links)}")

Page title: Web scraping - Wikipedia
Number of links on the page: 420


In [13]:
# Preview the first 100 characters of the page's first <p> element
first_paragraph = soup.find('p')
first_paragraph.text.strip()[:100]

'Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from '

## Web Scraping with Selenium
Note: This will not work in our Docker environment, but feel free to play around with it on your local machine.

In [None]:
# Uncomment to install Selenium into the notebook kernel's environment:
# %pip install selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from time import sleep

# initialize Chrome webdriver
driver = webdriver.Chrome()

# try/finally guarantees the browser is shut down even if one of the element
# lookups below raises; otherwise the Chrome process would be left running.
try:
    # navigate to Yelp homepage
    driver.get("https://www.yelp.com/")

    # find search bar element and input search query
    search_bar = driver.find_element(By.ID, "search_description")
    search_bar.clear()
    search_bar.send_keys("tacos")
    search_bar.send_keys(Keys.RETURN)

    # wait for search results to load
    sleep(5)

    # find all search result elements and loop through them
    results = driver.find_elements(By.CLASS_NAME, "") # <- Put in the appropriate class name
    for result in results:
        print(result.text)
finally:
    # close webdriver
    driver.quit()

## PostgreSQL Connection

In [None]:
# Uncomment to install psycopg2 into the notebook kernel's environment:
# %pip install psycopg2

In [16]:
import psycopg2

In [18]:
# https://www.psycopg.org/docs/usage.html

## Connect to Database
## NOTE(review): credentials are hardcoded here, presumably for the local course
## database -- load them from environment variables in anything shared or deployed.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    database="jhu",
    user="jhu",
    password="jhu123")

## `with conn` runs the statements as one transaction: it commits when the block
## succeeds and rolls back if any statement raises, so a failure does not leave
## the connection stuck in an aborted transaction. It does NOT close the
## connection, which later cells keep using.
## `with conn.cursor()` creates the cursor used to interface with psql and
## closes it when the block ends.
with conn:
    with conn.cursor() as cur:
        cur.execute(
            """
            CREATE SCHEMA IF NOT EXISTS test
            """)

        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS test.conn_test (
                        id SERIAL PRIMARY KEY,
                        col1 VARCHAR(25) NOT NULL,
                        col2 VARCHAR(100) NOT NULL,
                        col3 INTEGER)
            """)

        ## Inserting data
        ## NOTE: Use bound variables, never string formatting to prevent SQL Injection
        query = "INSERT INTO test.conn_test (col1, col2, col3) VALUES (%s, %s, %s)"
        cur.execute(query, ("some words", "some more text", 12))

In [19]:
## Create the wiki schema and table.
## The original cell never committed this DDL, so it was discarded when the
## connection was closed. `with conn` commits on success and rolls back if a
## statement raises; it leaves the connection open for later cells.
## `with conn.cursor()` closes the cursor when the block ends.
with conn:
    with conn.cursor() as cur:
        cur.execute(
            """
            CREATE SCHEMA IF NOT EXISTS wiki
            """)

        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS wiki.wiki (
                        id SERIAL PRIMARY KEY,
                        topic VARCHAR(25) NOT NULL,
                        title VARCHAR(256) NOT NULL,
                        content VARCHAR(100) NOT NULL,
                        num_links INTEGER)
            """)

In [20]:
## Read back every row of test.conn_test and display it as the cell output
cur = conn.cursor()
cur.execute("SELECT * FROM test.conn_test;")
rows = cur.fetchall()
rows

[(1, 'some words', 'some more text', 12)]

In [21]:
## Close communication with the PostgreSQL database server:
## release the cursor first, then the connection it was created from.
cur.close()
conn.close()