# Web Data Scraping Project
## Sofía Aceves

In [2]:
import time 
import re
import sys
import requests
import numpy as np
import pandas as pd
from pathlib import Path

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

In [14]:
# Fetch the OER Commons landing page once and reuse that single response for
# both the status report and the parsed document.
try:
    # The timeout keeps the cell from waiting indefinitely and is what makes
    # the Timeout handler below reachable.
    response = requests.get('https://www.oercommons.org/', timeout=30)
except requests.exceptions.Timeout:
    print("Timeout error")
    raise  # nothing was downloaded, so there is nothing to parse below
except requests.exceptions.TooManyRedirects:
    print("Too many redirects error")
    raise
except requests.exceptions.SSLError:
    print("Security error: no security certificate")
    raise

# Report the outcome based on the HTTP status class.
if response.status_code < 300:
    print('request was successful')
elif 400 <= response.status_code < 500:
    print('request failed because the resource either does not exist or is forbidden')
else:
    print('request failed because the response server encountered an error')

oer_commons = response.content
resource_selection = BeautifulSoup(oer_commons, 'html')

request was successful


In [5]:
# Drive a real Chrome session: open the site, run the default search, expand
# the result list twice, and keep the rendered HTML in `page_source`.
url = 'https://www.oercommons.org/'
driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(2)  # fixed pause, presumably to let the landing page render
    # The locator names several classes at once, so it is written as the CSS
    # selector it really is (the find_element_by_* helpers no longer exist in
    # current Selenium releases).
    search_button = driver.find_element(By.CSS_SELECTOR, '.btn.btn-big-link.btn-search')
    search_button.click()
    time.sleep(10)
    # Each click on "load more" appends another batch of results to the page.
    for click_number in range(2):
        load_more_button = driver.find_element(By.CSS_SELECTOR, '.load-more-link.js-load-more')
        load_more_button.click()
        time.sleep(3)
    page_source = driver.page_source
finally:
    # Always end the browser session, even when one of the steps above fails.
    driver.quit()


# Final option (MVP): successful
#search_button = driver.find_element_by_class_name("btn.btn-big-link.btn-search")
#search_button.click()
#time.sleep(10)
#load_more_button = driver.find_element_by_class_name('load-more-link.js-load-more')
#load_more_button.click()
#time.sleep(3)
#load_more_button = driver.find_element_by_class_name('load-more-link.js-load-more')
#load_more_button.click()
#time.sleep(3)

# DESPERATE ATTEMPT: unsuccessful, raises "element not interactable"
#filter_button = driver.find_element_by_xpath("//input[@value='applied-science']")
#filter_button.click()
#page_source = driver.page_source
#driver.close()

# ATTEMPT 1: unsuccessful, raises "element not interactable: Element is not currently visible and may not be manipulated"
#elem = Select(driver.find_element_by_name('f.general_subject'))
#elem.select_by_value('applied-science')
#search_button = driver.find_element_by_class("btn btn-big-link btn-search")
#search_button.click()

# ATTEMPT 2: unsuccessful, raises "element not interactable"
#elem = driver.find_element_by_name('f.search')
#driver.execute_script("arguments[0].click();", elem)
#elem.send_keys("science")
#elem.send_keys(Keys.RETURN)
#assert "No results found." not in driver.page_source
#time.sleep(2)
#page_source = driver.page_source
#driver.close()

# ATTEMPT 3: unsuccessful, times out
#wait = WebDriverWait(driver, 20)
#element = wait.until(EC.element_to_be_clickable((By.NAME, 'f.general_subject')))
#actionChains = ActionChains(driver)
#actionChains.move_to_element(element).click().perform()

# ATTEMPT 4: unsuccessful, raises "element not interactable: [object HTMLSelectElement] has no size and location"
# elem = driver.find_element_by_name('f.general_subject')
# driver.implicitly_wait(10)
# ActionChains(driver).move_to_element(elem).click(elem).perform()

# ATTEMPT 5: unsuccessful, raises a timeout error
#elem = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//select[@name='f.general_subject']/option[@value='applied-science']")))
#elem.click()

# ATTEMPT 6: suggested by Pons, raises "element not interactable"
#subject_select = driver.find_element_by_xpath("//select[@name='f.general_subject']")
#subject_select.send_keys("Applied")

# OTHER ATTEMPTS:
#elem = Select(driver.find_element_by_name('f.general_subject'))
#elem.select_by_value('applied-science')
#elem.select_by_visible_text('Applied Science')
#print(elem.options)
#for opt in elem.options:
#            elem.select_by_value('applied-science')
#search_button = driver.find_element_by_class("btn btn-big-link btn-search")
#search_button.click()

#test_selection = BeautifulSoup(page_source, "html")
#test_selection

#dropdown1 = Select(driver.find_element_by_id('lang1'))
#dropdown1.select_by_visible_text('Java')

In [7]:
# Parse the HTML captured from the browser session so it can be queried with
# CSS selectors in the following cells.
resource_selection = BeautifulSoup(markup=page_source, features='html')
resource_selection

<html class="js canvas canvastext postmessage history backgroundsize borderradius boxshadow textshadow cssgradients video audio svg svgclippaths" lang="en"><head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="Yq6bU_Y9TiGwfN5U4awVziY7N89TOM8TzDLuSrTeg3c" name="google-site-verification"/>
<title>Search Results | OER Commons</title>
<meta content="OER Commons" property="og:site_name"/>
<meta content="Search Results" property="og:title"/>
<meta content="https://www.oercommons.org/static/newdesign/images/logo-hidpi-square.png?b9a5b8a31e3c" property="og:image"/>
<link href="/static/images/favicon.ico?5ea7dcf3fcd2" rel="shortcut icon"/>
<link href="//cdnjs.cloudflare.com/ajax/libs/cc-icons/1.2.1/css/cc-icons.min.css" rel="stylesheet"/>
<link href="/static/CACHE/css/93d100e6e2b4.css" rel="stylesheet" type="text/css"/>
<link href="/static/CACHE/css/683fc01f7179.css" rel="stylesheet" type="text/css"/>
<script async="" id="www-widgeta

In [8]:
# Collect the anchor of every listed resource and keep only its visible text.
resource_titles = resource_selection.select('a.item-link.js-item-link')
titles = [link.get_text() for link in resource_titles]
titles

['01.04.2020.pdf',
 '#01 Java Tutorial: Unser Hello World Programm',
 '#02 Java Tutorial: Methoden / Funktionen',
 '#03 Java Tutorial: Variablen',
 '#04 Java Tutorial: Schleifen / Loops',
 '#05 Java Tutorial: Fallunterscheidung / if',
 '#06 Java Tutorial: Klassen',
 '#07 Java Tutorial: Vererbung',
 '#08 Java Tutorial: Dynamische Datenstrukturen',
 '#09 Java Tutorial: Rekursion',
 '0-Kindergarten Eureka Math',
 '100 Free Web Tools for Elementary Teachers',
 '100 People: A World Portrait',
 '100 Word Memoir (OER Commons Version)',
 '100th Day of School',
 '100th Day of School Activities',
 '101 Ways To Kickstart Your Day In A Positive Way',
 '10.2 SQ 3. What points of view did Enlightenment Thinkers have about government?',
 '10 Amazing Science Tricks Using Liquid',
 '10 FRED Activities in 10 Minutes',
 '#10 Java Tutorial: Interfaces #neue Version',
 '#10 Randomized Synthesis Project',
 '10 Steps to Start Your Business',
 '10 Things You Can Do with ArcGIS Online and Story Maps',
 '10X Bi

In [36]:
# Info block of every listed resource; its text carries the "Subject:" label.
resource_subjects = resource_selection.select('dl.item-info.visible-md-block.visible-lg-block')
subjects = [re.findall('(?<=Subject:).*(?=Material)', subject.text) for subject in resource_subjects]
# Keep exactly one entry per resource (None when no subject was found) so the
# list stays position-aligned with the other columns when they are zipped.
subjects_clean = [found[0] if found else None for found in subjects]
subjects_clean

['Information Science',
 'Career and Technical Education',
 'Career and Technical Education',
 'Career and Technical Education',
 'Career and Technical Education',
 'Career and Technical Education',
 'Career and Technical Education',
 'Career and Technical Education',
 'Career and Technical Education',
 'Career and Technical Education',
 'Mathematics',
 'Education',
 'World CulturesWorld HistorySocial ScienceCultural Geography',
 'English Language ArtsComposition and RhetoricReading Literature',
 'Mathematics',
 'Mathematics',
 'Health, Medicine and NursingCommunicationEducationPsychology',
 'World History',
 'Applied Science',
 'Economics',
 'Career and Technical Education',
 'Computer Science',
 'Business and Communication',
 'Physical Geography',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Mathematics',
 'Life Science',
 'Education',
 

In [35]:
# Info block of every listed resource; its text carries the "Material Type:" label.
resource_material_type = resource_selection.select('dl.item-info.visible-md-block.visible-lg-block')
# The value ends at whichever label is matched after it: "Provider:", "Author" or "Date".
material_type = [re.findall('(?<=Material Type:).*(?=Provider:)|(?<=Material Type:).*(?=Author)|(?<=Material Type:).*(?=Date)', material.text) for material in resource_material_type]
# Keep exactly one entry per resource (None when nothing matched) so the list
# stays position-aligned with the other columns when they are zipped.
material_type_clean = [found[0] if found else None for found in material_type]
material_type_clean


['Unit of Study',
 'Lesson',
 'Unit of Study',
 'Lesson',
 'Lesson',
 'Lesson',
 'Lesson',
 'Lecture',
 'Lecture',
 'Lecture',
 'Activity/LabAssessment',
 'Reading',
 'Activity/LabDiagram/IllustrationInteractiveLessonReadingTeaching/Learning Strategy',
 'Homework/Assignment',
 'Interactive',
 'Activity/Lab',
 'Teaching/Learning Strategy',
 'Primary Source',
 'Lesson',
 'Activity/LabLessonLesson Plan',
 'Lecture',
 'Activity/LabLesson',
 'Full Course',
 'Activity/LabData Set',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabLesson PlanTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Strategy',
 'Activity/LabTeaching/Learning Str

In [33]:
# Info block of every listed resource; dates appear in it as dd/dd/dddd.
resource_date = resource_selection.select('dl.item-info.visible-md-block.visible-lg-block')
# Raw string so the regex escapes are not read as (invalid) string escapes.
dates = [re.findall(r'\d{2}\/\d{2}\/\d{4}', date.text) for date in resource_date]
# Keep exactly one entry per resource (the first date found, or None) so the
# list stays position-aligned with the other columns when they are zipped.
dates_clean = [found[0] if found else None for found in dates]
dates_clean

['05/29/2020',
 '06/16/2015',
 '06/18/2015',
 '06/18/2015',
 '06/18/2015',
 '06/18/2015',
 '06/18/2015',
 '01/01/2010',
 '01/01/2010',
 '01/01/2010',
 '04/07/2021',
 '10/17/2014',
 '01/31/2018',
 '05/11/2021',
 '10/11/2020',
 '02/16/2011',
 '07/31/2020',
 '08/29/2018',
 '02/01/2016',
 '09/11/2019',
 '01/01/2010',
 '09/23/2019',
 '10/09/2018',
 '10/30/2017',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '08/08/2019',
 '01/28/2016',
 '08/08/2019',
 '11/28/2017',
 '01/28/2016',
 '08/08/2019',
 '10/15/2014',
 '07/24/2008',
 '07/23/2018',
 '02/19/2020',
 '08/08/2019',
 '08/08/2019',
 '02/03/2018',
 '09/23/2019',
 '05/10/2019',
 '08/22/2019',
 '01/01/2010',
 '08/05/2019',
 '09/24/2020',
 '08/05/2019',
 '08/06/2019',
 '08/27/2019',
 '08/05/2019',
 '08/05/2019',
 '08/05/2019',
 '09/23/2019',
 '01/01/2010']

In [39]:
# Info block of every listed resource; the author text sits between the
# "Author:" and "Date" labels.
resource_author = resource_selection.select('dl.item-info.visible-md-block.visible-lg-block')
authors = [re.findall('(?<=Author:).*(?=Date)', author.text) for author in resource_author]
# Plain-string version of `authors`. Flattening with a nested comprehension
# silently dropped resources without an author; keeping one entry per resource
# (None when missing) preserves the positions.
authors_clean = [found[0] if found else None for found in authors]
authors

[['Dr. Bharat Singh Meena'],
 ['Jörg Amelunxen'],
 ['Jörg Amelunxen'],
 ['Jörg Amelunxen'],
 [],
 ['Jörg Amelunxen'],
 ['Jörg Amelunxen'],
 ['JavaWeb and more (Jörg Amelunxen)'],
 ['JavaWeb and more (Jörg Amelunxen)'],
 ['JavaWeb and more (Jörg Amelunxen)'],
 ['Liberty Public Schools'],
 ['John Costilla'],
 [],
 ['Sarah Lyons'],
 ['Drew Penn'],
 ['Terry Kawas'],
 ['Susan Spellman CannErin Luong'],
 [],
 [],
 ['Mark Bayles'],
 ['JavaWeb and more (Jörg Amelunxen)'],
 ['Boot up PD'],
 [],
 ['Joseph J. Kerski'],
 ['Konrad Z'],
 ['Nancy Edwards'],
 ['Damien Toh'],
 ['JR Dingwall'],
 ['Jody Bauer'],
 ['Sayak Bhattacharyya'],
 ['Robyn Vsetecka'],
 ['Raquel Vazquez'],
 ['Keith  Mann'],
 ['Anne  Collier'],
 ['Admin'],
 ['Laurie Wyatt'],
 ['Carmen Blackley'],
 ['John Kinney'],
 ['Crystal HurtBeth Kabes'],
 ['Christopher Arnett'],
 ['U.S. Department of EducationWNET'],
 ['Beth Kabes'],
 ['NAJMUDDDEEN ALHASSAN'],
 ['Douglas Hathaway'],
 ['Antonio Archetti'],
 ['Paul Hudson'],
 ['Boot up PD'],
 ['C

In [41]:
# Combine the per-resource lists column-wise into one table; zip stops at the
# shortest list.
educational_resources_df = pd.DataFrame(
    data=list(zip(titles, subjects_clean, material_type_clean, dates_clean, authors)),
    columns=['Title', 'Subject', 'Material Type', 'Date Added', 'Authors'],
)
educational_resources_df.head(25)

Unnamed: 0,Title,Subject,Material Type,Date Added,Authors
0,01.04.2020.pdf,Information Science,Unit of Study,05/29/2020,[Dr. Bharat Singh Meena]
1,#01 Java Tutorial: Unser Hello World Programm,Career and Technical Education,Lesson,06/16/2015,[Jörg Amelunxen]
2,#02 Java Tutorial: Methoden / Funktionen,Career and Technical Education,Unit of Study,06/18/2015,[Jörg Amelunxen]
3,#03 Java Tutorial: Variablen,Career and Technical Education,Lesson,06/18/2015,[Jörg Amelunxen]
4,#04 Java Tutorial: Schleifen / Loops,Career and Technical Education,Lesson,06/18/2015,[]
5,#05 Java Tutorial: Fallunterscheidung / if,Career and Technical Education,Lesson,06/18/2015,[Jörg Amelunxen]
6,#06 Java Tutorial: Klassen,Career and Technical Education,Lesson,06/18/2015,[Jörg Amelunxen]
7,#07 Java Tutorial: Vererbung,Career and Technical Education,Lecture,01/01/2010,[JavaWeb and more (Jörg Amelunxen)]
8,#08 Java Tutorial: Dynamische Datenstrukturen,Career and Technical Education,Lecture,01/01/2010,[JavaWeb and more (Jörg Amelunxen)]
9,#09 Java Tutorial: Rekursion,Career and Technical Education,Lecture,01/01/2010,[JavaWeb and more (Jörg Amelunxen)]


In [11]:
# your code here
import time 
import re
import sys
import requests
import numpy as np
import pandas as pd
from pathlib import Path

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

# Address of the OER Commons landing page; final_df reads this module-level name.
url = "https://www.oercommons.org/"

def review_url(url, timeout=30):
    """Request *url* and print a one-line description of the outcome.

    Args:
        url: Address to request with ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could wait indefinitely and the Timeout
            branch would never run.

    Returns:
        None. The result is only reported through ``print``.

    Timeouts, redirect loops and SSL failures are reported and swallowed;
    any other request exception propagates to the caller.
    """
    # Only the request itself can raise the exceptions handled here.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.Timeout:
        # The exception classes live on the `requests` package, not on the
        # response object (which does not even exist when the call fails).
        print("Timeout error")
        return
    except requests.exceptions.TooManyRedirects:
        print("Too many redirects error")
        return
    except requests.exceptions.SSLError:
        print("Security error: no security certificate")
        return

    if response.status_code < 300:
        print('request was successful')
    elif 400 <= response.status_code < 500:
        print('request failed because the resource either does not exist or is forbidden')
    else:
        print('request failed because the response server encountered an error')
        
def scrape_url(url, load_more_clicks=2):
    """Open *url* in Chrome, run the site search and return the rendered HTML.

    Args:
        url: Page to open in the browser.
        load_more_clicks: How many times the "load more" link is clicked
            after the search results appear. Defaults to 2.

    Returns:
        The browser's page source after the last click.

    The browser session is always shut down, even if a step fails.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(2)  # fixed pause, presumably to let the landing page render
        # The locator names several classes at once, so it is written as the
        # CSS selector it really is (the find_element_by_* helpers no longer
        # exist in current Selenium releases).
        driver.find_element(By.CSS_SELECTOR, '.btn.btn-big-link.btn-search').click()
        time.sleep(10)
        for _ in range(load_more_clicks):
            # Look the link up again on every pass, as the original steps did.
            driver.find_element(By.CSS_SELECTOR, '.load-more-link.js-load-more').click()
            time.sleep(3)
        return driver.page_source
    finally:
        driver.quit()

def _first_match(pattern, text):
    """Return the first regex match of *pattern* in *text*, or None if there is none."""
    match = re.search(pattern, text)
    return match.group(0) if match else None


def data_parser(page_source):
    """Extract one row per listed resource from the search-results HTML.

    Args:
        page_source: HTML text of the results page.

    Returns:
        A list of ``(title, subject, material_type, date, authors)`` tuples.
        ``subject``, ``material_type`` and ``date`` are strings, or None when
        the field is absent from a resource's info block; ``authors`` is the
        list returned by ``re.findall`` (empty when there is no author).
    """
    resource_selection = BeautifulSoup(page_source, 'html')
    titles_clean = [
        link.text for link in resource_selection.select('a.item-link.js-item-link')
    ]

    # Every field except the title is read from the same info block, so the
    # block is selected once and only its text is kept.
    info_texts = [
        block.text
        for block in resource_selection.select('dl.item-info.visible-md-block.visible-lg-block')
    ]

    # One value per info block: flattening the findall results would drop
    # blocks without a match and shift every later row against the titles.
    subjects_clean = [
        _first_match('(?<=Subject:).*(?=Material)', text) for text in info_texts
    ]
    material_type_clean = [
        _first_match(
            '(?<=Material Type:).*(?=Provider:)|(?<=Material Type:).*(?=Author)|(?<=Material Type:).*(?=Date)',
            text,
        )
        for text in info_texts
    ]
    # Raw string so the regex escapes are not read as (invalid) string escapes.
    dates_clean = [_first_match(r'\d{2}\/\d{2}\/\d{4}', text) for text in info_texts]
    # Authors stay as lists so resources without an author keep their slot.
    authors_clean = [re.findall('(?<=Author:).*(?=Date)', text) for text in info_texts]

    final_data = list(
        zip(titles_clean, subjects_clean, material_type_clean, dates_clean, authors_clean)
    )
    return final_data

def df_organization(final_data):
    """Build the educational-resources table from parsed rows.

    ``final_data`` is handed straight to ``pd.DataFrame``; its five fields
    are labelled, in order, Title, Subject, Material Type, Date Added and
    Authors.
    """
    column_labels = ['Title', 'Subject', 'Material Type', 'Date Added', 'Authors']
    return pd.DataFrame(data=final_data, columns=column_labels)
    

def final_df(path, page_url=None):
    """Run the whole pipeline and save the resulting table as a CSV file.

    Steps: report the HTTP status of the page, scrape it with the browser,
    parse the HTML into rows, build the DataFrame and write it to *path*.

    Args:
        path: Destination passed to ``DataFrame.to_csv``.
        page_url: Page to scrape. When omitted, the module-level ``url`` is
            used, which is what the function always did before.

    Returns:
        The DataFrame that was written to *path*.
    """
    target_url = url if page_url is None else page_url

    # review_url only prints a status report, so there is no result to keep.
    review_url(target_url)

    page_source = scrape_url(target_url)
    print ('successfull scraping')

    final_data = data_parser(page_source)
    print('successfull parsing')

    educational_resources_df = df_organization(final_data)
    print("successfull df")

    educational_resources_df.to_csv(path)

    return educational_resources_df

In [12]:
# Run the full pipeline; final_df writes the table to the CSV path given here
# and returns it.
# NOTE(review): the absolute Windows user path is machine-specific; confirm it
# exists before running this cell elsewhere.
final_educational_resources_df = final_df('C:/Users/Sofía Aceves Osuna/ironhack/project-data-extraction/educational_resources_oer_df.csv')

request was successful
successfull scraping
successfull parsing
successfull df
