## Web scraping

In [6]:
# !pip install selenium
#!pip install webdriver_manager

## The following code runs a Python-controlled Chrome browser

### We will

### 1. Open the Wikipedia page for a person
### 2. Gather all the links in the page
### 3. Maintain a set of links we have visited, and visit only pages not yet in that set
### 4. Extract text from each page
### 5. Filter and clean the text

## Download chromedriver [here](https://googlechromelabs.github.io/chrome-for-testing)

Keep the chromedriver binary in the same directory as the Jupyter notebook, or set `chrome_driver_path` in the setup cell below to wherever you saved it.


In [21]:
import itertools
import re
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

## Add the path and user profile to chromedriver. You should add your own path. User profile is optional.
# These three values are machine-specific; edit them for your own setup.
chrome_driver_path = "/Users/hardik/hardik-projects/secretllm/05-data-collection/chromedriver-mac-arm64/chromedriver"
chromium_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
user_profile_path = "/Users/hardik/Library/Application Support/Google/Chrome/Default"

# Browser options: which Chrome binary to launch and which profile data to use
chrome_options = Options()
chrome_options.binary_location = chromium_path
# NOTE(review): Chrome's user-data-dir usually names the directory that
# *contains* the "Default" profile folder - confirm this path is intended.
chrome_options.add_argument(f"user-data-dir={user_profile_path}")

# Set up the Chrome driver.
# The Service is built from chrome_driver_path so that editing the variable
# above is enough to point the notebook at a different chromedriver binary.
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

In [22]:
# Load the seed article in the controlled browser; the following cells read
# the text and the links of whatever page is currently loaded.
driver.get("https://wikipedia.org/wiki/Dennis_Ritchie")


In [23]:
# Locate the element with id "mw-content-text" on the loaded page and
# print its visible text.
content_element = driver.find_element(By.ID, 'mw-content-text')
content_text = content_element.text
print(content_text)

Dennis Ritchie
Dennis Ritchie at the Japan Prize Foundation in May 2011
Born Dennis MacAlistair Ritchie
September 9, 1941
Bronxville, New York, U.S.
Died c. October 12, 2011 (aged 70)
Berkeley Heights, New Jersey, U.S.
Alma mater Harvard University (BS)
Known for ALTRAN
B
BCPL
C
Multics
Unix
Awards IEEE Emanuel R. Piore Award (1982)[1]
Turing Award (1983)
National Medal of Technology (1998)
IEEE Richard W. Hamming Medal (1990)
Computer Pioneer Award (1994)
Computer History Museum Fellow (1997)[2]
Harold Pender Award (2003)
Japan Prize (2011)
Scientific career
Fields Computer science
Institutions Lucent Technologies
Bell Labs
Doctoral advisor Patrick C. Fischer
Website bell-labs.com/usr/dmr/www
Dennis MacAlistair Ritchie (September 9, 1941 – c. October 12, 2011) was an American computer scientist.[3] He created the C programming language and, with long-time colleague Ken Thompson, the Unix operating system and B language.[3] Ritchie and Thompson were awarded the Turing Award from the As

## Find all URLs on the page

In [24]:
# Collect every anchor (<a>) element on the currently loaded page
anchor_elements = driver.find_elements(By.TAG_NAME, "a")
# Extract the href attribute from each anchor element.
# get_attribute returns None when an anchor has no href, so missing and
# empty values are dropped here instead of leaking into the URL set.
hrefs = (anchor.get_attribute("href") for anchor in anchor_elements)
urls = [href for href in hrefs if href]
# De-duplicate: the same target is usually linked many times on one page
url_set = set(urls)

In [25]:
from urllib.parse import urlparse, urlunparse

def remove_url_fragments(url_set):
  """Return the URLs from ``url_set`` with any ``#fragment`` part removed.

  URLs that differ only in their fragment collapse into a single entry.
  Entries that are ``None``, empty, not ``str``, or that consist of nothing
  but a fragment (e.g. ``"#top"``) are skipped.

  Args:
    url_set: iterable of URL strings; may contain ``None``.

  Returns:
    A new ``set`` of non-empty, fragment-free URL strings.
  """
  cleaned_urls = set()
  for url in url_set:
    # Skip missing hrefs explicitly rather than relying on how urlparse
    # happens to treat None.
    if not url or not isinstance(url, str):
      continue
    cleaned_url = urlunparse(urlparse(url)._replace(fragment=''))
    # A fragment-only link becomes '' once the fragment is removed.
    if cleaned_url:
      cleaned_urls.add(cleaned_url)
  return cleaned_urls

# Strip fragments from the links gathered above and show the resulting set
cleaned_url_set = remove_url_fragments(url_set)
print(cleaned_url_set)

{'https://en.wikipedia.org/wiki/Alfred_Aho', 'https://en.wikipedia.org/wiki/Frances_Allen', 'https://ro.wikipedia.org/wiki/Dennis_Ritchie', 'https://ja.wikipedia.org/wiki/%E3%83%87%E3%83%8B%E3%82%B9%E3%83%BB%E3%83%AA%E3%83%83%E3%83%81%E3%83%BC', 'https://en.wikipedia.org/wiki/Jerry_Saltzer', 'https://en.wikipedia.org/wiki/Benoit_Mandelbrot', 'https://en.wikipedia.org/wiki/Borland_Turbo_C', 'https://en.wikipedia.org/wiki/Richard_Hamming', 'https://en.wikipedia.org/wiki/Phoronix', 'https://en.wikipedia.org/wiki/File:Dennis_Ritchie_2011.jpg', 'https://en.wikipedia.org/wiki/Solomon_W._Golomb', 'https://en.wikipedia.org/wiki/Android_(operating_system)', 'https://en.wikipedia.org/wiki/Tiny_C_Compiler', 'https://en.wikipedia.org/wiki/Pat_Hanrahan', 'https://en.wikipedia.org/wiki/Nick_Holonyak', 'https://doi.org/10.1145%2F1283920.1283939', 'https://en.wikipedia.org/wiki/Lucent_Technologies', 'https://en.wikipedia.org/wiki/Erdal_Ar%C4%B1kan', 'https://de.wikipedia.org/wiki/Dennis_Ritchie', 'htt

### Visit only pages we have not visited before, and collect the text from each page

In [26]:
# Cap on the number of URLs taken from the set for this demonstration run;
# raise it for a larger crawl.
MAX_PAGES_TO_VISIT = 10

visited_urls = set()      # URLs already attempted in this crawl
page_text_dict = dict()   # URL -> visible text of that page's <body>
# Loop through the URLs
for url in itertools.islice(cleaned_url_set, MAX_PAGES_TO_VISIT):
    if url is None or url in visited_urls:
        continue
    try:
        # Visit the URL
        driver.get(url)
        # Extract the page text
        page_text = driver.find_element(By.TAG_NAME, "body").text
    except WebDriverException as exc:
        # A single unreachable or unreadable link should not abort the whole
        # crawl; record it as attempted and move on to the next URL.
        print(f"Skipping {url}: {exc.__class__.__name__}")
        visited_urls.add(url)
        continue
    # Store the page text in the dictionary
    page_text_dict[url] = page_text
    visited_urls.add(url)
        

In [27]:
# Show everything collected so far: the page texts first, then the URLs
# they were read from.
print(page_text_dict.values())
print(page_text_dict.keys())

dict_values(['Jump to content\nMain menu\nSearch\nDonate\nCreate account\nLog in\nPersonal tools\nContents hide\n(Top)\nCareer\nTeaching\nBooks\nReferences\nExternal links\nAlfred Aho\n32 languages\nArticle\nTalk\nRead\nEdit\nView history\nTools\nAppearance hide\nFrom Wikipedia, the free encyclopedia\nAlfred Aho\nBorn Alfred Vaino Aho\nAugust 9, 1941 (age 83)\nTimmins, Ontario, Canada\nNationality Canadian\nAmerican\nAlma mater\nUniversity of Toronto\nPrinceton University\nKnown for\nAwk programming language\nPrinciples of Compiler Design\nCompilers: Principles, Techniques, and Tools\nAho-Corasick algorithm\nAwards\nBell Labs Fellow (1984)\nFAAAS (1986)\nIEEE Fellow (1988)\nFACM (1996)\nIEEE John von Neumann Medal (2003)\nNAE Member\nNAS Member\nTuring Award (2020)\nScientific career\nFields Computer science\nInstitutions Columbia University\nThesis Indexed Grammars: An Extension of Context Free Grammars  (1968)\nDoctoral advisor John Hopcroft[1]\nDoctoral students\nKrysta Svore\nGaura

In [29]:
# Change the following line to print the page text for a different URL
collected_url = 'https://en.wikipedia.org/wiki/Alfred_Aho'
# The crawl only takes a slice of an unordered set, and the iteration order of
# a set of strings can differ between kernel sessions, so the URL above is not
# guaranteed to have been collected. Fall back to a page that was.
if collected_url not in page_text_dict:
    if not page_text_dict:
        raise RuntimeError("No pages were collected; run the crawl cell first.")
    collected_url = next(iter(page_text_dict))
    print(f"Requested URL was not collected; showing {collected_url} instead.")
print(page_text_dict[collected_url])

Jump to content
Main menu
Search
Donate
Create account
Log in
Personal tools
Contents hide
(Top)
Career
Teaching
Books
References
External links
Alfred Aho
32 languages
Article
Talk
Read
Edit
View history
Tools
Appearance hide
From Wikipedia, the free encyclopedia
Alfred Aho
Born Alfred Vaino Aho
August 9, 1941 (age 83)
Timmins, Ontario, Canada
Nationality Canadian
American
Alma mater
University of Toronto
Princeton University
Known for
Awk programming language
Principles of Compiler Design
Compilers: Principles, Techniques, and Tools
Aho-Corasick algorithm
Awards
Bell Labs Fellow (1984)
FAAAS (1986)
IEEE Fellow (1988)
FACM (1996)
IEEE John von Neumann Medal (2003)
NAE Member
NAS Member
Turing Award (2020)
Scientific career
Fields Computer science
Institutions Columbia University
Thesis Indexed Grammars: An Extension of Context Free Grammars  (1968)
Doctoral advisor John Hopcroft[1]
Doctoral students
Krysta Svore
Gaurav Kc
Marc Eaddy
Marcio Buss
Alfred Vaino Aho (born August 9, 1941) i

In [30]:
# Clean the extracted text

# Markers that contain punctuation. They have to be removed while the
# brackets still exist, i.e. before special characters are stripped.
_BRACKETED_MARKERS = ("[edit]", "(Top)")

# Navigation / boilerplate words and phrases to drop from the page text.
_BOILERPLATE_PHRASES = (
    "Jump to content",
    "Top",
    "Toggle",
    "subsection",
    "Main menu",
    "Search",
    "Donate",
    "Create account",
    "Log in",
    "Personal tools",
    "Contents hide",
    "History",
)

# Match the phrases only as whole words, so that longer words which merely
# contain one of them (e.g. "Topology") are left intact.
_BOILERPLATE_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(phrase) for phrase in _BOILERPLATE_PHRASES) + r")\b"
)


def clean_text(text):
    """Return ``text`` with boilerplate, punctuation and extra spaces removed.

    Steps, in order:
      1. collapse every run of whitespace (including newlines) to one space;
      2. remove the bracketed markers ``[edit]`` and ``(Top)``;
      3. delete every character that is neither alphanumeric nor whitespace;
      4. remove the boilerplate words/phrases, matched case-sensitively as
         whole words;
      5. collapse whitespace again so removed items leave no gaps.

    Args:
        text: raw page text.

    Returns:
        The cleaned text as a single line with single spaces between words
        and no leading or trailing whitespace.
    """
    # Collapse whitespace first so multi-word phrases match across line breaks
    text = " ".join(text.split())
    # Bracketed markers must go before step 3 deletes their brackets
    for marker in _BRACKETED_MARKERS:
        text = text.replace(marker, " ")
    # remove special characters
    text = "".join(ch for ch in text if ch.isalnum() or ch.isspace())
    # Remove navigation boilerplate
    text = _BOILERPLATE_PATTERN.sub(" ", text)
    # Removing items leaves gaps; normalise spacing one last time
    return " ".join(text.split())

# Replace every stored page text with its cleaned version; the keys (URLs)
# stay the same. Iterating over a snapshot of the keys keeps the loop
# independent of the dictionary being updated.
for url in list(page_text_dict):
    text = page_text_dict[url]
    page_text_dict[url] = clean_text(text)

In [31]:
# Print the entry for collected_url again; the dictionary now holds the
# cleaned text for every URL.
print(page_text_dict[collected_url])

         Career Teaching Books References External links Alfred Aho 32 languages Article Talk Read Edit View history Tools Appearance hide From Wikipedia the free encyclopedia Alfred Aho Born Alfred Vaino Aho August 9 1941 age 83 Timmins Ontario Canada Nationality Canadian American Alma mater University of Toronto Princeton University Known for Awk programming language Principles of Compiler Design Compilers Principles Techniques and Tools AhoCorasick algorithm Awards Bell Labs Fellow 1984 FAAAS 1986 IEEE Fellow 1988 FACM 1996 IEEE John von Neumann Medal 2003 NAE Member NAS Member Turing Award 2020 Scientific career Fields Computer science Institutions Columbia University Thesis Indexed Grammars An Extension of Context Free Grammars 1968 Doctoral advisor John Hopcroft1 Doctoral students Krysta Svore Gaurav Kc Marc Eaddy Marcio Buss Alfred Vaino Aho born August 9 1941 is a Canadian computer scientist best known for his work on programming languages compilers and related algorithms and h

### Now we store the extracted text

In [32]:
# Write one "URL: ... / Text: ..." record per collected page, separated by a
# blank line. The encoding is set explicitly because page text can contain
# non-ASCII characters and the platform default encoding may not be able to
# represent them.
with open('extracted.txt', 'w', encoding='utf-8') as f:
  for url, text in page_text_dict.items():
    f.write(f"URL: {url}\n")
    f.write(f"Text: {text}\n\n")

In [33]:
# Shut down the browser session now that scraping is finished
driver.quit()