In [3]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import random

import os

import dotenv

In [1]:
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36 OPR/47.0.2631.39",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4",
    "My User Agent 1.0",
]

In [5]:
import requests
from bs4 import BeautifulSoup
import random
from pydantic import BaseModel
from typing import Optional
from datetime import date, datetime
import json


class BBCModel(BaseModel):
    url: str
    category: Optional[str] = "General"
    date: date
    header: Optional[str]
    body: Optional[str]


class BBC:

    def __init__(self, url, category="General"):
        self.url = url
        self.category = category
        self.date = date.today()
        self.header = None
        self.body = None
        self.soup = None
        self.scrape()

    def scrape(self):
        headers = {"User-Agent": random.choice(user_agents)}

        response = requests.get(self.url, headers=headers)
        self.soup = BeautifulSoup(response.content, "html.parser")
        self.header = self.get_header(self.soup)
        self.body = self.get_body(self.soup)

    def get_header(self, soup):
        # Adjust the CSS selector according to the actual HTML structure
        header_tag = soup.find("h1")
        if header_tag:
            return header_tag.get_text().strip()
        return None

    def get_body(self, soup):
        text_block_div = soup.find("div", {"data-component": "text-block"})
        if text_block_div:
            paragraphs = text_block_div.find_all("p")
            return " ".join([p.get_text().strip() for p in paragraphs])
        return None

    def get_url(self):
        return self.url

    def get_category(self):
        return self.category

    def to_json(self):
        date_datetime = datetime(self.date.year, self.date.month, self.date.day)
        return {
            "url": self.get_url(),
            "category": self.get_category(),
            "date": date_datetime,
            "header": self.get_header(self.soup),
            "body": self.get_body(self.soup),
        }

In [38]:
sjsu_page = "https://www.sjsu.edu/"

In [39]:
def scrape_page():
    print("Scraping Page...")
    url = "https://www.sjsu.edu/"
    headers = {"User-Agent": random.choice(user_agents)}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    with open("scraped_content.txt", "w", encoding="utf-8") as file:
        file.write(soup.get_text())

scrape_page()    

Scraping Page...


In [40]:
def scrape_links_from_page():
    print("Scraping Page...")
    url = "https://www.sjsu.edu/"
    headers = {"User-Agent": random.choice(user_agents)}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [a.get('href') for a in soup.find_all('a', href=True)]
    # return links
    print(len(links))
    # for link in links:
    #     print(link)
    return links
all_links= scrape_links_from_page()

Scraping Page...
161


In [41]:
def separate_links(links):
    http_links = [link for link in links if link.startswith('http')]
    non_http_links = [link for link in links if not link.startswith('http')]
    return http_links, non_http_links

http_links, non_http_links = separate_links(all_links)
print("HTTP Links: ", len(http_links))
print("Non-HTTP Links: ", len(non_http_links))

print("HTTP Links: ", non_http_links)


HTTP Links:  146
Non-HTTP Links:  15
HTTP Links:  ['#maincontent', '/', '/visit/index.php', '/admissions/index.php', '/giving/', '/online/index.php?utm_source=sjsumainmenu&utm_medium=web&utm_campaign=sjsuonline-navmenu-traffic', '/giving/', '/index.php', '/online/index.php?utm_source=sjsumainmenu-mobile&utm_medium=web&utm_campaign=sjsuonline-navmenu-traffic', '/about/', '', '/facts-and-accomplishments/', '/online/', 'tel:4089241000', '/online/index.php']


In [46]:
def add_prefix_to_links(links, prefix):
    return [prefix + link for link in links]

final_non_http_links = add_prefix_to_links(non_http_links, sjsu_page)

# print("Final Non-HTTP Links: ", final_non_http_links)


final_links = final_non_http_links + http_links
final_links = [link.replace("https://one.sjsu.edu", "").strip() for link in final_links]
final_links = [link for link in final_links if link]
# Save links to a text file
with open("final_links.txt", "w") as f:
    for link in final_links:
        f.write(link + "\n")

print("Final Links: ", len(final_links))

Final Links:  160


In [47]:
print("Final Links: ", final_links[59])

Final Links:  http://www.sjsu.edu/search/index.html


In [48]:
from tqdm import tqdm  # Import tqdm


def scrape_all_pages(links):
    if (links=="https://one.sjsu.edu"):
        return
    all_pages_content = []
    for link in tqdm(links, desc="Scraping Pages"):  # Wrap links with tqdm
        print("Scraping Page: ", link)
        if (link=="https://one.sjsu.edu"):
            return
        headers = {"User-Agent": random.choice(user_agents)}
        response = requests.get(link, headers=headers)
        soup = BeautifulSoup(response.content, "html.parser")
        all_pages_content.append(soup)
    return all_pages_content


all_pages_content = scrape_all_pages(final_links)

# Save the scraped content to a text file
with open("scraped_pages_content.txt", "w", encoding="utf-8") as file:
    for page in all_pages_content:
        file.write(page.get_text() + "\n\n")

Scraping Pages:   0%|          | 0/160 [00:00<?, ?it/s]

Scraping Page:  https://www.sjsu.edu/#maincontent


Scraping Pages:   1%|▏         | 2/160 [00:00<00:31,  4.96it/s]

Scraping Page:  https://www.sjsu.edu//
Scraping Page:  https://www.sjsu.edu//visit/index.php


Scraping Pages:   2%|▏         | 3/160 [00:00<00:25,  6.04it/s]

Scraping Page:  https://www.sjsu.edu//admissions/index.php


Scraping Pages:   3%|▎         | 5/160 [00:00<00:31,  5.00it/s]

Scraping Page:  https://www.sjsu.edu//giving/


Scraping Pages:   4%|▍         | 6/160 [00:01<00:30,  5.13it/s]

Scraping Page:  https://www.sjsu.edu//online/index.php?utm_source=sjsumainmenu&utm_medium=web&utm_campaign=sjsuonline-navmenu-traffic


Scraping Pages:   4%|▍         | 7/160 [00:01<00:25,  5.91it/s]

Scraping Page:  https://www.sjsu.edu//giving/
Scraping Page:  https://www.sjsu.edu//index.php


Scraping Pages:   6%|▌         | 9/160 [00:01<00:30,  5.01it/s]

Scraping Page:  https://www.sjsu.edu//online/index.php?utm_source=sjsumainmenu-mobile&utm_medium=web&utm_campaign=sjsuonline-navmenu-traffic


Scraping Pages:   6%|▋         | 10/160 [00:01<00:25,  5.80it/s]

Scraping Page:  https://www.sjsu.edu//about/
Scraping Page:  https://www.sjsu.edu/


Scraping Pages:   8%|▊         | 12/160 [00:02<00:26,  5.49it/s]

Scraping Page:  https://www.sjsu.edu//facts-and-accomplishments/
Scraping Page:  https://www.sjsu.edu//online/


Scraping Pages:   9%|▉         | 14/160 [00:02<00:30,  4.82it/s]

Scraping Page:  https://www.sjsu.edu/tel:4089241000


Scraping Pages:   9%|▉         | 15/160 [00:02<00:27,  5.22it/s]

Scraping Page:  https://www.sjsu.edu//online/index.php
Scraping Page:  https://www.sjsu.edu/visit/index.php


Scraping Pages:  11%|█         | 17/160 [00:03<00:24,  5.88it/s]

Scraping Page:  https://www.sjsu.edu/soar/services/campus-tours.php
Scraping Page:  http://www.sjsu.edu/map


Scraping Pages:  11%|█▏        | 18/160 [00:03<00:26,  5.36it/s]

Scraping Page:  https://www.sjsu.edu/parking/


Scraping Pages:  12%|█▎        | 20/160 [00:03<00:32,  4.35it/s]

Scraping Page:  https://www.sjsu.edu/visit/silicon-valley.php
Scraping Page:  https://hammertheatre.com/


Scraping Pages:  14%|█▍        | 22/160 [00:04<00:31,  4.40it/s]

Scraping Page:  https://www.sjsu.edu/sjsulovessj/
Scraping Page:  https://www.sjsu.edu/academics/index.php


Scraping Pages:  14%|█▍        | 23/160 [00:04<00:27,  4.89it/s]

Scraping Page:  https://www.sjsu.edu/academics/colleges-and-departments.php
Scraping Page:  https://catalog.sjsu.edu/content.php?catoid=14&navoid=5107


Scraping Pages:  16%|█▌        | 25/160 [00:05<00:47,  2.84it/s]

Scraping Page:  https://www.sjsu.edu/classes/calendar/index.php


Scraping Pages:  17%|█▋        | 27/160 [00:06<00:37,  3.54it/s]

Scraping Page:  https://www.sjsu.edu/classes/index.php
Scraping Page:  http://library.sjsu.edu/


Scraping Pages:  18%|█▊        | 29/160 [00:06<00:39,  3.28it/s]

Scraping Page:  https://www.sjsu.edu/admissions/index.php


Scraping Pages:  19%|█▉        | 30/160 [00:06<00:33,  3.84it/s]

Scraping Page:  https://www.sjsu.edu/tuition-and-fees/index.php
Scraping Page:  https://www.sjsu.edu/housing-options/index.php


Scraping Pages:  19%|█▉        | 31/160 [00:07<00:29,  4.33it/s]

Scraping Page:  https://www.sjsu.edu/professional/


Scraping Pages:  20%|██        | 32/160 [00:07<00:35,  3.58it/s]

Scraping Page:  https://www.sjsu.edu/global/


Scraping Pages:  21%|██▏       | 34/160 [00:07<00:30,  4.17it/s]

Scraping Page:  https://www.sjsu.edu/parent-and-family-programs/index.php
Scraping Page:  https://www.sjsu.edu/studentaffairs/index.php


Scraping Pages:  22%|██▏       | 35/160 [00:08<00:31,  3.99it/s]

Scraping Page:  https://www.sjsu.edu/campus-life/health-and-wellness.php
Scraping Page:  https://www.sjsu.edu/campus-life/safety.php


Scraping Pages:  24%|██▍       | 38/160 [00:08<00:23,  5.26it/s]

Scraping Page:  https://www.sjsu.edu/sjsucares/
Scraping Page:  https://www.sjsu.edu/campus-life/events.php


Scraping Pages:  25%|██▌       | 40/160 [00:08<00:22,  5.29it/s]

Scraping Page:  https://www.sjsu.edu/studentaffairs/experience-campus/index.php
Scraping Page:  https://www.sjsu.edu/campus-life/traditions.php


Scraping Pages:  26%|██▋       | 42/160 [00:09<00:20,  5.65it/s]

Scraping Page:  https://www.sjsu.edu/campus-life/sac.php
Scraping Page:  https://www.sjsu.edu/about/research/


Scraping Pages:  28%|██▊       | 44/160 [00:09<00:18,  6.31it/s]

Scraping Page:  https://www.sjsu.edu/research/
Scraping Page:  https://www.sjsu.edu/innovation/


Scraping Pages:  28%|██▊       | 45/160 [00:09<00:16,  6.81it/s]

Scraping Page:  https://www.sjsu.edu/researchfoundation/


Scraping Pages:  29%|██▉       | 47/160 [00:10<00:28,  3.92it/s]

Scraping Page:  https://www.sjsu.edu/about/research/student-faculty-collaborations.php
Scraping Page:  https://www.sjsu.edu/about/research/become-a-research-partner.php


Scraping Pages:  31%|███       | 49/160 [00:10<00:22,  4.95it/s]

Scraping Page:  https://www.sjsu.edu/research/about/cci/index.php
Scraping Page:  https://www.sjsu.edu/about/index.php


Scraping Pages:  32%|███▏      | 51/160 [00:11<00:19,  5.72it/s]

Scraping Page:  https://www.sjsu.edu/about/administration-and-leadership.php
Scraping Page:  https://www.sjsu.edu/facts-and-accomplishments/index.php


Scraping Pages:  32%|███▎      | 52/160 [00:11<00:18,  5.81it/s]

Scraping Page:  https://blogs.sjsu.edu/newsroom/


Scraping Pages:  34%|███▍      | 54/160 [00:11<00:20,  5.14it/s]

Scraping Page:  https://www.sjsu.edu/about/partnerships.php
Scraping Page:  https://www.sjsu.edu/strategicplan/


Scraping Pages:  34%|███▍      | 55/160 [00:12<00:35,  2.95it/s]

Scraping Page:  https://www.sjsu.edu/transform/


Scraping Pages:  35%|███▌      | 56/160 [00:13<00:49,  2.10it/s]

Scraping Page:  http://www.sjsuspartans.com


Scraping Pages:  36%|███▌      | 57/160 [00:13<00:51,  1.99it/s]

Scraping Page:  https://www.sjsu.edu/alumni/index.php


Scraping Pages:  36%|███▋      | 58/160 [00:14<00:44,  2.29it/s]

Scraping Page:  https://sjsu.instructure.com


Scraping Pages:  37%|███▋      | 59/160 [00:15<01:08,  1.48it/s]

Scraping Page:  http://www.sjsu.edu/search/index.html


Scraping Pages:  38%|███▊      | 60/160 [00:15<00:54,  1.84it/s]

Scraping Page:  https://www.sjsu.edu/visit/index.php


Scraping Pages:  39%|███▉      | 62/160 [00:15<00:36,  2.68it/s]

Scraping Page:  https://www.sjsu.edu/soar/services/campus-tours.php


Scraping Pages:  39%|███▉      | 63/160 [00:16<00:31,  3.05it/s]

Scraping Page:  http://www.sjsu.edu/map
Scraping Page:  https://www.sjsu.edu/parking/


Scraping Pages:  40%|████      | 64/160 [00:16<00:29,  3.20it/s]

Scraping Page:  https://www.sjsu.edu/visit/silicon-valley.php


Scraping Pages:  41%|████      | 65/160 [00:16<00:29,  3.24it/s]

Scraping Page:  https://hammertheatre.com/


Scraping Pages:  42%|████▏     | 67/160 [00:17<00:26,  3.53it/s]

Scraping Page:  https://www.sjsu.edu/sjsulovessj/


Scraping Pages:  42%|████▎     | 68/160 [00:17<00:22,  4.08it/s]

Scraping Page:  https://www.sjsu.edu/academics/index.php
Scraping Page:  https://www.sjsu.edu/academics/colleges-and-departments.php


Scraping Pages:  43%|████▎     | 69/160 [00:17<00:19,  4.57it/s]

Scraping Page:  https://catalog.sjsu.edu/content.php?catoid=14&navoid=5107


Scraping Pages:  44%|████▍     | 71/160 [00:18<00:28,  3.15it/s]

Scraping Page:  https://www.sjsu.edu/classes/calendar/index.php
Scraping Page:  https://www.sjsu.edu/classes/index.php


Scraping Pages:  45%|████▌     | 72/160 [00:18<00:23,  3.68it/s]

Scraping Page:  http://library.sjsu.edu/


Scraping Pages:  46%|████▌     | 73/160 [00:19<00:28,  3.07it/s]

Scraping Page:  https://www.sjsu.edu/admissions/index.php


Scraping Pages:  47%|████▋     | 75/160 [00:19<00:20,  4.13it/s]

Scraping Page:  https://www.sjsu.edu/tuition-and-fees/index.php
Scraping Page:  https://www.sjsu.edu/housing-options/index.php


Scraping Pages:  48%|████▊     | 76/160 [00:19<00:17,  4.91it/s]

Scraping Page:  https://www.sjsu.edu/professional/


Scraping Pages:  48%|████▊     | 77/160 [00:19<00:20,  4.00it/s]

Scraping Page:  https://www.sjsu.edu/global/


Scraping Pages:  49%|████▉     | 79/160 [00:20<00:21,  3.81it/s]

Scraping Page:  https://www.sjsu.edu/parent-and-family-programs/index.php
Scraping Page:  https://www.sjsu.edu/studentaffairs/index.php


Scraping Pages:  51%|█████     | 81/160 [00:20<00:15,  5.08it/s]

Scraping Page:  https://www.sjsu.edu/campus-life/health-and-wellness.php
Scraping Page:  https://www.sjsu.edu/campus-life/safety.php


Scraping Pages:  52%|█████▏    | 83/160 [00:21<00:19,  4.05it/s]

Scraping Page:  https://www.sjsu.edu/sjsucares/
Scraping Page:  https://www.sjsu.edu/campus-life/events.php


Scraping Pages:  52%|█████▎    | 84/160 [00:21<00:15,  4.80it/s]

Scraping Page:  https://www.sjsu.edu/studentaffairs/experience-campus/index.php


Scraping Pages:  54%|█████▍    | 86/160 [00:22<00:16,  4.46it/s]

Scraping Page:  https://www.sjsu.edu/campus-life/traditions.php
Scraping Page:  https://www.sjsu.edu/campus-life/sac.php


Scraping Pages:  55%|█████▌    | 88/160 [00:22<00:14,  4.99it/s]

Scraping Page:  https://www.sjsu.edu/about/research/


Scraping Pages:  56%|█████▌    | 89/160 [00:22<00:13,  5.08it/s]

Scraping Page:  https://www.sjsu.edu/research/


Scraping Pages:  56%|█████▋    | 90/160 [00:22<00:13,  5.30it/s]

Scraping Page:  https://www.sjsu.edu/innovation/
Scraping Page:  https://www.sjsu.edu/researchfoundation/


Scraping Pages:  57%|█████▊    | 92/160 [00:23<00:13,  5.17it/s]

Scraping Page:  https://www.sjsu.edu/about/research/student-faculty-collaborations.php
Scraping Page:  https://www.sjsu.edu/about/research/become-a-research-partner.php


Scraping Pages:  58%|█████▊    | 93/160 [00:23<00:12,  5.47it/s]

Scraping Page:  https://www.sjsu.edu/research/about/cci/index.php


Scraping Pages:  59%|█████▉    | 94/160 [00:23<00:14,  4.62it/s]

Scraping Page:  https://sjsuspartans.com/


Scraping Pages:  60%|██████    | 96/160 [00:24<00:12,  4.96it/s]

Scraping Page:  https://www.sjsu.edu/about/index.php
Scraping Page:  https://www.sjsu.edu/about/administration-and-leadership.php


Scraping Pages:  61%|██████▏   | 98/160 [00:24<00:11,  5.31it/s]

Scraping Page:  https://www.sjsu.edu/facts-and-accomplishments/index.php
Scraping Page:  https://blogs.sjsu.edu/newsroom/


Scraping Pages:  62%|██████▎   | 100/160 [00:24<00:11,  5.28it/s]

Scraping Page:  https://www.sjsu.edu/about/partnerships.php
Scraping Page:  https://www.sjsu.edu/strategicplan/


Scraping Pages:  64%|██████▍   | 102/160 [00:25<00:09,  5.96it/s]

Scraping Page:  https://www.sjsu.edu/transform/
Scraping Page:  https://www.sjsu.edu/naissc/events/native-american-heritage-month.php


Scraping Pages:  64%|██████▍   | 103/160 [00:25<00:10,  5.59it/s]

Scraping Page:  https://blogs.sjsu.edu/newsroom/2024/wall-street-journal-ranks-sjsu-the-4-public-school-in-the-country/


Scraping Pages:  65%|██████▌   | 104/160 [00:25<00:11,  4.84it/s]

Scraping Page:  https://blogs.sjsu.edu/newsroom/2024/san-jose-state-launches-the-college-of-information-data-and-society-ids/


Scraping Pages:  66%|██████▌   | 105/160 [00:26<00:14,  3.67it/s]

Scraping Page:  https://blogs.sjsu.edu/newsroom/2024/first-year-writing-program-presents-the-inaugural-digital-literacy-expo/


Scraping Pages:  67%|██████▋   | 107/160 [00:27<00:28,  1.85it/s]

Scraping Page:  https://blogs.sjsu.edu/newsroom/2024/sjsu-named-fulbright-hsi-leader-by-the-u-s-department-of-state-for-fourth-year-running/
Scraping Page:  https://blogs.sjsu.edu/newsroom/2024/san-jose-state-joins-aqueous-battery-consortium-for-clean-energy-storage/


Scraping Pages:  68%|██████▊   | 108/160 [00:28<00:24,  2.15it/s]

Scraping Page:  http://blogs.sjsu.edu/newsroom/


Scraping Pages:  68%|██████▊   | 109/160 [00:29<00:30,  1.69it/s]

Scraping Page:  https://events.sjsu.edu/event/womens-basketball-san-jose-state-vs-san-francisco-st


Scraping Pages:  69%|██████▉   | 110/160 [00:29<00:33,  1.50it/s]

Scraping Page:  https://events.sjsu.edu/event/writing-workshop-writing-concisely-fall-2024-2


Scraping Pages:  69%|██████▉   | 111/160 [00:30<00:36,  1.34it/s]

Scraping Page:  https://events.sjsu.edu/event/beading-circle-with-ohlone-cultural-bearers-898


Scraping Pages:  70%|███████   | 112/160 [00:31<00:39,  1.23it/s]

Scraping Page:  https://events.sjsu.edu/event/cabaret


Scraping Pages:  71%|███████   | 113/160 [00:32<00:33,  1.40it/s]

Scraping Page:  https://events.sjsu.edu/


Scraping Pages:  71%|███████▏  | 114/160 [00:38<01:41,  2.21s/it]

Scraping Page:  https://pages.sjsu.edu/choose-sjsu


Scraping Pages:  72%|███████▏  | 115/160 [00:38<01:19,  1.76s/it]

Scraping Page:  http://www.sjsu.edu/wsq


Scraping Pages:  72%|███████▎  | 116/160 [00:40<01:12,  1.65s/it]

Scraping Page:  https://www.juicer.io


Scraping Pages:  73%|███████▎  | 117/160 [00:40<00:57,  1.33s/it]

Scraping Page:  http://www.facebook.com/sanjosestate


Scraping Pages:  74%|███████▍  | 118/160 [00:40<00:42,  1.02s/it]

Scraping Page:  https://twitter.com/sjsu


Scraping Pages:  74%|███████▍  | 119/160 [00:41<00:36,  1.11it/s]

Scraping Page:  http://www.linkedin.com/company/san-jose-state-university


Scraping Pages:  75%|███████▌  | 120/160 [00:43<00:45,  1.14s/it]

Scraping Page:  http://www.instagram.com/sjsu


Scraping Pages:  76%|███████▌  | 121/160 [00:44<00:39,  1.01s/it]

Scraping Page:  http://www.youtube.com/user/sjsu


Scraping Pages:  76%|███████▋  | 122/160 [00:44<00:31,  1.20it/s]

Scraping Page:  https://goo.gl/maps/8PBZPkKH15WVnq396


Scraping Pages:  78%|███████▊  | 124/160 [00:46<00:30,  1.18it/s]

Scraping Page:  https://www.sjsu.edu/accessibility/index.php


Scraping Pages:  78%|███████▊  | 125/160 [00:46<00:22,  1.57it/s]

Scraping Page:  https://www.sjsu.edu/aiie/accreditation/index.php
Scraping Page:  https://www.sjsu.edu/titleixeo/index.php


Scraping Pages:  79%|███████▉  | 127/160 [00:46<00:13,  2.48it/s]

Scraping Page:  https://www.sjsu.edu/diversity/index.php
Scraping Page:  https://www.sjsu.edu/diversity/land-acknowledgement/


Scraping Pages:  81%|████████  | 129/160 [00:47<00:09,  3.33it/s]

Scraping Page:  https://www.sjsu.edu/privacy/


Scraping Pages:  81%|████████▏ | 130/160 [00:47<00:08,  3.67it/s]

Scraping Page:  https://www.sjsu.edu/cob/index.php


Scraping Pages:  82%|████████▏ | 131/160 [00:47<00:07,  4.02it/s]

Scraping Page:  https://www.sjsu.edu/education/index.php


Scraping Pages:  82%|████████▎ | 132/160 [00:47<00:06,  4.54it/s]

Scraping Page:  https://www.sjsu.edu/engineering/index.php
Scraping Page:  https://www.sjsu.edu/cgs/index.php


Scraping Pages:  83%|████████▎ | 133/160 [00:48<00:06,  4.09it/s]

Scraping Page:  https://www.sjsu.edu/chhs/index.php


Scraping Pages:  84%|████████▍ | 134/160 [00:48<00:08,  3.06it/s]

Scraping Page:  https://www.sjsu.edu/information-data-society/index.php


Scraping Pages:  85%|████████▌ | 136/160 [00:48<00:06,  3.74it/s]

Scraping Page:  https://www.sjsu.edu/humanitiesandarts/index.php
Scraping Page:  https://sjsu.edu/professional/


Scraping Pages:  86%|████████▋ | 138/160 [00:49<00:05,  3.72it/s]

Scraping Page:  https://www.sjsu.edu/science/index.php
Scraping Page:  https://www.sjsu.edu/socialsciences/index.php
Scraping Page:  http://www.sjsu.edu/siteindex


Scraping Pages:  88%|████████▊ | 140/160 [00:49<00:04,  4.42it/s]

Scraping Page:  http://www.sjsu.edu/clery/docs/SJSU-Annual-Security-Report.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Scraping Pages:  88%|████████▊ | 141/160 [00:51<00:10,  1.88it/s]

Scraping Page:  http://www.spartanbookstore.com/


Scraping Pages:  89%|████████▉ | 143/160 [00:51<00:06,  2.63it/s]

Scraping Page:  https://www.sjsu.edu/classes/calendar/index.php
Scraping Page:  https://www.sjsu.edu/careers/index.php


Scraping Pages:  90%|█████████ | 144/160 [00:52<00:05,  3.06it/s]

Scraping Page:  https://catalog.sjsu.edu/index.php


Scraping Pages:  91%|█████████ | 145/160 [00:52<00:06,  2.17it/s]

Scraping Page:  https://www.sjsu.edu/students/
Scraping Page:  https://www.sjsu.edu/sjsucares/get-assistance/


Scraping Pages:  92%|█████████▎| 148/160 [00:53<00:03,  3.72it/s]

Scraping Page:  https://www.sjsu.edu/up/index.php
Scraping Page:  https://www.sjsu.edu/president/priorities-and-initiatives/free-speech/index.php


Scraping Pages:  93%|█████████▎| 149/160 [00:53<00:02,  4.34it/s]

Scraping Page:  http://library.sjsu.edu/


Scraping Pages:  94%|█████████▍| 151/160 [00:54<00:02,  3.58it/s]

Scraping Page:  https://blogs.sjsu.edu/newsroom/


Scraping Pages:  95%|█████████▌| 152/160 [00:54<00:01,  4.36it/s]

Scraping Page:  https://www.sjsu.edu/sjsucares/resources/parenting-students.php
Scraping Page:  https://www.sjsu.edu/parking/index.php


Scraping Pages:  96%|█████████▋| 154/160 [00:54<00:01,  4.41it/s]

Scraping Page:  https://www.sjsu.edu/contact/index.php
Scraping Page:  http://directory.sjsu.edu/


Scraping Pages:  97%|█████████▋| 155/160 [00:54<00:01,  4.17it/s]

Scraping Page:  https://www.sjsu.edu/about/doing-business-with-sjsu.php
Scraping Page:  http://www.sjsu.edu/emergency


Scraping Pages:  98%|█████████▊| 157/160 [00:55<00:00,  5.00it/s]

Scraping Page:  https://www.sjsu.edu/up/resources/file-a-complaint.php


Scraping Pages:  99%|█████████▉| 159/160 [00:55<00:00,  4.93it/s]

Scraping Page:  https://www.sjsu.edu/titleixeo/make-a-report/index.php
Scraping Page:  https://a.cms.omniupdate.com/11/?skin=sjsu&account=sjsu&site=www&action=de&path=/index.php


Scraping Pages: 100%|██████████| 160/160 [00:56<00:00,  2.85it/s]


In [53]:
def clean_empty_lines(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    # Remove empty lines and strip whitespace
    cleaned_lines = [line.strip() for line in lines if line.strip()]

    with open(output_file, "w", encoding="utf-8") as outfile:
        for line in cleaned_lines:
            outfile.write(line + "\n")


clean_empty_lines("scraped_pages_content.txt", "filtered_data.txt")

In [57]:
import re

# Load the text data
with open("scraped_pages_content.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Step 1: Remove boilerplate content by identifying common navigational patterns
patterns = [
    r"Skip to main content",
    r"Search",
    r"Visit",
    r"Apply",
    r"Give",  # Common phrases
    r"SJSU on [A-Za-z]+",
    r"SJSU Online",  # Social media links
    r"Footer",
    r"Last Updated .*",  # Footer and Last updated information
]
for pattern in patterns:
    text = re.sub(pattern, "", text)

# Step 2: Remove excessive whitespace
text = re.sub(r"\n\s*\n", "\n", text)  # Reduce multiple newlines to a single newline

# Step 3: Remove repetitive sections
# Example: We could split sections by keywords and check for duplicates
unique_sections = list(set(text.split("\n\n")))  # Split and deduplicate sections
cleaned_text = "\n\n".join(unique_sections)

# Step 4: Save the cleaned text
with open("cleaned_scraped_contentL1.txt", "w", encoding="utf-8") as file:
    file.write(cleaned_text)

In [58]:
import re


def clean_noisy_data(text):
    # Define a regex pattern to match unwanted characters
    # This pattern removes non-printable characters and any other specified noise
    cleaned_text = re.sub(
        r"[^\x20-\x7E]+", "", text
    )  # Keep only printable ASCII characters
    return cleaned_text


def clean_file(input_file, output_file):
    with open(input_file, "r", encoding="utf-8", errors="ignore") as infile:
        lines = infile.readlines()

    cleaned_lines = [clean_noisy_data(line) for line in lines]

    with open(output_file, "w", encoding="utf-8") as outfile:
        for line in cleaned_lines:
            if line.strip():  # Only write non-empty lines
                outfile.write(line + "\n")


# Usage
clean_file("cleaned_scraped_contentL1.txt", "cleaned_scraped_contentL2.txt")