In [None]:
# BeautifulSoup get visible text from webpages (unreliable)

from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

# indicator of whether element is visible based on tag
# this is just a guess and is not consistent across all sites
def tag_visible(element):
    """Guess whether a parsed text node would be shown to a reader.

    Returns False when the node's parent tag is one of the non-rendered
    containers listed below, or when the node itself is an HTML comment;
    returns True otherwise.
    """
    non_rendered_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    parent_is_rendered = element.parent.name not in non_rendered_parents
    # Short-circuits: the comment check only runs when the parent is rendered.
    return parent_is_rendered and not isinstance(element, Comment)

# func to get text given html
def text_from_html(body):
    """Return the text of an HTML document that is likely visible to a reader.

    Args:
        body: HTML markup to parse.

    Returns:
        A single string: every text node accepted by ``tag_visible``,
        stripped of surrounding whitespace and joined with single spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it returns every text node in the document.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)

# Pages used to compare extraction quality across the approaches below.
urls = [
    'http://www.nytimes.com/2009/12/21/us/21storm.html',
    'https://www.youtube.com/',
]

# overall func to get text given list of urls
def getText1(urls):
    """Download each URL and return its visible text.

    Args:
        urls: Iterable of URL strings to fetch.

    Returns:
        A list with one string per URL, in input order, as produced by
        ``text_from_html``.
    """
    texts = []
    for url in urls:
        # Close the HTTP response as soon as the body has been read instead
        # of leaving the connection open until garbage collection.
        with urllib.request.urlopen(url) as response:
            html = response.read()
        texts.append(text_from_html(html))
    return texts

# Fetch every page once and reuse the result; calling getText1(urls) per
# print would download all of the URLs again each time.
page_texts = getText1(urls)
print(page_texts[0]) # works well for news site
print(page_texts[1]) # works terribly for youtube

In [None]:
# BeautifulSoup get visible text from webpages (written more efficiently) (unreliable)

# Same two pages as the previous approach, so the outputs can be compared.
urls = [
    'http://www.nytimes.com/2009/12/21/us/21storm.html',
    'https://www.youtube.com/',
]

def getText2(urls):
    """Download each URL and return its text after removing non-content tags.

    Args:
        urls: Iterable of URL strings to fetch.

    Returns:
        A list with one string per URL, in input order, holding the text
        left after the style/script/head/title elements are removed.
    """
    texts = []
    for url in urls:
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(url) as response:
            html = response.read()
        # Name the parser explicitly: without it bs4 emits a warning and
        # picks whichever parser happens to be installed, so results can
        # differ between machines.
        soup = BeautifulSoup(html, 'html.parser')
        # Plain loop rather than a throwaway list comprehension, since the
        # extraction is done purely for its side effect on the tree.
        for tag in soup(['style', 'script', '[document]', 'head', 'title']):
            tag.extract()
        texts.append(soup.get_text())
    return texts

# Print the list of extracted texts, one entry per URL.
print(getText2(urls=urls))
# extracted text isn't as clean as the output of getText1

In [None]:
# selenium get text
from selenium import webdriver
import os

from selenium.webdriver.common.by import By

# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable; the original machine-specific path is the fallback.
exec_path = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\JoshAlder\OneDrive - Principle One\Documents\VS Code\URLs Project New\Rando\chromedriver.exe",
)
driver = webdriver.Chrome(executable_path=exec_path)
try:
    driver.get("https://www.bbc.co.uk/news/business-58830955")
    # Navigating again replaces the BBC page, so the text read below comes
    # from the YouTube page.
    driver.get('https://www.youtube.com/')
    # find_element(By.TAG_NAME, ...) is the supported locator API; the
    # find_element_by_* helpers are deprecated and removed in Selenium 4.
    el = driver.find_element(By.TAG_NAME, 'body').text
    print(el)
finally:
    # Always shut down the browser and the chromedriver process, even when
    # a page load or lookup fails.
    driver.quit()
# This script can't get past the sign-in pop-up, whereas BeautifulSoup seems to be able to (NYTimes article)

In [6]:
# scrapy (faster than other libraries)
# selectors - still need to specify tags & locations in HTML?
import scrapy

class TestSpider(scrapy.Spider):
    """Spider that extracts the body text of each start URL.

    Yields one item per response with the keys ``url`` and ``text``.
    """
    name = 'test'
    start_urls = ["https://www.bbc.co.uk/news/business-58830955"]

    def parse(self, response):
        """Join the non-empty text nodes under <body> into a single string.

        Args:
            response: The response object Scrapy passes to the callback.

        Yields:
            A dict holding the response URL and the extracted text.
        """
        # Leave out script/style contents, which are not shown to a reader.
        text_nodes = response.xpath(
            '//body//text()[not(ancestor::script) and not(ancestor::style)]'
        )
        # getall() returns every matched node as a string; get() on the
        # whole selector list would only ever return the first match.
        fragments = (node.strip() for node in text_nodes.getall())
        yield {
            'url': response.url,
            'text': ' '.join(fragment for fragment in fragments if fragment),
        }
            



In [None]:
# inscriptis get text - works well for lots of sites but struggles with youtube for some reason
import urllib.request 
from inscriptis import get_text 
 
urls = ["https://www.amazon.co.uk/introducing-fire-tv-stick-lite-with-alexa-voice-remote-lite-no-tv-controls-2020-release/dp/B07ZZW7QCM/?_encoding=UTF8&pd_rd_w=momYP&pf_rd_p=7b33cd3c-1a87-4db8-9f31-ba1adc449805&pf_rd_r=V3TTH21TYKK6REAYQ6TR&pd_rd_r=bc8c40c9-546b-4c63-8b4a-29fd04e2f958&pd_rd_wg=06pzc&ref_=pd_gw_unk","https://www.bbc.co.uk/news/business-58830955"]
# Close the response once the body has been read, and decode with the
# charset the server declares (falling back to UTF-8) so that pages in
# other encodings do not raise UnicodeDecodeError.
with urllib.request.urlopen(urls[0]) as response:
    charset = response.headers.get_content_charset() or 'utf-8'
    html = response.read().decode(charset, errors='replace')
text = get_text(html)

print(text) 

In [None]:
# Cloudscraper (can apparently get around protection? particularly cloudflare)

In [None]:
# Google API website categorisation

In [None]:
# Get Meta - Selenium/BS






# Youtube Scraping Tests
import json
from bs4 import BeautifulSoup
import pandas as pd
import requests
from urllib.parse import urlparse

urls = ['https://www.youtube.com/watch?v=dQw4w9WgXcQ']
# A timeout stops the cell from hanging forever if the server never responds.
response = requests.get(urls[0], timeout=30) # send GET request to url
soup = BeautifulSoup(response.text, 'html.parser') # parse the text given by request
# soup.title is None when the page has no <title> element, so guard the lookup.
vidTitle = soup.title.text if soup.title is not None else None
print(soup.prettify())
# print(soup.find(class_='style-scope ytd-video-primary-info-renderer'))



In [None]:
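# Youtube Detailed Scraping (NOTE: the code below still targets a 'govuk-table' provider listing and has not yet been adapted to YouTube pages)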

from bs4 import BeautifulSoup
import pandas as pd
import requests
from urllib.parse import urlparse
import numpy as np

import os

import phonenumbers  # used below to normalise phone numbers

urls = ['https://www.youtube.com/watch?v=dQw4w9WgXcQ']

PROVIDER_COLUMNS = ['company_name', 'company_link', 'company_number', 'company_email']

# Running table of providers across all URLs. It has to exist before the
# first merge in the loop; dtype=object keeps the merge keys type-compatible
# with the per-URL frames built below.
provider_details = pd.DataFrame(columns=PROVIDER_COLUMNS, dtype=object)

for url in urls:
    # scrape page
    page = requests.get(url, timeout=30)  # timeout avoids hanging forever
    soup = BeautifulSoup(page.text, 'html.parser')
    # attrs has to be a dict mapping attribute name to value.
    table = soup.find('table', {'class': 'govuk-table'})
    providers_table = table.find('tbody') if table is not None else None
    if providers_table is None:
        # Pages without the expected table are skipped instead of raising
        # AttributeError on None.
        print(f'No govuk-table body found at {url}; skipping')
        continue

    # Collect plain dicts and build the frame once per URL; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer
    # exists in pandas 2.
    records = []
    for row in providers_table.find_all('tr'):
        # test provider saved in cell with id 'provider'
        provider = row.find(id='provider').find('a')
        name = provider.get_text().rstrip().lower()
        link = provider['href']

        # test provider number and email saved in only cell(s) with no id
        number_email = row.find_all('td', id=None)

        # remove web archive prefix from wayback machine
        # NOTE(review): assumes the prefix is always 43 characters long - confirm
        link = link[43:] if 'web.archive.org' in link else link

        # old format stores number and email in separate cells
        # new format stores number and email in same cell
        if len(number_email) == 1:
            number_email = number_email[0].find_all('a')
        # Rows with fewer entries than expected yield empty values rather
        # than an IndexError.
        number = str(number_email[0].get_text()) if len(number_email) > 0 else ''
        email = number_email[1].get_text() if len(number_email) > 1 else ''

        # apply standard format to numbers, emails and links
        if number and len(number) >= 10:
            number = ' '.join(number.rstrip().split())
            try:
                number = phonenumbers.format_number(
                    phonenumbers.parse(number, 'GB'),
                    phonenumbers.PhoneNumberFormat.INTERNATIONAL,
                )
            except phonenumbers.NumberParseException:
                # Text that is not a parseable phone number is treated as missing.
                number = np.nan
        else:
            number = np.nan

        if email:
            email = str(email).lower()

        if link:
            link = urlparse(link.lower()).netloc

        records.append({
            'company_name': name,
            'company_link': link,
            'company_number': number,
            'company_email': email
        })

    url_provider_details = pd.DataFrame(records, columns=PROVIDER_COLUMNS, dtype=object)

    provider_details = pd.merge(
        provider_details,
        url_provider_details,
        how="outer",
        on=PROVIDER_COLUMNS
    )

# Make sure the output directory exists before writing.
os.makedirs('datasets', exist_ok=True)
provider_details.to_csv('datasets/provider_details.csv')

In [None]:
# YouTube pop-up closing

In [None]:
# Traceback / indexing
# add screenshot metadata?

In [None]:
# Tor URL processing

In [None]:
# url matching
# maybe match from a general list, which can be the url list itself (duplicates) or white/red lists?

In [None]:
# URL prioritisation and filtering
# visit time and number of visits will be given in the history Excel document obtained by police from devices