## This is for testing the WebScraping class

In [4]:
import requests, random, Universities
from urllib.parse import urlparse
from UserAgents import UserAgents
from User import User
from bs4 import BeautifulSoup

class WebScraping:
    """Search Google for a researcher and vet the pages that come back.

    Typical use: call ``initial_search`` to collect candidate links for a
    ``User`` and then ``scrape_webpage`` on each link to run the three-part
    relevancy check implemented by ``verify_link_relevancy``.
    """

    # Kept for backward compatibility with code that reads this attribute;
    # nothing in this class assigns it a value other than None.
    bs4 = None

    def __init__(self):
        # Hrefs on the results page that start with one of these are dropped
        # by initial_search instead of being treated as search hits.
        self.linkFilterPrefixes = ["/search", "q=", "/?", "/advanced_search"]

    @staticmethod
    def _name_parts(user: User) -> list:
        """Return the user's name as a list of parts.

        ``user.name`` may be a single string ("Zheng Xiang") or an already
        split sequence (["Zheng", "Xiang"]); both yield the same list, so
        joining never happens character by character.
        """
        name = user.name
        if isinstance(name, str):
            return name.split()
        return list(name)

    @staticmethod
    def _extract_target(href: str) -> str:
        """Unwrap a Google ``/url?q=<target>&sa=...`` redirect link.

        Returns the decoded ``q`` parameter only, without the trailing
        tracking parameters (``sa``, ``ved``, ``usg``). Any href that is not
        such a redirect is returned unchanged.
        """
        from urllib.parse import parse_qs

        parsed = urlparse(href)
        if parsed.path != "/url":
            return href
        targets = parse_qs(parsed.query).get("q")
        return targets[0] if targets else href

    def initial_search(self, user: User):
        """Run an exact-phrase Google search for the user's name.

        :param user: researcher to search for; the resulting links are also
            stored on ``user.initial_search_links``.
        :return: list of result links with Google's own links filtered out.
        """
        user_name = "+".join(self._name_parts(user))
        search_url = f"https://www.google.com/search?q=%22{user_name}%22"
        req = self.request(search_url)
        bs = BeautifulSoup(req.content, 'html.parser')

        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError when indexed.
        hrefs = [anchor['href'] for anchor in bs.find_all("a", href=True)]

        # Drop links that start with a filtered prefix or mention google.com,
        # then unwrap the redirect around each remaining link.
        prefixes = tuple(self.linkFilterPrefixes)
        links = [
            self._extract_target(href)
            for href in hrefs
            if not href.startswith(prefixes) and 'google.com' not in href
        ]
        user.initial_search_links = links
        return links

    def verify_link_relevancy(self, link: str, page_data: str, user: User):
        """Verify that a page is relevant to the researcher (2 of 3 checks).

        1. The institution name appears in the page text.
        2. The researcher's name appears in the page text.
        3. The link's domain appears in the institution's website URL.

        :param link: base domain of the page (e.g. ``"vt.edu"``).
        :param page_data: text of the page without HTML tags.
        :param user: researcher the page should pertain to.
        :return: ``(verified, checks, reason)`` where ``verified`` is True
            when at least two checks passed, ``checks`` is the number that
            passed and ``reason`` names each one that did.
        """
        page_data = page_data.lower()
        # Names are separated by spaces in page text, not by the "+" that the
        # search URL uses.
        user_name = " ".join(self._name_parts(user)).lower()
        institution = user.institution.lower()
        checks = 0
        reason = ""

        # Check 1 -- membership test; str.find() returns -1 (truthy) on a miss
        # and 0 (falsy) for a match at the very start, so it cannot be used
        # as a boolean. Empty needles are skipped because "" is in any string.
        if institution and institution in page_data:
            checks += 1
            reason += "Instituion found | "

        # Check 2
        if user_name and user_name in page_data:
            checks += 1
            reason += "Researcher name found | "

        # Check 3
        # NOTE(review): assumes findUniversityLink returns the institution's
        # URL as a string (or something falsy when unknown) -- confirm.
        university_link = Universities.findUniversityLink(user.institution)
        if link and university_link and link.lower() in university_link.lower():
            checks += 1
            reason += "University website verified"
        return (checks >= 2, checks, reason)

    def scrape_webpage(self, link: str, user: User):
        """Fetch a page, strip its HTML and run the three-part check on it.

        :param link: URL of the page to fetch.
        :param user: researcher the page should pertain to.
        :return: the ``(verified, checks, reason)`` tuple produced by
            ``verify_link_relevancy``; it is also printed.
        """
        # Request the page and convert to BS4
        req = self.request(link)
        bs = BeautifulSoup(req.content, 'html.parser')

        # Grab only the webtext (text without HTML tags)
        webtext = bs.get_text()

        # Reduce the host to its last two labels (htm.pamplin.vt.edu -> vt.edu)
        parsed_url = urlparse(link)
        domain_parts = parsed_url.netloc.split('.')
        domain = '.'.join(domain_parts[-2:])

        # Do a 3 part check on the domain, webtext, and the user to verify it pertains to the user
        verified, check, reason = self.verify_link_relevancy(domain, webtext, user)

        print(verified, check, reason)
        return verified, check, reason

    def request(self, link, timeout: float = 10) -> requests.Response:
        """Internal request method that supplies the generated headers.

        :param link: URL to fetch.
        :param timeout: seconds to wait for the server before giving up, so
            an unresponsive host cannot block forever.
        :return: `Response`
        """
        # headers must be passed by keyword: the second positional argument
        # of requests.get is `params` (the query string).
        return requests.get(link, headers=self.genHeaders(), timeout=timeout)

    def genHeaders(self) -> dict:
        """Generate new headers carrying a randomly chosen User-Agent."""
        return {
            'User-agent': self.getRandAgent()
        }

    def getRandAgent(self) -> str:
        """Return a random entry of ``UserAgents`` for the headers."""
        return random.choice(UserAgents)


<center><b>Test with researcher Zheng Xiang from VT</b></center>

In [5]:
# Build the scraper and the researcher used for this manual test.
ws = WebScraping()
zheng = User("Zheng Xiang", "Virginia Tech")

# initial_search returns the links and also stores them on the user as
# `initial_search_links`.
ws.initial_search(zheng)

# The search can come back with no links at all, so only index the first
# one when the list is non-empty.
if zheng.initial_search_links:
    ws.scrape_webpage(zheng.initial_search_links[0], zheng)
else:
    print("No search results to scrape")



True 3 Instituion found | Researcher name found | University website verified


In [137]:
# Hrefs beginning with any of these are dropped by the filter below.
prefixes = ["/search", "q=", "/?", "/advanced_search"]

# Start from every <a> element of `bs`, which is not defined in this cell and
# has to exist already in the notebook session.
links = bs.select("a")

# Keep an href only when, for every prefix, it does not start with that prefix
# and "google.com" does not occur anywhere past its first character.
links = [
    anchor['href']
    for anchor in links
    if all(
        not anchor['href'].startswith(prefix)
        and anchor['href'].find('google.com') <= 0
        for prefix in prefixes
    )
]

# Keep what follows the last "/url?q=" marker (the whole href when absent).
links = [href.rpartition("/url?q=")[2] for href in links]
links

['https://htm.pamplin.vt.edu/directory/xiang.html&sa=U&ved=2ahUKEwiXrYSP4Nf9AhUhFFkFHe90CkYQFnoECAAQAg&usg=AOvVaw0Lz9onYd8Yi73KZNoBU9rG',
 'https://www.researchgate.net/profile/Zheng-Xiang-6&sa=U&ved=2ahUKEwiXrYSP4Nf9AhUhFFkFHe90CkYQFnoECAkQAg&usg=AOvVaw3Pl-LDkyxmws9kkKHwW1w8',
 'https://www.journals.elsevier.com/tourism-management/editorial-board/zheng-xiang&sa=U&ved=2ahUKEwiXrYSP4Nf9AhUhFFkFHe90CkYQFnoECAYQAg&usg=AOvVaw3_99UIBvWoLFAoVm_SyS3R',
 'https://ischool.wisc.edu/blog/staff/zheng-xiang/&sa=U&ved=2ahUKEwiXrYSP4Nf9AhUhFFkFHe90CkYQFnoECAcQAg&usg=AOvVaw2v9r81jX-cWTy-flpZ0b3V',
 'https://msuspartans.com/staff-directory/zheng-xiang/856&sa=U&ved=2ahUKEwiXrYSP4Nf9AhUhFFkFHe90CkYQFnoECAMQAg&usg=AOvVaw3MVZM9M_4xBNIlqOGNlf6E',
 'https://goutsa.com/staff-directory/zheng-xiang/91&sa=U&ved=2ahUKEwiXrYSP4Nf9AhUhFFkFHe90CkYQFnoECAIQAg&usg=AOvVaw2uk9-AhvHQuqMIm-fE33wZ',
 'https://riceowls.com/sports/womens-volleyball/roster/coaches/zheng-xiang/473&sa=U&ved=2ahUKEwiXrYSP4Nf9AhUhFFkFHe90CkYQFnoE

In [26]:
from urllib.parse import urlparse

# Sample result link; the "&sa=...&ved=...&usg=..." tail follows the page path
# with no "?" in front of it.
url = 'https://htm.pamplin.vt.edu/directory/xiang.html&sa=U&ved=2ahUKEwj7sYqd_Nf9AhUkFVkFHXX6Bw8QFnoECAkQAg&usg=AOvVaw2sC1rEKB_bDYW5IiSR_gu4'

# Split the URL into its components and show them.
parsed_url = urlparse(url)
print(parsed_url)

# Break the host into its dot-separated labels ...
host = parsed_url.netloc
domain_parts = host.split('.')

# ... and glue the last two back together to get the base domain.
tail = domain_parts[-2:]
domain = '.'.join(tail)
domain

ParseResult(scheme='https', netloc='htm.pamplin.vt.edu', path='/directory/xiang.html&sa=U&ved=2ahUKEwj7sYqd_Nf9AhUkFVkFHXX6Bw8QFnoECAkQAg&usg=AOvVaw2sC1rEKB_bDYW5IiSR_gu4', params='', query='', fragment='')
