In [None]:
# Install required libraries

# !pip install -q goose3
# !pip install -q spacy
# !pip install -q spacy-transformers
# !python3 -m spacy download en_core_web_trf
# !pip install -q google

In [20]:
from goose3 import Goose
import pandas as pd
from googlesearch import search
import spacy
from spacy import displacy


In [13]:
def create_search_query(risk_terms, companies_list):
    """
    Build one search query for every (risk term, company) pair.

    Parameters
    ----------
    risk_terms : DataFrame
        Frame providing a "Risk Terms" column.
    companies_list : DataFrame
        Frame providing a "Companies" column.

    Returns
    -------
    search_queries : DataFrame
        Cross join of both inputs with an extra "Search Query" column
        holding "<company> <risk term>".
    """
    # Cartesian product: every risk term is paired with every company
    pairs = pd.merge(risk_terms, companies_list, how="cross")
    query_text = pairs["Companies"] + " " + pairs["Risk Terms"]
    return pairs.assign(**{"Search Query": query_text})


def get_url(query):
    """Fetch the top Google Search result URL for a query.

    Args:
        query (string): Search Query

    Returns:
       url (string or None): URL of the first search result, or None when
       the search yields no result.
    """
    results = search(query, tld="co.in", num=1, stop=1, pause=2)
    # Only one result is requested (stop=1). Taking it with a default avoids
    # the UnboundLocalError a for-loop assignment hits on an empty search.
    return next(iter(results), None)


def goose_text_extraction(url):
    """
    Extract text from given URL

    Parameters
    ----------
    url: string
      URL of the article from which the text needs to be extracted.
      A missing URL (None or empty string) yields an empty text.

    Returns
    -------
    cleaned_text: string
      Text that has been extracted from the URL ("" when no URL was given)
    """
    # Nothing to fetch: report "no text" instead of asking Goose to crawl nothing
    if not url:
        return ""

    # The context manager closes the extractor (and the resources it holds)
    # even when extraction raises.
    with Goose() as g:
        article = g.extract(url)
        return article.cleaned_text


In [None]:
# Seed inputs: the companies to screen and the risk terms to pair with them
companies = pd.DataFrame({"Companies": ["Twitter", "Meta", "Amazon"]})
risk_terms = pd.DataFrame({"Risk Terms": ["Layoffs", "Harrassment", "Fraud"]})

# One row per (risk term, company) pair, including the query string to search
risk_queries = create_search_query(companies_list=companies, risk_terms=risk_terms)


In [15]:
# Look up the top search-result URL for every query
risk_queries["url"] = risk_queries["Search Query"].map(get_url)

# Download each result page and keep its extracted article text
risk_queries["Text"] = risk_queries["url"].map(goose_text_extraction)


In [17]:
# Preview the queries with their URLs and extracted text (at most 20 rows)
risk_queries.head(n=20)

Unnamed: 0,Risk Terms,Companies,Search Query,url,Text
0,Layoffs,Twitter,Twitter Layoffs,https://fortune.com/2022/11/18/twitter-former-...,"When Twitter’s new owner, Elon Musk, decided t..."
1,Layoffs,Meta,Meta Layoffs,https://www.vox.com/recode/2022/11/17/23463164...,A wave of significant layoffs is crashing acro...
2,Layoffs,Amazon,Amazon Layoffs,https://www.channelnewsasia.com/business/amazo...,Jassy added the company was in the middle of a...
3,Harrassment,Twitter,Twitter Harrassment,https://help.twitter.com/en/rules-and-policies...,Twitter Rules: You may not engage in the targe...
4,Harrassment,Meta,Meta Harrassment,https://www.technologyreview.com/2021/12/16/10...,"Last week, Meta (the umbrella company formerly..."
5,Harrassment,Amazon,Amazon Harrassment,https://www.aboutamazon.com/news/workplace/fos...,"At Amazon, we work hard to foster a work envir..."
6,Fraud,Twitter,Twitter Fraud,https://help.twitter.com/en/rules-and-policies...,You may not use Twitter’s services in a manner...
7,Fraud,Meta,Meta Fraud,https://algorithmwatch.org/en/meta-sued-for-ab...,"In March, the Australian Competition and Consu..."
8,Fraud,Amazon,Amazon Fraud,https://www.amazon.com/gp/help/customer/displa...,


In [18]:
# Persist the queries, URLs and extracted text; the row index is not written
risk_queries.to_csv(path_or_buf="risk_queries.csv", index=False)


In [None]:
# Load the spaCy pipeline once; ner_get_entities reads this module-level object
nlp = spacy.load(name="en_core_web_trf")


def ner_get_entities(raw_text, ner_entities, org_entities, span_start, span_end):
    """
    Run NER on a text and return the view selected by the flags.

    The flags are checked in the order ner_entities, org_entities,
    span_start, span_end; the first truthy flag decides what is returned.

    Parameters
    ----------
    raw_text : string
        String on which NER is applied. Non-string values (e.g. None or
        NaN for a missing text) are treated as an empty text.
    ner_entities : bool
        If truthy, return the full parsed document produced by ``nlp``.
    org_entities : bool
        If truthy, return the texts of all entities labelled "ORG".
    span_start : bool
        If truthy, return the start character offsets of the "ORG" entities.
    span_end : bool
        If truthy, return the end character offsets of the "ORG" entities.

    Returns
    -------
    parsed document, list of str, list of int, or None
        The parsed document, the ORG entity texts, or the ORG start/end
        character offsets, depending on the selected flag. None when no
        flag is set.
    """
    # Missing texts would make the pipeline raise; parse them as empty instead
    if not isinstance(raw_text, str):
        raw_text = ""

    text_ner = nlp(raw_text)

    if ner_entities:
        return text_ner

    # Keep only organisation entities; every remaining view is derived from them
    org_spans = [ent for ent in text_ner.ents if ent.label_ == "ORG"]

    if org_entities:
        return [ent.text for ent in org_spans]

    if span_start:
        return [ent.start_char for ent in org_spans]

    if span_end:
        return [ent.end_char for ent in org_spans]

    # No view requested
    return None


In [23]:
# Run the expensive transformer pipeline once per text and keep the parsed Doc
risk_queries["NER"] = risk_queries["Text"].apply(
    ner_get_entities, args=(True, False, False, False)
)

# Derive the organisation columns from the stored Docs rather than re-parsing
# every text three more times; the resulting lists are the same.
org_spans = risk_queries["NER"].map(
    lambda doc: [ent for ent in doc.ents if ent.label_ == "ORG"]
)
risk_queries["Organization"] = org_spans.map(
    lambda spans: [ent.text for ent in spans]
)
risk_queries["Start"] = org_spans.map(
    lambda spans: [ent.start_char for ent in spans]
)
risk_queries["End"] = org_spans.map(
    lambda spans: [ent.end_char for ent in spans]
)


In [24]:
# Bounded preview (same limit as the earlier preview cell) so the output stays
# readable if more companies or risk terms are added
risk_queries.head(20)

Unnamed: 0,Risk Terms,Companies,Search Query,url,Text,NER,Organization,Start,End
0,Layoffs,Twitter,Twitter Layoffs,https://fortune.com/2022/11/18/twitter-former-...,"When Twitter’s new owner, Elon Musk, decided t...","(When, Twitter, ’s, new, owner, ,, Elon, Musk,...","[Twitter, Twitter, Musk, Fortune, American Exp...","[5, 870, 954, 1002, 1210, 1361, 1388, 2045, 20...","[12, 877, 958, 1009, 1226, 1366, 1395, 2052, 2..."
1,Layoffs,Meta,Meta Layoffs,https://www.vox.com/recode/2022/11/17/23463164...,A wave of significant layoffs is crashing acro...,"(A, wave, of, significant, layoffs, is, crashi...","[Meta, Facebook, Amazon, Lyft, Robinhood, Stri...","[66, 130, 140, 219, 225, 236, 244, 253, 641, 6...","[70, 138, 146, 223, 234, 242, 251, 261, 645, 6..."
2,Layoffs,Amazon,Amazon Layoffs,https://www.channelnewsasia.com/business/amazo...,Jassy added the company was in the middle of a...,"(Jassy, added, the, company, was, in, the, mid...","[Jassy, Amazon]","[0, 162]","[5, 168]"
3,Harrassment,Twitter,Twitter Harrassment,https://help.twitter.com/en/rules-and-policies...,Twitter Rules: You may not engage in the targe...,"(Twitter, Rules, :, You, may, not, engage, in,...","[Twitter, Twitter, Twitter, Twitter]","[0, 211, 3095, 3510]","[7, 218, 3102, 3517]"
4,Harrassment,Meta,Meta Harrassment,https://www.technologyreview.com/2021/12/16/10...,"Last week, Meta (the umbrella company formerly...","(Last, week, ,, Meta, (, the, umbrella, compan...","[Meta, Facebook, Meta, Meta, Facebook, Meta, H...","[11, 56, 424, 565, 656, 667, 1090, 1161, 1547,...","[15, 64, 428, 569, 664, 671, 1097, 1170, 1551,..."
5,Harrassment,Amazon,Amazon Harrassment,https://www.aboutamazon.com/news/workplace/fos...,"At Amazon, we work hard to foster a work envir...","(At, Amazon, ,, we, work, hard, to, foster, a,...","[Amazon, Amazon, Amazon, Glassdoor, Twitter, I...","[3, 837, 1014, 1084, 1095, 1108, 1187, 2956, 3...","[9, 843, 1020, 1093, 1102, 1117, 1193, 2962, 3..."
6,Fraud,Twitter,Twitter Fraud,https://help.twitter.com/en/rules-and-policies...,You may not use Twitter’s services in a manner...,"(You, may, not, use, Twitter, ’s, services, in...","[Twitter, Twitter, Twitter, Twitter, Twitter, ...","[16, 178, 196, 322, 554, 1802, 1899, 1976, 2036]","[23, 185, 203, 329, 561, 1809, 1906, 1983, 2043]"
7,Fraud,Meta,Meta Fraud,https://algorithmwatch.org/en/meta-sued-for-ab...,"In March, the Australian Competition and Consu...","(In, March, ,, the, Australian, Competition, a...",[the Australian Competition and Consumer Commi...,"[10, 62, 97, 118, 141, 330, 751, 1237, 1424, 1...","[60, 66, 105, 122, 145, 338, 759, 1241, 1428, ..."
8,Fraud,Amazon,Amazon Fraud,https://www.amazon.com/gp/help/customer/displa...,,(),[],[],[]


In [27]:
# Visualise the entities of the parsed article in row 5. The column is chosen
# by name so the cell still picks the parsed Doc if the column order changes.
displacy.render(risk_queries["NER"].iloc[5], style="ent", jupyter=True)