# Market Research Hack

In [1]:
import fnmatch
import itertools
import logging
import random
import sys
import time
from urllib.parse import quote_plus, urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Configure the root logger once for the whole notebook: records at INFO level
# and above are written to stdout.
logging.basicConfig(
    level=logging.INFO,
    # "{"-style template; funcName shows which function emitted the record.
    format="{asctime} {levelname}: {funcName} - {message}",
    style="{",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)


## Load Original Datasets

In [2]:
# The company workbook has no header row, so its columns are addressed by position.
df_companies = pd.read_excel("./data/Companies list.xlsx", header=None)

# The keyword workbook holds one sheet per group (DS, DP, AD). Empty cells are
# turned into "" so that every cell can later be handled as a string.
keywords_workbook = "./data/Hackathon_Market research_keywords.xlsx"
df_DS_keywords, df_DP_keywords, df_AD_keywords = (
    pd.read_excel(keywords_workbook, sheet_name=sheet).replace({np.nan: ""})
    for sheet in ("DS", "DP", "AD")
)


## Prepare Utilities and Variables

In [3]:
def _resolve_bing_redirect(url: str, headers: dict) -> str:
    """Resolve a Bing click-tracking link to the URL it finally lands on.

    The tracking page is fetched, its first link with an ``href`` is followed,
    and the final URL of that second request is returned. Resolution is
    best-effort: on any failure the original tracking ``url`` is returned.

    Args:
        url: The ``https://www.bing.com/ck/...`` link taken from a result.
        headers: HTTP headers to send with both requests.

    Returns:
        The resolved URL, or ``url`` unchanged when it cannot be resolved.
    """
    try:
        intermediate = requests.get(url, headers=headers, timeout=10)
        if intermediate.status_code != 200:
            logging.warning(
                "HTTP Status while resolving a redirect, keeping redirect url: %s",
                intermediate.status_code,
            )
            return url

        redirect_soup = BeautifulSoup(intermediate.content, "html.parser")
        anchor = redirect_soup.find("a", href=True)
        if anchor is None:
            logging.warning(
                "No link found while resolving a redirect, keeping redirect url: %s",
                url,
            )
            return url

        # A Response always carries the final URL after redirects, so no
        # extra attribute check is needed before reading it.
        redirect = requests.get(anchor["href"], headers=headers, timeout=10)
    except requests.RequestException as ex:
        logging.warning(
            "Exception while resolving a redirect, keeping redirect url: %s", ex
        )
        return url

    logging.debug("Successfully resolved into %s", redirect.url)
    return redirect.url


def bing_search(query: str, user_agent: str, page: int = 0) -> list:
    """Run one Bing web search and return the result URLs of a single page.

    The function is best-effort: a network failure or a non-200 response is
    logged and produces an empty list instead of raising, and a malformed
    result entry is skipped without discarding the other results of the page.

    Args:
        query: Search expression; it is URL-encoded before being sent.
        user_agent: Value of the ``user-agent`` header sent with every request.
        page: Zero-based result page. It is sent as ``first=page * 10 + 1``
            (presumably ten results per page - confirm against Bing).

    Returns:
        The result URLs in page order. Bing click-tracking links are replaced
        by the URL they resolve to whenever resolution succeeds.
    """
    logging.info(query)
    BASE_URL = "https://www.bing.com/search?q="
    headers = {"user-agent": user_agent}

    try:
        response = requests.get(
            f"{BASE_URL}{quote_plus(query)}&first={(page * 10) + 1}",
            headers=headers,
            timeout=10,
        )
    except requests.RequestException as ex:
        # A timeout or connection error on one page must not abort the caller.
        logging.error("Exception: %s", ex)
        return []

    if response.status_code != 200:
        logging.error("HTTP Status: %s", response.status_code)
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    results = []
    for entry in soup.find_all("li", {"class": "b_algo"}):
        anchor = entry.find("a", href=True)
        if anchor is None:
            # Skip only this entry; the remaining results are still usable.
            logging.warning("Search result without a link, skipping it")
            continue

        url = anchor["href"]
        if url.startswith("https://www.bing.com/ck/"):
            logging.debug("Bing redirect, trying to resolve it")
            url = _resolve_bing_redirect(url, headers)
        results.append(url)

    return results


# Number of result pages requested for every query.
n_pages = 5
# Browser-like user agent sent with every search request.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0"
)

# Second column of the header-less company sheet (presumably the company names).
companies = df_companies[1].to_list()


def _split_terms(cell):
    """Split a comma-separated cell into stripped, non-empty terms."""
    return [term for term in map(str.strip, cell.split(",")) if term]


# Tag every keyword sheet with its group and flatten all rows into a list of
# dicts with the keys keyword / synonyms / hyponyms / group.
keywords = []
for frame, group_name in (
    (df_DS_keywords, "drug substance"),
    (df_DP_keywords, "drug package"),
    (df_AD_keywords, "analytical development"),
):
    frame["group"] = group_name
    for _, row in frame.iterrows():
        record = row.to_dict()
        keywords.append(
            {
                "keyword": record["keyword"],
                "synonyms": _split_terms(record["synonyms"]),
                "hyponyms": _split_terms(record["hyponyms"]),
                "group": record["group"],
            }
        )

def load_domain_blacklist(path="domain_blacklist.txt"):
    """Read the domain blacklist, one pattern per line.

    Surrounding whitespace is stripped and blank lines are dropped, so a
    trailing space or an empty line cannot produce a pattern that never (or
    only accidentally) matches.

    Args:
        path: Location of the blacklist file.

    Returns:
        The list of patterns; empty when the file is missing or has no entries.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            patterns = [line.strip() for line in f]
    except FileNotFoundError:
        logging.warning("%s is missing", path)
        return []

    patterns = [pattern for pattern in patterns if pattern]
    if not patterns:
        # Only reported when the file exists, so it is never confused with
        # the "missing" case above.
        logging.info("%s is empty", path)
    return patterns


# Blacklisted domains MUST NOT have http:// or https:// before the FQDN,
# e.g. wikipedia.com, www.wikipedia.com, www.ncbi.nlm.nih.gov
# Wildcards are allowed (entries are matched with fnmatch): "www.google.*"
# excludes every domain starting with "www.google.", such as "www.google.it",
# "www.google.com", "www.google.es", etc.
blacklist = load_domain_blacklist()


## Company homepage domain search

### Actual Code

In [4]:
# One bucket of unique result URLs per keyword group.
scraped_websites = {
    "drug_substance": set(),
    "drug_package": set(),
    "analytical_development": set(),
}

for entry in keywords:
    # "(keyword OR synonym OR ...)" followed by preference hints and language.
    alternatives = " OR ".join([entry["keyword"], *entry["synonyms"]])
    group_hints = " ".join(f"prefer:{word}" for word in entry["group"].split())
    query = f"({alternatives}) prefer:company prefer:pharma {group_hints} language:en"

    bucket_key = entry["group"].replace(" ", "_")
    # go through the first n_pages pages of results
    for page_index in range(n_pages):
        # sleep a random time between requests to avoid being detected
        time.sleep(random.randrange(5, 15))
        scraped_websites[bucket_key].update(
            bing_search(query=query, user_agent=user_agent, page=page_index)
        )

# Build one domain/url table per group, drop blacklisted domains, write the
# per-group CSV files and finally the combined domains.csv.
group_frames = []
for group, websites in scraped_websites.items():
    candidates = [{"domain": urlparse(url).netloc, "url": url} for url in websites]
    # exclude blacklisted domains (blacklist entries are fnmatch patterns)
    filtered_websites = [
        site
        for site in candidates
        if not any(fnmatch.fnmatch(site["domain"], pattern) for pattern in blacklist)
    ]

    # Explicit columns keep the frame well-formed even when nothing survived.
    df = pd.DataFrame(filtered_websites, columns=["domain", "url"])
    df.sort_values(by=["domain", "url"], inplace=True)
    group_frames.append(df)
    df.to_csv(f"domains_{group}.csv", index=False)

# DataFrame.append was removed in pandas 2.0; concatenate the frames once.
if group_frames:
    df_domains = pd.concat(group_frames, ignore_index=True)
else:
    df_domains = pd.DataFrame(columns=["domain", "url"])
df_domains.sort_values(by=["domain", "url"], inplace=True)
df_domains.reset_index(drop=True, inplace=True)
df_domains.to_csv("domains.csv", index=False)


2022-05-08 17:26:59 INFO: bing_search - (molecule stability OR product stability) prefer:company prefer:pharma prefer:drug prefer:substance language:en
2022-05-08 17:27:22 INFO: bing_search - (molecule stability OR product stability) prefer:company prefer:pharma prefer:drug prefer:substance language:en
2022-05-08 17:27:43 INFO: bing_search - (molecule stability OR product stability) prefer:company prefer:pharma prefer:drug prefer:substance language:en
2022-05-08 17:27:58 INFO: bing_search - (molecule stability OR product stability) prefer:company prefer:pharma prefer:drug prefer:substance language:en
2022-05-08 17:28:26 INFO: bing_search - (molecule stability OR product stability) prefer:company prefer:pharma prefer:drug prefer:substance language:en
2022-05-08 17:28:50 INFO: bing_search - (downstream process development) prefer:company prefer:pharma prefer:drug prefer:substance language:en
2022-05-08 17:29:18 INFO: bing_search - (downstream process development) prefer:company prefer:ph

## Keyword extraction 

In [None]:
# Domains collected by the search step.
domains = pd.read_csv("domains.csv")

# TODO - PERFORM ACTUAL KEYWORD EXTRACTION
# NOTE(review): the placeholder below has exactly two rows, so this presumably
# only works while domains.csv lists two domains - confirm.
placeholder_keywords = [["molecules", "gene"], ["psyshiary", "health"]]
placeholder_companies = ["lonza", "Boehringer Ingelheim"]
df = pd.DataFrame(
    {"keywords": placeholder_keywords, "company_name": placeholder_companies},
    index=domains["domain"],
).rename_axis("domain")

df.to_csv("extracted_keywords.csv")
df


Unnamed: 0_level_0,keywords,company_name
domain,Unnamed: 1_level_1,Unnamed: 2_level_1
https://www.lonza.com/,"[molecules, gene]",lonza
https://www.boehringer-ingelheim.com/,"[psyshiary, health]",Boehringer Ingelheim


## Area of works classification

In [None]:
# Reload the keywords table written to extracted_keywords.csv.
# NOTE(review): this rebinds `keywords`, which the scraping cell iterates as a
# list of dicts - re-run the setup cell before re-running the scraping cell.
keywords = pd.read_csv("extracted_keywords.csv")


def classifier_predict(keywords):
    """Return per-area scores for a keyword entry.

    Placeholder: the argument is ignored and the same fixed scores are
    returned for every input.
    """
    # TODO - return actual trained-classifier predictions
    placeholder_scores = dict(ad=0.2, ds=0.2, dp=0.6)
    return placeholder_scores


# Score the keyword cell of every row with the classifier.
predictions = keywords["keywords"].map(classifier_predict)

# Index by the `domain` column of the very frame the predictions were computed
# from: rows cannot be misaligned and the cell no longer depends on the
# `domains` variable left behind by an earlier cell.
df = pd.DataFrame({"predictions": predictions.to_list()}, index=keywords["domain"])
df.index.name = "domain"

df.to_csv("area_of_works_predictions.csv")

df


Unnamed: 0_level_0,predictions
domain,Unnamed: 1_level_1
https://www.lonza.com/,"{'ad': 0.2, 'ds': 0.2, 'dp': 0.6}"
https://www.boehringer-ingelheim.com/,"{'ad': 0.2, 'ds': 0.2, 'dp': 0.6}"


## Interactive user interface 

In [None]:
# Load the outputs of the keyword-extraction and classification steps.
keywords = pd.read_csv("extracted_keywords.csv")
area_of_works_predictions = pd.read_csv("area_of_works_predictions.csv")

# TODO: show a UI with buttons for flagging domains (which writes to
# domain_blacklist.txt) and for flagging keywords (which writes to
# keywords_blacklist.txt).

# Join on the shared `domain` column explicitly, so that a column added to
# both files later cannot silently change the join key.
pd.merge(keywords, area_of_works_predictions, on="domain")


Unnamed: 0,domain,keywords,company_name,predictions
0,https://www.lonza.com/,"['molecules', 'gene']",lonza,"{'ad': 0.2, 'ds': 0.2, 'dp': 0.6}"
1,https://www.boehringer-ingelheim.com/,"['psyshiary', 'health']",Boehringer Ingelheim,"{'ad': 0.2, 'ds': 0.2, 'dp': 0.6}"
