In [None]:
%load_ext lab_black

In [None]:
# | default_exp moz_combine_competitors

In [None]:
# | export

from pathlib import Path
import pandas as pd
import requests


pd.set_option("display.width", 1000)

DATA_DIR = "../data/"

# Load every Moz Pro "True Competitor" export found in the data directory and
# tag each row with the site name recovered from the export's file name.
dfs = []
our_sites = []
for csv_path in Path(DATA_DIR).glob(
    "Competitive Research_ True Competitor - Moz Pro*.csv"
):
    # The site is the last whitespace-separated token of the file name with
    # its first four characters and its four-character extension removed.
    site = csv_path.name.split()[-1][4:-4]
    our_sites.append(site)
    dfs.append(pd.read_csv(csv_path, encoding="utf-8").assign(Competitor=site))

df = pd.concat(dfs)

# Turn the "NN%" overlap strings into fractions (e.g. "5%" -> 0.05).
df["Overlap"] = df["Overlap"].str.strip().str.rstrip("%").astype(float).div(100)

# Keep only rows with more than 2% keyword overlap, largest overlap first.
df = df[df["Overlap"] > 0.02].sort_values("Overlap", ascending=False)

# Distinct competitor URLs that survived the overlap filter.
competitors = {*df["Top Competitor URLs"].unique()}


def get_final_redirect(apex_domain, timeout=10):
    """
    Return the URL that ``https://<apex_domain>`` finally redirects to.

    A HEAD request is issued with redirects followed, and the URL of the
    final response is returned.

    Args:
        apex_domain: Host name without a protocol, e.g. ``"example.com"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole run.

    Returns:
        The final URL as a string, or ``None`` if the request failed
        (connection error, timeout, invalid URL, too many redirects, ...).
    """
    url = f"https://{apex_domain}"
    try:
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        # Best-effort lookup: only network/HTTP-level failures are treated as
        # "could not resolve"; programming errors and Ctrl-C still propagate.
        return None
    return response.url


def chop_protocol(url):
    """
    Remove a leading ``http://`` or ``https://`` from a URL.

    When a protocol was removed, one trailing ``/`` is removed as well, so
    ``"https://example.com/"`` becomes ``"example.com"``. A URL that does not
    start with either protocol is returned unchanged, including any trailing
    slash.

    Args:
        url: The URL to clean up. ``None`` or an empty string is accepted and
            returned as-is, so the result of a failed lookup can be passed
            straight through.

    Returns:
        The URL without its protocol, or the input itself if it was empty,
        ``None``, or carried no recognised protocol.
    """
    if not url:
        return url

    for protocol in ("http://", "https://"):
        if url.startswith(protocol):
            url = url[len(protocol) :]
            # endswith() is safe on an empty remainder (e.g. input "https://"),
            # where indexing the last character would raise IndexError.
            if url.endswith("/"):
                url = url[:-1]
            break

    return url


# Resolve each of our own sites to the address it finally redirects to, with
# the protocol stripped off.
our_resolved_sites = set()
for site in our_sites:
    landing_page = get_final_redirect(site)
    if landing_page is None:
        # The lookup failed. Keep the site under its unresolved name instead
        # of crashing on None or silently dropping one of our seed sites.
        print(f"Could not resolve {site}; using it as-is")
        landing_page = site
    chopped_protocol = chop_protocol(landing_page)
    our_resolved_sites.add(chopped_protocol)

# Show original list lengths
print(f"Seed Sites: {len(our_resolved_sites)}")
print(f"Competitors: {len(competitors)}")

# Show the combined lists
all_sites = competitors | our_resolved_sites
print(f"Combined before fitlering: {len(all_sites)}")

# NOTE(review): `black_list` is not assigned anywhere in the visible source.
# It is looked up defensively so that a missing definition means "exclude
# nothing" rather than a NameError; any iterable of sites is accepted.
if "black_list" not in globals():
    print("No black_list defined; no sites will be filtered out")
excluded_sites = set(globals().get("black_list", ()))

# Show the combined lists after we removed blacklisted sites
pull_sites = all_sites - excluded_sites
print(f"Combined after fitlering: {len(pull_sites)}")

# Save the file in the data directory, one site per line. UTF-8 matches the
# encoding the CSV exports are read with.
save_as = DATA_DIR + "pull_keywords.txt"
with open(save_as, "w", encoding="utf-8") as fh:
    fh.writelines(f"{site}\n" for site in pull_sites)
print(f"{save_as} saved!")

In [None]:
pull_sites

In [None]:
# | hide
# Presumably writes the cells marked `# | export` out to the module named by
# the `# | default_exp` directive — confirm against the nbdev version in use.
import nbdev

nbdev.nbdev_export()