<a href="https://colab.research.google.com/github/searchsolved/search-solved-public-seo/blob/main/Keyword_Clustering_Tool/Keyword_Clustering_Tool_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

January 2022

More like this: https://searchsolved.co.uk/blog/

# Keyword Clustering Tool V3
Automatic keyword clustering tool. See your own and your competitors' most profitable keyword clusters in a couple of clicks.

# Quick Start Instructions
Run all the cells and upload a CSV export.
Runtime > Run All (Control + F9)

# Works with the Following Exports Out of the Box

*   Ahrefs.com (Keyword Export / Site Explorer Export)
*   SEMRush.com
*   Search Console (Performance Report CSV Export (Queries.csv))
*   AdWords Search Terms Report .csv or Excel format (Beta)
*   A simple single column .txt / csv file with keywords (Header or Headerless)

# File Formats
*   utf-8/utf-16/csv/xls/xlsx/xlsm/xlsb/odf/ods/odt

In [None]:
!pip install pandas
!pip install polyfuzz[fast]
!pip install chardet
!pip install tqdm



In [None]:
import pandas as pd
import sys
from google.colab import files
from polyfuzz import PolyFuzz
import chardet
from tqdm import tqdm
import os

In [None]:
# ---- cluster naming and matching ----
# name every cluster after its keyword with the highest search volume (recommended)
parent_by_vol = True
# passed to PolyFuzz as the minimum similarity used when grouping matches
sim_match_percent = 1

# ---- row filters ----
# only rows whose volume / impressions are strictly greater than this value are kept
min_volume = 0
# when set, only rows whose URL contains this text are processed
url_filter = ""
# drop rows whose "Page URL inside" column mentions "Sitelinks"
drop_site_links = False
# drop rows whose "Page URL inside" column mentions "Image pack"
drop_image_links = False

# Add Custom Intent Classifiers
You can add custom words below to classify keywords by search intent, e.g. 'vs' = Commercial Investigation, 'how' = Informational, etc.

Please see here: https://blog.travelpayouts.com/en/search-intent/ 

In [None]:
# Pipe-separated terms, one string per search intent. Each string is used as a pattern for
# str.contains() when the keywords are tagged, so every term is an alternative of that pattern.
# Each term is listed once ("list" and "price" used to be repeated, which had no effect on matching).
info_filter = "what|where|why|when|who|how|which|tip|guide|tutorial|ideas|example|learn|wiki|in mm|in cm|in ft|in feet"
comm_invest_filter = "best|vs|list|compare|review|top|difference between"
trans_filter = "purchase|bargain|cheap|deal|value|closeout|buy|shop|price|coupon|discount|pricing|delivery|shipping|order|returns|sale|amazon|target|ebay|walmart|cost of|how much"

# Upload Your Keyword File Here!

In [433]:
# ask for the keyword export; the returned mapping is keyed by the uploaded file names
upload = files.upload()
# work with the first uploaded file
input_file = list(upload)[0]
# (root, extension) pair - the extension itself is element [1]
file_extension = os.path.splitext(input_file)

Saving google_gb_socks_matching-terms_2022-01-08_11-42-36.csv to google_gb_socks_matching-terms_2022-01-08_11-42-36 (2).csv


In [458]:
# ---------------------------------- auto detect character encoding ----------------------------------------------------


def _read_delimited(path, encoding, whitespace_delimited):
    """Read a delimited text export into a DataFrame, skipping (and warning about) malformed lines.

    path: file to read.
    encoding: text encoding to decode the file with (None lets pandas use its default).
    whitespace_delimited: split fields on runs of whitespace instead of commas.

    Raises UnicodeDecodeError when the file cannot be decoded with ``encoding``.
    """
    options = {"encoding": encoding}
    if whitespace_delimited:
        options["sep"] = r"\s+"  # same meaning as the deprecated delim_whitespace=True
    try:
        return pd.read_csv(path, on_bad_lines="warn", **options)
    except TypeError:
        # older pandas does not know on_bad_lines; error_bad_lines=False is its equivalent
        return pd.read_csv(path, error_bad_lines=False, **options)


# sniff the encoding from the first 10000 bytes of the upload
with open(input_file, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# if the encoding is utf-16 use a whitespace separator, else ','
white_space = result['encoding'] == "UTF-16"

# compare extensions case-insensitively so e.g. ".XLSX" is still recognised as a spreadsheet
file_suffix = file_extension[1].lower()

if file_suffix in (".xlsx", ".xlsm"):
    df_1 = pd.read_excel(input_file, engine="openpyxl")
elif file_suffix in (".xls", ".xlsb", ".odf", ".ods", ".odt"):
    # openpyxl cannot read these formats; let pandas choose the reader from the file extension
    df_1 = pd.read_excel(input_file)
else:
    try:
        df_1 = _read_delimited(input_file, result["encoding"], white_space)
    # fall back to utf-8
    except UnicodeDecodeError:
        df_1 = _read_delimited(input_file, "utf-8", white_space)


In [459]:
# -------------------------- check if single column import / and write header if missing -------------------------------

# check the number of columns
col_len = len(df_1.columns)
col_name = df_1.columns[0]

# labels accepted as a genuine header of a single-column keyword list (compared case-insensitively)
known_single_headers = {
    "keyword", "keywords", "kw", "query", "queries", "top queries", "search query", "search queries",
    "search term", "search terms", "search terms report", "term", "terms", "phrase", "keyphrase", "keyphrases",
}

if col_len == 1:
    if str(col_name).strip().lower() not in known_single_headers:
        # Headerless file: the reader used the first keyword as the column label.
        # Put it back as the first data row so that keyword is not lost.
        df_1 = pd.concat([pd.DataFrame({col_name: [col_name]}), df_1], ignore_index=True)
    df_1.columns = ["Keyword"]


In [460]:
# -------------------------- detect if import file is adwords and remove the first two rows ----------------------------
# an AdWords search terms report is recognised by the label of its first column
adwords_check = col_name == "Search terms report"
if adwords_check:
    # discard the first row, so the row holding the real column names becomes row 0
    df_1 = df_1.iloc[1:].reset_index(drop=True)

    new_header = df_1.iloc[0]  # the row that carries the column names
    df_1 = df_1.iloc[1:]  # everything after it is data
    df_1.columns = new_header  # promote that row to be the header

In [461]:
# --------------------------------- Check if csv data is gsc and set bool ----------------------------------------------

# Always assign the flag, True or False. Leaving it undefined when the column is absent lets a value
# from an earlier run of this cell survive in the kernel and mislabel a different upload as GSC data.
gsc_data = 'Impressions' in df_1.columns


In [462]:
# ----------------- standardise the column names between ahrefs v1/v2/semrush/gsc keyword exports ----------------------

# standard column name -> the alternative spellings that are renamed to it
standard_names = {
    "Position": ["Current position"],
    "URL": ["Current URL", "page"],
    "Page URL inside": ["Current URL inside"],
    "Traffic": ["Current traffic", "Clicks"],
    "Difficulty": ["KD", "Keyword Difficulty"],
    "Volume": ["Search Volume", "Impressions", "Impr.", "Search vol."],
    "Keyword": ["query", "Top queries", "Search term"],
}

# flatten into the {old name: new name} mapping that rename() expects
column_map = {alias: standard for standard, aliases in standard_names.items() for alias in aliases}
df_1 = df_1.rename(columns=column_map)

In [463]:
# ------------------------------ clean the imported data before grouping -----------------------------------------------

row_len = len(df_1)  # number of imported rows
if col_len > 1:
    # --------------------------------- clean the data pre-grouping ----------------------------------------------------

    # Only filter when a filter was supplied: matching an empty filter with na=False
    # would silently drop every row that has no URL.
    if url_filter:
        print("Processing only URLs containing:", url_filter)
        try:
            df_1 = df_1[df_1["URL"].str.contains(url_filter, na=False)]
        except (KeyError, AttributeError):
            # no URL column, or the column holds no text to match against
            pass

    # ========================= clean strings out of numerical columns (adwords) ========================================

    def _strip_commas(series):
        """Remove thousands separators from a text column (raises AttributeError for non-text columns)."""
        return series.str.replace(",", "", regex=False)

    # column -> conversion. Each column is converted on its own, so a column that is missing or
    # already numeric no longer prevents the remaining columns from being cleaned.
    numeric_cleaners = {
        "Volume": lambda s: _strip_commas(s).astype(int),
        "Traffic": lambda s: _strip_commas(s).astype(int),
        "Conv. value / click": lambda s: _strip_commas(s).astype(float),
        "All conv. value": lambda s: _strip_commas(s).astype(float).round(2),
        # " --" becomes 0; the percent sign is removed as a literal character
        "CTR": lambda s: s.replace(" --", "0", regex=True).str.replace("%", "", regex=False).astype(float),
        "Cost": lambda s: s.astype(float).round(2),
        "Conversions": lambda s: s.astype(int),
    }
    for column, clean in numeric_cleaners.items():
        try:
            df_1[column] = clean(df_1[column])
        except (KeyError, AttributeError, TypeError, ValueError):
            # column absent, not text, or not convertible: leave it exactly as imported
            continue

    df_1 = df_1[~df_1["Keyword"].str.contains("Total: ", na=False)]  # remove totals rows
    df_1 = df_1[df_1["Keyword"].notna()]  # drop rows without a keyword
    df_1 = df_1[df_1["Volume"].notna()].copy()  # drop rows without a volume
    # volume ranges reported as "0-10" are counted as 0 before the column is made an integer
    df_1["Volume"] = df_1["Volume"].astype(str).str.replace("0-10", "0", regex=False)
    df_1["Volume"] = df_1["Volume"].astype(float).astype(int)

    # drop sitelinks

    if drop_site_links:
        try:
            df_1 = df_1[~df_1["Page URL inside"].str.contains("Sitelinks", na=False)]  # drop sitelinks
        except KeyError:
            pass
        # GSC data: keep one row per keyword, the one with the most traffic
        # (this de-duplication only runs when drop_site_links is enabled)
        try:
            if gsc_data:
                df_1 = df_1.sort_values(by="Traffic", ascending=False)
                df_1 = df_1.drop_duplicates(subset="Keyword", keep="first")
        except NameError:
            # gsc_data was never set for this upload
            pass

    if drop_image_links:
        try:
            df_1 = df_1[~df_1["Page URL inside"].str.contains("Image pack", na=False)]  # drop image pack
        except KeyError:
            pass

    # keep only rows strictly above the configured minimum volume
    df_1 = df_1[df_1["Volume"] > min_volume]

# replace every special character in the Keyword column with a space
spec_chars = ["!",'"',"#","%","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
for char in spec_chars:
    # regex=False: most of these characters are regex metacharacters and must be matched literally,
    # whatever the default of str.replace is in the installed pandas version
    df_1['Keyword'] = df_1['Keyword'].str.replace(char, ' ', regex=False)

In [464]:
# ------------------------------------- do the grouping ----------------------------------------------------------------

df_1_list = df_1.Keyword.tolist()  # create list from df
model = PolyFuzz("TF-IDF")

# Unique keywords, sorted. Iterating a set of strings gives a different order in every kernel
# session, so the candidate list handed to the matcher would otherwise change from run to run.
cluster_tags = sorted(set(df_1_list))

print("Cleaning up the cluster tags.. Please be patient!")
# keywords that are contained inside another, longer keyword
substrings = {w1 for w1 in tqdm(cluster_tags) for w2 in cluster_tags if w1 in w2 and w1 != w2}
# keywords that are not part of any other keyword
longest_word = [tag for tag in cluster_tags if tag not in substrings]
# the contained keywords are the candidates every keyword is matched against
shortest_word_list = [tag for tag in cluster_tags if tag in substrings]

try:
    model.match(df_1_list, shortest_word_list)
except ValueError:
    print("Empty Dataframe, Can't Match - Check the URL Filter!")
    sys.exit()

model.group(link_min_similarity=sim_match_percent)
df_matched = model.get_matches()

Cleaning up the cluster tags.. Please be patient!


100%|██████████| 4981/4981 [00:01<00:00, 2683.57it/s]


In [465]:
# ------------------------------- clean the data post-grouping ---------------------------------------------------------

df_matched.rename(columns={"From": "Keyword", "Group": "Cluster Name"}, inplace=True)  # renaming multiple columns

# merge keyword volume / CPC / Pos / URL etc data from original dataframe back in
df_matched = pd.merge(df_matched, df_1, on="Keyword", how="left")

# rename traffic (asc) / (desc) / potential to 'Traffic' for standardisation
df_matched.rename(columns={"Traffic (desc)": "Traffic", "Traffic (asc)": "Traffic", "Traffic potential": "Traffic"}, inplace=True)

if col_len > 1:

    # fill in missing values
    df_matched.fillna({"Traffic": 0, "CPC": 0}, inplace=True)
    df_matched['Traffic'] = df_matched['Traffic'].round(0)
    # ------------------------- group the data and merge in original stats -------------------------------------------------
    # Aggregations are named by string ("sum", "median"): passing the builtin sum to agg() is
    # deprecated in recent pandas and the string form gives the same result.
    if not adwords_check:
        try:
            # make dedicated grouped dataframe
            df_grouped = (df_matched.groupby("Cluster Name").agg(
                {"Volume": "sum", "Difficulty": "median", "CPC": "median", "Traffic": "sum"}).reset_index())
        except Exception:
            # best-effort fallback when the full set of statistics cannot be computed
            # (e.g. the export has no Difficulty / CPC column): aggregate volume and traffic only
            df_grouped = (df_matched.groupby("Cluster Name").agg(
                {"Volume": "sum", "Traffic": "sum"}).reset_index())

        df_grouped = df_grouped.rename(
            columns={"Volume": "Cluster Volume", "Difficulty": "Cluster KD (Median)", "CPC": "Cluster CPC (Median)",
                     "Traffic": "Cluster Traffic"})

        df_matched = pd.merge(df_matched, df_grouped, on="Cluster Name", how="left")  # merge in the group stats

    else:

        df_grouped = (df_matched.groupby("Cluster Name").agg(
            {"Volume": "sum", "CTR": "median", "Cost": "sum", "Traffic": "sum", "All conv. value": "sum",
             "Conversions": "sum"}).reset_index())

        df_grouped = df_grouped.rename(
            columns={"Volume": "Cluster Volume", "CTR": "Cluster CTR (Median)", "Cost": "Cluster Cost (Sum)",
                     "Traffic": "Cluster Traffic", "All conv. value": "All conv. value (Sum)", "Conversions": "Cluster Conversions (Sum)"})

        df_matched = pd.merge(df_matched, df_grouped, on="Cluster Name", how="left")  # merge in the group stats

        # the matcher's own bookkeeping columns are not part of the adwords output
        del df_matched['To']
        del df_matched['Similarity']

    # ---------------------------- clean and sort the final output -----------------------------------------------------

    try:
        df_matched.drop_duplicates(subset=["URL", "Keyword"], keep="first", inplace=True)  # drop if both kw & url are duped
    except KeyError:
        # no URL column in this export
        pass

In [466]:
# Arrange the output columns for every export except adwords.
# reindex() keeps the listed columns in this order and creates any that do not exist yet.
if not adwords_check:
    cols = (
        "Cluster Name", "Keyword", "Cluster Size",
        "Cluster Volume", "Cluster KD (Median)", "Cluster CPC (Median)", "Cluster Traffic",
        "Volume", "Difficulty", "CPC", "Traffic",
        "URL",
    )

    # search console data uses a shorter layout
    try:
        if gsc_data:
            cols = ("Cluster Name", "Keyword", "Cluster Size", "Cluster Volume", "Cluster Traffic", "Volume",
                    "Traffic")
    except NameError:
        # gsc_data is only defined when the upload was recognised as search console data
        pass

    df_matched = df_matched.reindex(columns=cols)

In [467]:
# ------------ get the keyword with the highest search volume to replace the auto generated tag name with --------------

if col_len > 1:
    if parent_by_vol:
        # this sort is mandatory: it floats the highest-volume keyword to the top of every cluster
        df_matched.sort_values(["Cluster Name", "Cluster Volume", "Volume"], ascending=[False, True, False], inplace=True)
        # Name each cluster after its top row. Taking the first row per cluster (instead of every row
        # that equals the cluster maximum) keeps a cluster in one piece when keywords tie on volume.
        top_keyword = df_matched.groupby("Cluster Name")["Keyword"].transform("first")
        # rows whose cluster name is missing, if any, inherit the name of the row above them
        df_matched['Cluster Name'] = top_keyword.ffill()
if adwords_check:
    df_matched = df_matched.rename(columns={"Volume": "Impressions", "Traffic": "Clicks", "Cluster Traffic": "Cluster Clicks (Sum)"})


In [468]:
# -------------------------------------- final output ------------------------------------------------------------------
# Sort on cluster size, then cluster name, then cluster volume (column -> ascending?).
# Only columns that exist are used: the adwords path skips the reindex that creates
# "Cluster Size", so sorting on it unconditionally raises a KeyError there.
sort_order = {"Cluster Size": False, "Cluster Name": True, "Cluster Volume": False}
sort_keys = [column for column in sort_order if column in df_matched.columns]
if sort_keys:
    df_matched.sort_values(sort_keys, ascending=[sort_order[column] for column in sort_keys], inplace=True)

# search console data: label the metrics as impressions / clicks
try:
    if gsc_data:
        df_matched.rename(
            columns={"Cluster Volume": "Cluster Impressions", "Cluster Traffic": "Cluster Clicks", "Traffic": "Clicks",
                     "Volume": "Impressions"}, inplace=True)
except NameError:
    # gsc_data was never set for this upload
    pass

# single-column imports carry no metrics, so only the clustering columns are kept
if col_len == 1:
    cols = "Cluster Name", "Keyword", "Cluster Size"
    df_matched = df_matched.reindex(columns=cols)

In [469]:
# preview the clustered keywords (pandas abbreviates long frames when printed)
print(df_matched)

          Cluster Name                   Keyword  ...  Traffic  URL
345   100 cotton socks          100 cotton socks  ...   2200.0  NaN
698   100 cotton socks         100  cotton socks  ...   2100.0  NaN
1726  100 cotton socks     100 cotton socks asda  ...   3800.0  NaN
2112  100 cotton socks       100 cotton socks uk  ...   2100.0  NaN
2409  100 cotton socks  100 percent cotton socks  ...   2100.0  NaN
...                ...                       ...  ...      ...  ...
2042        zara socks               farah socks  ...     10.0  NaN
2221        zara socks             tractor socks  ...     30.0  NaN
2590        zara socks      mens character socks  ...    300.0  NaN
3668        zara socks           character socks  ...    300.0  NaN
3821        zara socks          sim racing socks  ...     10.0  NaN

[5038 rows x 12 columns]


In [470]:
# - add in intent markers
# Tag on the keyword column by name: its position differs between exports
# (on the adwords path the second column is the cluster name, not the keyword).
colname = "Keyword"

# intent label (also the name of the output column) -> pattern that marks it
intent_patterns = {
    "Informational": info_filter,
    "Commercial Investigation": comm_invest_filter,
    "Transactional": trans_filter,
}
for intent, pattern in intent_patterns.items():
    # na=False keeps the mask boolean when a keyword is missing; case=False also matches capitalised keywords
    has_intent = df_matched[colname].str.contains(pattern, case=False, na=False)
    df_matched.loc[has_intent, intent] = intent

In [471]:
# ----------------- check every cluster name against its keywords and count the cluster sizes -------------------------
# A keyword stays in its cluster when the first word of the cluster name is one of the keyword's words.
# If only the second word of the cluster name is, the cluster is renamed to that second word.
# If neither is, the keyword is moved to "zzz_no_cluster_available".


def ismatch(s, word_col="Second Word"):
    """Return True when the word in ``s[word_col]`` is one of the whitespace-separated words of ``s['Keyword']``.

    s: a row of df_matched.
    word_col: name of the column that holds the word to look for.

    A missing or empty word, or a missing keyword, never matches.
    """
    word = s[word_col]
    keyword = s["Keyword"]
    if not isinstance(word, str) or not isinstance(keyword, str) or not word.strip():
        return False
    return set(word.split()) <= set(keyword.split())


df_matched['Cluster Name'] = df_matched['Cluster Name'].str.strip()
df_matched['Keyword'] = df_matched['Keyword'].str.strip()

# split on any run of whitespace so repeated spaces do not produce empty "words"
name_words = df_matched['Cluster Name'].str.split()
df_matched['First Word'] = name_words.str[0]
df_matched['Second Word'] = name_words.str[1]  # missing for one-word cluster names

df_matched['Found'] = df_matched.apply(ismatch, axis=1, word_col="First Word").astype(bool)

# blank out every remaining missing value in the frame
df_matched = df_matched.fillna('')

df_matched['Found 2'] = df_matched.apply(ismatch, axis=1, word_col="Second Word").astype(bool)

first_word_missing = ~df_matched["Found"]
df_matched.loc[first_word_missing & df_matched["Found 2"], "Cluster Name"] = df_matched["Second Word"]
df_matched.loc[first_word_missing & ~df_matched["Found 2"], "Cluster Name"] = "zzz_no_cluster_available"

# count cluster_size
df_matched['Cluster Size'] = df_matched['Cluster Name'].map(df_matched['Cluster Name'].value_counts())
# a cluster with a single keyword is not a cluster
df_matched.loc[df_matched["Cluster Size"] == 1, "Cluster Name"] = "zzz_no_cluster_available"


df_matched = df_matched.sort_values(by="Cluster Name", ascending=True)

# delete the helper cols
df_matched = df_matched.drop(columns=['First Word', 'Second Word', 'Found', 'Found 2'])

In [472]:
# write the clustered keywords to a csv file and send that file to the browser
output_file = "your_keywords_clustered.csv"
df_matched.to_csv(output_file, index=False)
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>