<a href="https://colab.research.google.com/github/searchsolved/search-solved-public-seo/blob/main/Keyword_Clustering_Tool/Keyword_Clustering_Tool_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Keyword Clustering Tool
Automatic keyword clustering tool. See your own and your competitors' most profitable keyword clusters in a couple of clicks.

# Instructions
Run all the cells and upload a CSV export from one of:

*   Ahrefs.com
*   SEMRush.com
*   Search Console (Performance report CSV export (Queries.csv))

October 2021

More like this: https://searchsolved.co.uk/blog/


In [None]:
!pip install pandas
!pip install polyfuzz



In [None]:
import pandas as pd
import sys
from google.colab import files
from polyfuzz import PolyFuzz

In [None]:
# ---- settings a user may want to tune ----

# True: name each cluster after its highest search-volume keyword
parent_by_vol = True

# True: remove rows marked "Sitelinks" (and, for Search Console data, repeated keywords)
drop_site_links = True

# True: remove rows marked "Image pack"
drop_image_links = True

# similarity threshold handed to the grouping step
sim_match_percent = 0.99

# text matched against the URL column, e.g. /blog/
url_filter = ""

# Upload the Keyword Export File from Ahrefs, SEMrush or the Search Console Performance Report (Queries.csv)

*   This file should be a report of all the keywords a site is ranking for.
*   .csv files from ahrefs or semrush should be exported in UTF-8 Format



In [None]:
# Upload the keyword export and load it into the working DataFrame `df_1`.
# files.upload() opens the Colab file picker; the keys of its result are the uploaded file names.
uploaded_files = files.upload()
if not uploaded_files:
    # Nothing was chosen (e.g. the dialog was cancelled): stop with a clear message
    # rather than an IndexError on the file-name lookup.
    raise SystemExit("No file uploaded - re-run this cell and choose a CSV export.")
upload = next(iter(uploaded_files))  # name of the first uploaded file

try:
    # pandas >= 1.3 spells "skip malformed rows" as on_bad_lines="skip"
    df_1 = pd.read_csv(upload, on_bad_lines="skip", encoding="utf8")
except TypeError:
    # older pandas does not know on_bad_lines and only accepts the legacy flag
    # (error_bad_lines was removed in pandas 2.0, so it cannot be used unconditionally)
    df_1 = pd.read_csv(upload, error_bad_lines=False, encoding="utf8")

In [None]:
# Detect a Search Console export and standardise the column names.

# Always assign the flag (True or False). Setting it only when the column is present
# would let a True left over from a previous upload in the same kernel leak into this run.
gsc_data = "Impressions" in df_1.columns

# Map the differing ahrefs v1/v2, semrush and gsc headers onto one shared set of names.
column_aliases = {
    "Current position": "Position",
    "Current URL": "URL",
    "Current URL inside": "Page URL inside",
    "Current traffic": "Traffic",
    "KD": "Difficulty",
    "Keyword Difficulty": "Difficulty",
    "Search Volume": "Volume",
    "page": "URL",
    "query": "Keyword",
    "Top queries": "Keyword",
    "Impressions": "Volume",
    "Clicks": "Traffic",
}
df_1 = df_1.rename(columns=column_aliases)

In [None]:
# Clean the data before grouping: optional URL filter, drop incomplete rows,
# normalise Volume to integers, and optionally drop sitelink / image-pack rows.

if url_filter:
    print("Processing only URLs containing:", url_filter)
    # Filter only when a filter was supplied: matching an empty pattern with na=False
    # keeps every row that has a URL but silently discards rows whose URL is missing.
    if "URL" in df_1.columns:
        df_1 = df_1[df_1["URL"].str.contains(url_filter, na=False)]

# keep only rows that have both a keyword and a volume (i.e. drop the NaN rows);
# .copy() makes df_1 an independent frame so the column assignment below is safe
df_1 = df_1[df_1["Keyword"].notna() & df_1["Volume"].notna()].copy()

# volumes may arrive as text; every "0-10" occurrence is replaced by "0" before
# the values are converted to whole numbers
volume_text = df_1["Volume"].astype(str).str.replace("0-10", "0", regex=False)
df_1["Volume"] = volume_text.astype(float).astype(int)

# the flag is only defined by the column-standardising cell; treat "not defined" as False
try:
    is_gsc = bool(gsc_data)
except NameError:
    is_gsc = False

links_col = "Page URL inside"  # not present in every export

if drop_site_links:
    if links_col in df_1.columns:
        df_1 = df_1[~df_1[links_col].str.contains("Sitelinks", na=False)]  # drop sitelinks
    if is_gsc:
        # keep one row per keyword: the one with the highest Traffic
        df_1 = df_1.sort_values(by="Traffic", ascending=False)
        df_1 = df_1.drop_duplicates(subset="Keyword", keep="first")

if drop_image_links:
    if links_col in df_1.columns:
        df_1 = df_1[~df_1[links_col].str.contains("Image pack", na=False)]  # drop image pack

In [None]:
# Do the grouping: match the keyword list against itself with PolyFuzz's TF-IDF
# model, then link matches whose similarity reaches sim_match_percent.

# Pass each distinct keyword once. A keyword that appears on several rows of df_1
# (e.g. one row per ranking URL) would otherwise be repeated in the list, and the
# later merge back on "Keyword" would multiply those rows and inflate cluster totals.
# NOTE(review): assumes PolyFuzz returns one match row per input item - confirm.
df_1_list = df_1["Keyword"].drop_duplicates().tolist()

model = PolyFuzz("TF-IDF")
try:
    model.match(df_1_list, df_1_list)
except ValueError:
    # the original author treats ValueError here as "nothing to match"
    print("Empty Dataframe, Can't Match - Check the URL Filter!")
    sys.exit()
model.group(link_min_similarity=sim_match_percent)
df_matched = model.get_matches()


In [None]:
# Tidy the match table and re-attach the per-keyword stats.

# give the matcher's output columns the names used by the rest of the notebook
matched_names = {"From": "Keyword", "Group": "Cluster Name"}
df_matched.rename(columns=matched_names, inplace=True)

# bring keyword volume / CPC / position / URL etc. back in from the original dataframe
df_matched = df_matched.merge(df_1, how="left", on="Keyword")

# a sort-suffixed traffic header (asc or desc) becomes plain 'Traffic'
traffic_names = dict.fromkeys(("Traffic (desc)", "Traffic (asc)"), "Traffic")
df_matched = df_matched.rename(columns=traffic_names)

# missing Traffic / CPC values become 0, then Traffic is rounded to 0 decimals
df_matched = df_matched.fillna({"Traffic": 0, "CPC": 0})
df_matched["Traffic"] = df_matched["Traffic"].round(0)

In [None]:
# Compute per-cluster statistics and merge them back onto every keyword row.

# Volume and Traffic are always totalled. Difficulty and CPC are summarised by their
# median, each one only when that column exists - a missing column no longer removes
# the other one, and no blanket `except` is needed to find out what is available.
# Aggregations are named by string ("sum"/"median"); passing the builtin `sum`
# callable to agg() is deprecated in recent pandas.
agg_spec = {"Volume": "sum", "Traffic": "sum"}
stats_input = df_matched[["Cluster Name", "Volume", "Traffic"]].copy()
for optional_col in ("Difficulty", "CPC"):
    if optional_col in df_matched.columns:
        # coerce on the working copy only: non-numeric entries become NaN for the
        # median without altering the values shown in df_matched itself
        stats_input[optional_col] = pd.to_numeric(df_matched[optional_col], errors="coerce")
        agg_spec[optional_col] = "median"

df_grouped = stats_input.groupby("Cluster Name").agg(agg_spec).reset_index()

df_grouped = df_grouped.rename(
    columns={
        "Volume": "Cluster Volume",
        "Difficulty": "Cluster KD (Median)",
        "CPC": "Cluster CPC (Median)",
        "Traffic": "Cluster Traffic",
    }
)

# merge in the group stats
df_matched = df_matched.merge(df_grouped, on="Cluster Name", how="left")

In [None]:
# Shape the final table: de-duplicate, select the output columns, then add the
# cluster size and a flag telling whether a keyword was actually clustered.

# drop rows where both keyword and URL are duplicated (skipped when there is no URL column)
if "URL" in df_matched.columns:
    df_matched = df_matched.drop_duplicates(subset=["URL", "Keyword"], keep="first")

# the flag is only defined by the column-standardising cell; treat "not defined" as False
try:
    is_gsc = bool(gsc_data)
except NameError:
    is_gsc = False

if is_gsc:
    # Search Console output uses a reduced column set
    cols = "Keyword", "Cluster Name", "Cluster Size", "Cluster Volume", "Cluster Traffic", "Volume", "Traffic"
else:
    cols = (
        "Keyword",
        "Cluster Name",
        "Cluster Size",
        "Cluster Volume",
        "Cluster KD (Median)",
        "Cluster CPC (Median)",
        "Cluster Traffic",
        "Volume",
        "Difficulty",
        "CPC",
        "Traffic",
        "URL",
    )

# reindex both orders and restricts the columns; names not present yet are created empty
df_matched = df_matched.reindex(columns=cols)

# count cluster size: number of rows sharing each cluster name
df_matched["Cluster Size"] = df_matched["Cluster Name"].map(df_matched["Cluster Name"].value_counts())

# A direct comparison gives a real boolean column (a missing size compares as False).
# Filling only the True rows and then fillna(False) would build an object column and
# depend on pandas' deprecated downcasting in fillna.
df_matched["Clustered?"] = df_matched["Cluster Size"] > 1

In [None]:
# Replace the auto-generated cluster name with the cluster's highest search-volume keyword.

if parent_by_vol:
    # Order the rows so that, inside every cluster, the highest-volume keyword comes first.
    # (Sorting on several columns is stable, so ties keep their existing relative order.)
    df_matched = df_matched.sort_values(
        ["Cluster Name", "Cluster Volume", "Volume"], ascending=[False, True, False]
    )

    # Name the whole cluster after that first keyword. Taking exactly one keyword per
    # cluster matters when several keywords tie for the top volume (typical for
    # volume 0): flagging every row that equals the maximum and forward-filling
    # would give each tied row its own name and split the cluster apart.
    df_matched["Cluster Name"] = df_matched.groupby("Cluster Name")["Keyword"].transform("first")

In [None]:
# -------------------------------------- final output ------------------------------------------------------------------

# order rows: cluster size (descending), cluster name (ascending), cluster volume (descending)
sort_keys = ["Cluster Size", "Cluster Name", "Cluster Volume"]
df_matched = df_matched.sort_values(sort_keys, ascending=[False, True, False])

# an undefined flag is treated the same as a False one
try:
    is_gsc = gsc_data
except NameError:
    is_gsc = False

if is_gsc:
    # Search Console data is reported with impressions / clicks labels
    gsc_labels = {
        "Cluster Volume": "Cluster Impressions",
        "Cluster Traffic": "Cluster Clicks",
        "Traffic": "Clicks",
        "Volume": "Impressions",
    }
    df_matched = df_matched.rename(columns=gsc_labels)

# write the result to disk and hand it to the browser as a download
output_file = "clustered_keywords.csv"
df_matched.to_csv(output_file, index=False)
files.download(output_file)
print("Finished Clustering!")