In [None]:
%load_ext lab_black

In [None]:
import re
import nltk
import string
import requests
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Seed term whose autocomplete suggestions are harvested.
# Try values like "what" or "how".
original_query = "how"

# Folder that receives the CSV files written by later cells.
namespace = "suggest"
Path(namespace).mkdir(exist_ok=True, parents=True)

# Google autocomplete endpoint; the query text is appended directly to it.
base_url = "http://suggestqueries.google.com/complete/search?client=firefox&&q="
url = f"{base_url}{original_query}"

# A timeout stops the notebook from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a confusing JSON decoding failure.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Each row is (seed, 1-based position, suggestion). Element [1] of the decoded
# payload is iterated as the list of suggestion strings.
table = []
for i, suggestion in enumerate(response.json()[1]):
    row = (original_query, i + 1, suggestion)
    table.append(row)

In [None]:
# Words appended to the seed query, one per line, to probe for further
# autocomplete suggestions. The block is turned into a list of strings.
probe_words = """about
above
after
again
against
all
am
an
and
any
are
aren't
as
at
be
because
been
before
being
below
between
both
but
by
can
couldn't
did
didn't
do
does
doesn't
doing
don't
down
during
each
few
for
from
further
had
hadn't
has
hasn't
have
haven't
having
he
her
here
hers
herself
him
himself
his
how
if
in
into
is
isn't
it
it's
its
itself
just
me
mightn't
more
most
mustn't
my
myself
needn't
no
nor
not
now
of
off
on
once
only
or
other
our
ours
ourselves
out
over
own
re
same
shan't
she
she's
should
should've
shouldn't
so
some
such
than
that
that'll
the
their
theirs
them
themselves
then
there
these
they
this
those
through
to
too
under
until
up
very
was
wasn't
we
were
weren't
what
when
where
which
while
who
whom
why
will
with
won't
wouldn't
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves""".splitlines()

In [None]:
# Probe the seed followed by a space and then each character of
# lowercase_letters. Its first character is itself a space, so the first
# probe is the seed followed only by whitespace; the rest add "a" .. "z".
lowercase_letters = f" {string.ascii_lowercase}"
len_letters = len(lowercase_letters)
for letter in lowercase_letters:
    query = f"{original_query} {letter}"
    url = f"{base_url}{query}"
    # Bound the wait per request and fail loudly on HTTP errors.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # enumerate() gives every suggestion its own rank. Without it, `i` would
    # be whatever value an earlier loop left behind, so all rows would be
    # recorded with the same, unrelated position.
    for i, suggestion in enumerate(response.json()[1]):
        row = (query, i + 1, suggestion)
        table.append(row)

In [None]:
# Probe the seed followed by each probe word. The query deliberately ends
# with a trailing space, exactly as built by the f-string below.
for word in probe_words:
    query = f"{original_query} {word} "
    url = f"{base_url}{query}"
    # This loop issues one request per probe word; a timeout keeps a single
    # stalled connection from blocking the whole run, and raise_for_status()
    # reports HTTP errors directly rather than as a JSON decoding failure.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Rows are (seed query, 1-based position, suggestion).
    for i, suggestion in enumerate(response.json()[1]):
        row = (query, i + 1, suggestion)
        table.append(row)

In [None]:
# Write every collected (seed, position, suggestion) row to a CSV file inside
# the namespace folder; spaces in the seed become underscores in the file name.
filename = "suggestions-" + original_query.replace(" ", "_") + ".csv"
df = pd.DataFrame(table, columns=["seed", "position", "suggestion"])
df.to_csv(Path(namespace) / filename, index=False)
print("Done")

In [None]:
# Running score per suggestion; filled in by the scoring loops below.
c = Counter()

# NLTK's English stop-word list, stored as a set for membership tests.
stop_words = set(stopwords.words("english"))

# Position -> bonus points: position 1 earns 10, dropping by one per position
# down to 1 point at position 10; position 11 also earns 1.
inverter = {rank: max(11 - rank, 1) for rank in range(1, 12)}


def sort_counter_descending(counter):
    """Return the counter's (key, count) pairs as a list, highest count first."""

    def count_of(pair):
        return pair[1]

    pairs = list(counter.items())
    pairs.sort(key=count_of, reverse=True)
    return pairs


def cluster_keywords(keywords, num_clusters):
    """Cluster keyword strings with k-means over their TF-IDF vectors.

    Args:
        keywords: Keyword strings to vectorize and cluster.
        num_clusters: Number of clusters requested from KMeans.

    Returns:
        The fitted model's ``labels_`` array, i.e. one cluster index per
        input keyword.
    """
    # Convert the keywords to a TF-IDF matrix.
    tfidf_matrix = TfidfVectorizer().fit_transform(keywords)

    # random_state fixes the seed. n_init is passed explicitly because the
    # library default changed from 10 to "auto" in scikit-learn 1.4; pinning it
    # keeps cluster assignments the same across versions and avoids the
    # FutureWarning emitted by the releases in between.
    kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init=10)
    kmeans.fit(tfidf_matrix)

    return kmeans.labels_


def lowr(x):
    """Return the text in lower case."""
    return x.lower()


def alfa(x):
    """Remove every character that is not an ASCII letter or whitespace."""
    return re.sub(r"[^a-zA-Z\s]", "", x)


def sspc(x):
    """Collapse each run of spaces into a single space."""
    return re.sub(" +", " ", x)


def flat(x):
    """Join an iterable of strings with single spaces."""
    return " ".join(x)


def tkns(x):
    """Split the text into tokens with NLTK's word_tokenize."""
    return word_tokenize(x)


def nstp(x):
    """Lower-case and tokenize the text, drop stop words, and re-join the rest."""
    kept = [token for token in tkns(lowr(x)) if token not in stop_words]
    return flat(kept)

# Establish all counters: every appearance of a suggestion is worth one point.
for kw in table:
    c[kw[2]] += 1

# Histogram of cleaned deltas (the suggestion text left after removing the
# seed and stop words).
root_hist = Counter()
for kw in table:
    query, position, suggestion = kw

    # Give suggestions from seed term a boost.
    if query == original_query:
        c[suggestion] += 5

    # Boost each keyword based on its suggestion position. Positions that are
    # not in the lookup table receive the minimum bonus of 1 instead of
    # raising KeyError, so a longer-than-expected suggestion list cannot
    # abort the scoring pass.
    c[suggestion] += inverter.get(position, 1)

    # Boost when highly specific yet still suggested: 3 points for every word
    # beyond the first that the suggestion adds to the seed.
    # NOTE(review): str.replace removes every occurrence of the seed text,
    # including inside longer words - confirm that is intended.
    delta = suggestion.replace(original_query, "").strip()
    delta_num_words = len(delta.split())
    if delta_num_words > 1:
        c[suggestion] += (delta_num_words - 1) * 3

    # Create histogram of cleaned deltas
    root = nstp(delta)
    if root:
        root_hist[root] += 1

# Second pass, run once the histogram is complete: each suggestion gains
# 2 points per occurrence of its root (root_hist returns 0 for unseen roots).
for kw in table:
    query, position, suggestion = kw
    delta = suggestion.replace(original_query, "").strip()
    root = nstp(delta)
    c[suggestion] += root_hist[root] * 2

# sort_counter_descending(c)

In [None]:
# The keywords to cluster are the distinct cleaned roots gathered while scoring.
keywords = list(root_hist.keys())
num_keywords = len(keywords)
if num_keywords == 0:
    raise ValueError("No keywords to cluster: root_hist is empty.")

# Try progressively fewer clusters (num_keywords / 2, / 3, ...) and stop at the
# first divisor for which at least 3 keywords are assigned to cluster 0; the
# divisor just before that one is then used for the final clustering.
print("Finding least number of clusters that produce only 2 empty groups.")
for divisor in range(2, int(num_keywords / 2)):
    num_clusters = int(num_keywords / divisor)
    print(f"Keywords: {num_keywords}")
    print(f"Number of Clusters: {num_clusters}")
    cluster_assignments = cluster_keywords(keywords, num_clusters)
    # print(cluster_assignments)
    # Number of keywords whose assigned label is 0.
    num_zeros = np.count_nonzero(cluster_assignments == 0)
    max_group = np.amax(cluster_assignments)
    print(f"Max: {max_group}")
    print(f"Number of zeros: {num_zeros}")
    if num_zeros >= 3:
        denominator = divisor - 1
        print(f"Denominator to use: {int(denominator)}")
        break
else:
    # Reached when the loop never breaks: either there were too few keywords
    # for any divisor to be tried, or none met the threshold. Fall back to the
    # largest divisor tried (or 1 if none) so `denominator` is always defined.
    denominator = max(1, int(num_keywords / 2) - 1)
    print(f"Denominator to use: {int(denominator)}")
num_clusters = int(num_keywords / denominator)
cluster_assignments = cluster_keywords(keywords, num_clusters)
print(cluster_assignments)
print("Done")

In [None]:
# Map each keyword to the cluster it was assigned to.
cluster_dict = {
    keyword: cluster_assignments[index] for index, keyword in enumerate(keywords)
}
# cluster_dict

In [None]:
# Group the keywords by cluster, using the cluster numbers as group names.

sublist_dict = {}
for index, keyword in enumerate(keywords):
    group_number = cluster_assignments[index]
    # setdefault creates the list the first time a group number is seen and
    # returns the existing list afterwards.
    sublist_dict.setdefault(group_number, []).append(keyword)
print("Done")

In [None]:
# Counts the 2-word combos that occur in a list of keywords.
def common2(keywords):
    """Count adjacent two-word combinations across keyword phrases.

    Each keyword is split on whitespace and every pair of neighbouring words
    is joined with a single space and counted. Keywords with fewer than two
    words contribute nothing.

    Returns:
        A Counter mapping each "word1 word2" string to its number of
        occurrences.
    """

    def adjacent_pairs():
        for phrase in keywords:
            tokens = phrase.split()
            for left, right in zip(tokens, tokens[1:]):
                yield f"{left} {right}"

    return Counter(adjacent_pairs())

In [None]:
# Replace each numbered group name with the group's most frequent 2-word combo
# that no other group has already claimed. Groups with no unclaimed combo
# (for example groups made only of single-word keywords) are left unnamed.
named_sublist_dict = {}
used_names = set()  # names already handed out, so no two groups share one
p = False  # set to True to print per-group debugging details
for group_number, group_keywords in sublist_dict.items():
    best2s = common2(group_keywords)
    if p:
        print(f"Keywords: {group_keywords}")
        print(f"The best 2-word combos: {best2s}")
    # most_common() walks the combos from highest count to lowest; iterating
    # the Counter directly would follow first-seen order instead.
    for candidate, _count in best2s.most_common():
        # Compare against names already used. Membership in named_sublist_dict
        # itself would test the group-number keys and never match.
        if candidate not in used_names:
            named_sublist_dict[group_number] = candidate
            used_names.add(candidate)
            if p:
                print(f'"{candidate}" used for group number {group_number}.')
            break
print("Done")

In [None]:
# Pair every named group with its member keywords; the group's name is the
# 2-word combo chosen for it. Groups without a name are skipped.
named_clusters = [
    (named_sublist_dict[group_id], members)
    for group_id, members in sublist_dict.items()
    if group_id in named_sublist_dict
]
# Group name -> set of the name itself plus all member keywords.
dict_o_sets = {name: set([name] + members) for name, members in named_clusters}

# Suggestion text with the seed removed -> its score. Entries are inserted from
# highest score to lowest, so a repeated key keeps the last (lowest) value.
word_values = {
    text.replace(original_query, "").strip(): value
    for text, value in sort_counter_descending(c)
}

# For each group, collect the scores of the members that have one.
group_values = [
    (name, [word_values[member] for member in member_set if member in word_values])
    for name, member_set in dict_o_sets.items()
]
# Mean member score per group, rounded to 2 decimals; groups with no scored
# member are dropped.
group_scores = [
    (name, round(sum(values) / len(values), 2)) for name, values in group_values if values
]
most_valuable_groups = dict(sorted(group_scores, key=lambda pair: pair[1], reverse=True))

# One output row per (group, keyword), groups ordered by descending score.
# NOTE: this rebinds `table`, replacing the suggestion rows gathered earlier.
table = [
    (group_name, keyword, score)
    for group_name, score in most_valuable_groups.items()
    for keyword in dict_o_sets[group_name]
]

df2 = pd.DataFrame(table, columns=["Group Name", "Keyword", "Group Score"])

cluster_csv = "cluster-" + original_query.replace(" ", "_") + ".csv"
df2.to_csv(f"{namespace}/{cluster_csv}", index=False)
print("Done")