In [None]:
import sys, os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools

# Make the parent directory plus the local `src` and `utils` folders importable from this notebook.
for _subdir in ('..', 'src', 'utils'):
    sys.path.append(os.path.abspath(os.path.join('.', _subdir)))

# import utils.str_manip as str_manip
# import utils.feature_builder as feature_builder
import src.job_title_clustering.cluster as cluster

## Load the original data and extract features

In [None]:
# Read the existing cluster assignments, keeping only the job title and its original cluster label.
orig_df = pd.read_csv('../data/original_clusters.csv')[['Title', 'Cluster']]

### We can use other classes to perform low-level functionality directly (for testing purposes, etc.), but the Cluster class is already subclassed.

In [None]:
# The lower-level helpers could be instantiated directly (kept here for reference):
# sm = str_manip.StrManip()
# fb = feature_builder.FeatureBuilder()
# `cl` is the Cluster instance used below for feature extraction, pipelines, topic analysis and plots.
cl = cluster.Cluster()

In [None]:
# pass in the raw job titles to get out the features used by every pipeline below
# NOTE(review): the meaning of the second positional argument (True) is not visible here — confirm against Cluster.get_features
feats = cl.get_features(orig_df['Title'], True)

## Create the processing pipeline
### Configurations for all possible combinations of the processing pipeline
The kwargs for clustering algorithms are algorithm specific.
Outer arguments are used for "global" variables and pipeline building:
- scaler: choose which scaling factor to use, or None
- dim_reduce: choose which dimensionality reduction algorithm to use, or None
- cluster: choose which clustering algorithm to use.
- n_clusters: the number of clusters to aim for during clustering

In [None]:
# Pipeline configuration: one kwargs dict per clustering algorithm, followed by the
# "global" switches that select the clusterer, scaler, reducer and target cluster count.
cluster_config = dict(
    kmeans_kwargs=dict(
        init="k-means++",
        n_init=50,
        max_iter=500,
        random_state=2901,
    ),
    hdbscan_kwargs=dict(
        min_cluster_size=15,
        min_samples=4,
        metric='euclidean',
    ),
    agglom_kwargs=dict(
        metric='euclidean',
        linkage='ward',
    ),
    cluster="kmeans",  # kmeans, hdbscan, agglom
    scaler="minmax",  # std, minmax
    dim_reduce="pca",  # pca, umap
    n_clusters=7,
)

### See how the data is grouped after dimensionality reduction

In [None]:
# Build and fit a pipeline with PCA as the reducer on a fresh Cluster instance,
# then plot the resulting frame with the original cluster labels passed alongside.
dim_red = cluster.Cluster()
cluster_config.update(dim_reduce='pca')
pipe = dim_red.setup_pipeline(**cluster_config)
pipe.fit(feats)
df = dim_red.run_pipeline(pipe, feats, orig_df['Cluster'])
dim_red.show_simple_scatterplot(df)

The above graph shows that PCA has grouped data into dense clusters of varying shapes. HDBSCAN would be a good candidate clustering algorithm for this dimensionality reduction.

In [None]:
# Same preview as for PCA, but with UMAP selected as the dimensionality reducer.
dim_red = cluster.Cluster()
cluster_config.update(dim_reduce='umap')
pipe = dim_red.setup_pipeline(**cluster_config)
pipe.fit(feats)
df = dim_red.run_pipeline(pipe, feats, orig_df['Cluster'])
dim_red.show_simple_scatterplot(df)

The above graph shows that UMAP has grouped data into more convex-looking clusters. k-means would be a good candidate clustering algorithm for this dimensionality reduction.

### Find the best number of clusters

Let's find the best number of clusters for k-means.

In [None]:
# Select k-means on UMAP-reduced features and search for the best number of clusters;
# the original labels are passed along as the reference column.
cluster_config.update(cluster='kmeans', dim_reduce='umap')
cl.find_best_n_components(feats, cluster_config, orig_df['Cluster'])

Looks like the best value for `n_components` could be 7.
Let's search over hyperparameters for HDBSCAN to get as close as possible to `n_components = 7`.

Running grid search for HDBSCAN...

In [None]:
# Grid search over HDBSCAN's `min_cluster_size` x `min_samples`, one scatter panel per combination.
min_cluster_sizes = range(9, 24, 3)
min_samples_values = range(1, 16, 3)
# Derive the grid shape from the parameter ranges so the panel layout can never drift from them.
n_rows = len(min_cluster_sizes)
n_cols = len(min_samples_values)
# A single figure is enough: the former extra `plt.figure(...)` call only produced an empty figure.
# squeeze=False keeps `ax` two-dimensional even if one of the ranges shrinks to a single value.
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, sharex=True, sharey=True, figsize=(12, 12), squeeze=False)
cluster_config['cluster'] = 'hdbscan'
cluster_config['dim_reduce'] = 'pca'

# Rows follow `min_cluster_size`, columns follow `min_samples` (itertools.product varies the last range fastest).
for i, (x, y) in enumerate(itertools.product(min_cluster_sizes, min_samples_values)):
    # NOTE: the shared config is mutated in place, so it holds the last combination after the loop.
    cluster_config['hdbscan_kwargs']['min_cluster_size'] = x
    cluster_config['hdbscan_kwargs']['min_samples'] = y
    pipe = cl.setup_pipeline(**cluster_config)
    pipe.fit(feats)
    df = cl.run_pipeline(pipe, feats, orig_df['Cluster'])

    # Count real clusters only. Subtracting 1 unconditionally undercounts whenever a run yields no noise points.
    # Assumes `predicted_cluster` carries HDBSCAN's raw labels, where -1 marks noise — TODO confirm.
    predicted = df['predicted_cluster']
    n_clusters = predicted[predicted != -1].nunique()

    # Draw on the explicit Axes instead of mixing in `plt.subplot`, so the shared axes are always the ones used.
    panel = ax[i // n_cols][i % n_cols]
    # NOTE(review): purpose of this fixed horizontal reference line is not evident from the notebook — kept as-is.
    panel.plot([-1, 1], [-1.6, -1.6], color='k', lw=2)
    panel.set_title(f"({x}, {y}), size {n_clusters}", fontsize='small')
    panel.scatter(df['component_1'], df['component_2'], marker='o', c=df['predicted_cluster'], s=25, edgecolors='k')

Looks like `min_cluster_size=15` and `min_samples=4` are good values for the hyperparameters, giving us 7 clusters.

In [None]:
# Pin the chosen settings; the grid search above left its last tried combination in the config.
cluster_config['n_clusters'] = 7
cluster_config['hdbscan_kwargs'].update(min_cluster_size=15, min_samples=4)

In [None]:
# Repeat the search with HDBSCAN on PCA-reduced features, again passing the original labels as reference.
cluster_config.update(cluster='hdbscan', dim_reduce='pca')
cl.find_best_n_components(feats, cluster_config, orig_df['Cluster'])

We can see that performance is fairly good at 7 clusters for HDBSCAN as well. Even though performance increases with more components, I want to keep the analysis of the clusters simple for now.

### Choose a clustering algorithm and set up the pipeline

We start with k-means + UMAP.

In [None]:
# k-means on top of UMAP: build the pipeline, force the reducer to two output components, then fit.
cluster_config.update(cluster='kmeans', dim_reduce='umap')
pipe = cl.setup_pipeline(**cluster_config)
pipe["dim_reducer"][cluster_config['dim_reduce']].n_components = 2
pipe.fit(feats)

# Keep the result frame and the labels that `cl` exposes as `pred_lbl` after the run.
kmeans_df = cl.run_pipeline(pipe, feats, orig_df['Cluster'])
kmeans_labels = cl.pred_lbl

In [None]:
# Plot the k-means + UMAP result frame returned by run_pipeline.
cl.show_scatterplot(kmeans_df)

Now we do the same for HDBSCAN + PCA.

In [None]:
# HDBSCAN on top of PCA: build the pipeline, force the reducer to two output components, then fit.
cluster_config.update(cluster='hdbscan', dim_reduce='pca')
pipe = cl.setup_pipeline(**cluster_config)
pipe["dim_reducer"][cluster_config['dim_reduce']].n_components = 2
pipe.fit(feats)

# Keep the result frame and the labels that `cl` exposes as `pred_lbl` after the run.
hdbscan_df = cl.run_pipeline(pipe, feats, orig_df['Cluster'])
hdbscan_labels = cl.pred_lbl

In [None]:
# Plot the HDBSCAN + PCA result frame returned by run_pipeline.
cl.show_scatterplot(hdbscan_df)

## Analyze and create new clusters

The following graphs show top extracted words using NMF, PLSA, and LDA. These top words can be used to create or enhance clusters.

In [None]:
# Extract 15 top words for each of 7 topics.
# `top_words` is indexed below as top_words[model][topic], with the models 'nmf', 'plsa' and 'lda'.
top_words = cl.topic_analysis(orig_df['Title'], feats, n_components=7, n_top_words=15)

We can use LLMs to help us categorize these groups of top words. The output of the LLM will not be directly used as a cluster definition, but rather as a starting point for generating ideas about how to define the clusters.

In [None]:
import ollama

instruction_prompt = "You are a helpful chatbot. Use only the following pieces of context to answer the question. Don't make up any new information:\n"
user_prompt = "Categorize the following group of data into one category, using as few words as possible:\n"


def _categorize(words):
    """Ask the Ollama-served model for a short category name describing one group of top words."""
    reply = ollama.chat(
        model='hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF',
        messages=[
            {'role': 'system', 'content': instruction_prompt},
            {'role': 'user', 'content': user_prompt + ','.join(words)},
        ],
    )
    return reply['message']['content']


# One list of suggested category names per topic model, in the same order as that model's word groups.
categories = {
    name: [_categorize(words) for words in groups]
    for name, groups in top_words.items()
}

categories

In [None]:
# put top words into a dataframe for easier visual comparison
tw_df = pd.DataFrame.from_dict(top_words).explode(['nmf', 'plsa', 'lda']).reset_index()
tw_nmf_df = tw_df[['index', 'nmf']].sort_values(by=['index', 'nmf']).reset_index(drop=True)
tw_plsa_df = tw_df[['index', 'plsa']].sort_values(by=['index', 'plsa']).reset_index(drop=True)
tw_lda_df = tw_df[['index', 'lda']].sort_values(by=['index', 'lda']).reset_index(drop=True)
tw_df = tw_nmf_df[['nmf']].join(tw_plsa_df, how='left')
tw_df = tw_df[['nmf', 'plsa']].join(tw_lda_df, how='left')

In [None]:
# Show the side-by-side top words whose `index` value is 6 (the last of the 7 topics, counting from 0).
tw_df[tw_df['index'] == 6]

Taking these categories, together with the top words, we can define our own clusters. These cluster definitions and their associated top words can then be used in a KNN algorithm to assign new clusters to our original data.

In [None]:
# Hand-curated category for every extracted topic: one list per topic model, and the
# position in each list matches the position of the topic in `top_words[model]`.
curated_categories = dict(
    nmf=[
        "Marketing",
        "Product Management",
        "Leadership",
        "Data Analytics",
        "Leadership",
        "IT Engineering",
        "Data Analytics",
    ],
    plsa=[
        "Product Management",
        "Product Management",
        "Leadership",
        "Data Analytics",
        "Product Management",
        "IT Engineering",
        "Data Analytics",
    ],
    lda=[
        "Data Analytics",
        "Leadership",
        "Data Analytics",
        "Product Management",
        "Product Management",
        "Data Analytics",
        "Leadership",
    ],
)

First, we compile the top words from our topic extraction into feature vectors.

In [None]:
# Words dropped from every category: a hand-curated list of terms that show up across
# several topics and are ambiguous or carry little meaning in this context.
remove_words = [
    'executive', 'vice', 'manager', 'senior', 'head', 'director', 'president', 'chief'
]

# Pool the extracted words of every topic under its curated category, across all models.
top_words_by_cat = {}
for model, cats in curated_categories.items():
    for topic_idx, cat in enumerate(cats):
        pooled = top_words_by_cat.setdefault(cat, set())
        pooled.update(w for w in top_words[model][topic_idx] if w not in remove_words)

# Alphabetical category order fixes which column/row belongs to which category.
top_cats = sorted(top_words_by_cat)
total_categories = len(top_cats)

# Multi-hot encode each category over the vocabulary of the original data:
# filled as (vocabulary, category) and transposed at the end to one row per category.
top_words_feats = np.zeros((len(cl.vocab), total_categories))
for col, curr_cat in enumerate(top_cats):
    for word in top_words_by_cat[curr_cat]:
        top_words_feats[cl.vocab.index(word)][col] = 1

top_words_feats = top_words_feats.T

In [None]:
# Show the sorted category names; their order matches the rows of `top_words_feats`.
top_cats

Then, we scale the features and run dimensionality reduction (I am using PCA and UMAP for each clustering algorithm).

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from umap import UMAP

# Shared transformer objects; later cells reuse these same instances.
scaler = MinMaxScaler()
pca = PCA(n_components=2, random_state=2901)
ump = UMAP(n_components=2, n_neighbors=5, random_state=2901)

# Reduce the category vectors to two components with PCA, then min-max scale the result.
top_words_feats_pca = scaler.fit_transform(pca.fit_transform(top_words_feats.copy()))
top_words_pca_df = pd.DataFrame(top_words_feats_pca)

# Same two steps with UMAP; note that this re-fits the shared scaler.
top_words_feats_ump = scaler.fit_transform(ump.fit_transform(top_words_feats.copy()))
top_words_ump_df = pd.DataFrame(top_words_feats_ump)

The scatterplots show where the centroid of each category resides.

In [None]:
# Plot the scaled PCA coordinates of the category vectors (column 0 on x, column 1 on y).
plt.scatter(x=top_words_pca_df[0], y=top_words_pca_df[1])

In [None]:
# Plot the scaled UMAP coordinates of the category vectors (column 0 on x, column 1 on y).
plt.scatter(x=top_words_ump_df[0], y=top_words_ump_df[1])

In [None]:
# Start from the original titles/labels and add one column per KNN variant, defaulting to 'Other'.
new_df = orig_df.copy()
new_df['new_pca_cluster'] = 'Other'
new_df['new_ump_cluster'] = 'Other'

# The KNN classifiers compare title coordinates with category coordinates, so both sets of
# points must come from the SAME fitted reducer and scaler. Re-fitting `pca`/`ump`/`scaler`
# here on the titles while the category vectors keep coordinates from an earlier, separate fit
# leaves the two sets in unrelated 2-D spaces. Fit on the titles and project the category
# vectors with those fitted transformers, rebinding `top_words_feats_pca`/`top_words_feats_ump`
# so that the classifiers are trained in the matching space.
# NOTE(review): assumes `feats` and `top_words_feats` share the same columns (the `cl.vocab` vocabulary) — confirm.
new_pca_feats = scaler.fit_transform(pca.fit_transform(feats))
# Must happen before the shared `scaler` is re-fitted for UMAP below.
top_words_feats_pca = scaler.transform(pca.transform(top_words_feats))

new_ump_feats = scaler.fit_transform(ump.fit_transform(feats))
top_words_feats_ump = scaler.transform(ump.transform(top_words_feats))

Finally, we can set up the KNN classifier. I tried out different values for `n_neighbors`, `weights`, and `metric` that eventually gave me some reasonable results.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# There is exactly one training point per curated category, so the neighbour count must not
# exceed the number of categories: a larger value makes `predict` fail. Capping it keeps the
# current behaviour (5 categories -> 5 neighbours) and survives a shorter category list.
knn_k = min(5, len(top_cats))
pca_knn = KNeighborsClassifier(n_neighbors=knn_k, weights='distance')
ump_knn = KNeighborsClassifier(n_neighbors=knn_k, weights='distance')

# Train on the projected category vectors; `top_cats` supplies the label for each row.
pca_knn.fit(top_words_feats_pca, top_cats)
ump_knn.fit(top_words_feats_ump, top_cats)

In [None]:
# Label every title in one batched call per classifier and assign the whole column at once.
# The previous row-by-row `new_df[col].at[idx] = ...` was a chained assignment: with pandas
# Copy-on-Write it updates a temporary Series and leaves `new_df` unchanged. Predictions are
# assigned by position, which matches the row order of the features the titles were projected from.
new_df['new_pca_cluster'] = pca_knn.predict(new_pca_feats)
new_df['new_ump_cluster'] = ump_knn.predict(new_ump_feats)

Save the new clusters to a CSV file.

In [None]:
# Persist the titles with their original and newly assigned cluster labels
# (with the default settings the DataFrame index is written as an extra unnamed first column).
new_df.to_csv('../data/new_clusters.csv')

In [None]:
# Reload the saved assignments, keeping the title and the two new cluster columns.
new_df = pd.read_csv('../data/new_clusters.csv')[['Title', 'new_pca_cluster', 'new_ump_cluster']]

# A fresh Cluster instance turns the reloaded raw titles into features.
ncl = cluster.Cluster()
nfeats = ncl.get_features(new_df['Title'], True)

Here are a couple of graphs showing the performance of k-means and HDBSCAN over the data now.

In [None]:
# k-means on UMAP-reduced features, this time with the new UMAP-based labels as the reference column.
cluster_config.update(cluster='kmeans', dim_reduce='umap')
ncl.find_best_n_components(nfeats, cluster_config, new_df['new_ump_cluster'])

In [None]:
# HDBSCAN on PCA-reduced features, this time with the new PCA-based labels as the reference column.
cluster_config.update(cluster='hdbscan', dim_reduce='pca')
ncl.find_best_n_components(nfeats, cluster_config, new_df['new_pca_cluster'])

Both graphs show much better results this time!

We can also look at the clustering results again.

In [None]:
# k-means on UMAP again, now asking for 5 clusters and passing the new UMAP-based labels as reference.
cluster_config.update(cluster='kmeans', dim_reduce='umap', n_clusters=5)
new_pipe = ncl.setup_pipeline(**cluster_config)
# force the reducer to two output components before fitting
new_pipe["dim_reducer"][cluster_config['dim_reduce']].n_components = 2
new_pipe.fit(nfeats)

new_kmeans_df = ncl.run_pipeline(new_pipe, nfeats, new_df['new_ump_cluster'])

In [None]:
# Plot the k-means + UMAP result for the re-labelled data.
ncl.show_scatterplot(new_kmeans_df)

In [None]:
# HDBSCAN on PCA again, with adjusted density settings and the new PCA-based labels as reference.
cluster_config.update(cluster='hdbscan', dim_reduce='pca')
cluster_config['hdbscan_kwargs'].update(min_cluster_size=20, min_samples=3)
new_pipe = ncl.setup_pipeline(**cluster_config)
# force the reducer to two output components before fitting
new_pipe["dim_reducer"][cluster_config['dim_reduce']].n_components = 2
new_pipe.fit(nfeats)

new_hdbscan_df = ncl.run_pipeline(new_pipe, nfeats, new_df['new_pca_cluster'])

In [None]:
# Plot the HDBSCAN + PCA result for the re-labelled data.
ncl.show_scatterplot(new_hdbscan_df)

I noticed how HDBSCAN is having trouble with the less dense groupings of data near the top and bottom of the graph, so I decided to try an Agglomerative Clustering algorithm.

In [None]:
# Agglomerative clustering on PCA, asking for 4 clusters and passing the new PCA-based labels as reference.
cluster_config.update(cluster='agglom', dim_reduce='pca', n_clusters=4)
new_pipe = ncl.setup_pipeline(**cluster_config)
# force the reducer to two output components before fitting
new_pipe["dim_reducer"][cluster_config['dim_reduce']].n_components = 2
new_pipe.fit(nfeats)

new_agglom_df = ncl.run_pipeline(new_pipe, nfeats, new_df['new_pca_cluster'])

In [None]:
# Plot the agglomerative clustering + PCA result for the re-labelled data.
ncl.show_scatterplot(new_agglom_df)

The groupings look much better with the Agglomerative Clustering algorithm!

Also of note is that the "IT Engineering" cluster is not shown in the graphs that used PCA. It is most likely that those data points were not picked up during KNN calculations because of the dimensionality reduction process.

It looks like the clusterings are more homogeneous now, but additional work needs to be done to better define these topics. Next steps could be to investigate the clusters for patterns, and/or to involve subject-matter experts to refine definitions. We can also try out different cluster sizes and number of clusters, given that the definition and distribution of the data may have changed during analysis.