In [1]:
import re
import pickle
import random
import os
import sys

import numpy as np
import pandas as pd
# import hdbscan
# import umap
import sklearn.cluster as cluster
import plotly.express as px
import matplotlib.pyplot as plt
import clusteval
import compress_fasttext

from typing import List
from sklearn.decomposition import PCA
from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_score,
    completeness_score,
    v_measure_score,
    silhouette_score
)
from scipy.linalg import norm
from scipy.spatial.distance import pdist, squareform
from stop_words import get_stop_words
from gensim.models import FastText

SEED = 2023
VECTOR_SIZE = 16
DATA_PATH = '../data/'
MODEL_PATH = '../nlp_model/'

In [2]:
SCRIPT_DIR = os.path.dirname(os.path.abspath('./jupyter_hb'))
sys.path.append(os.path.dirname(SCRIPT_DIR))

from lib.nlp_utils import Preprocessing

In [3]:
with open("../resumes.pickle", "rb") as f:
    new_resumes = pickle.load(f)

In [4]:
VECTORIZER = compress_fasttext.models.CompressedFastTextKeyedVectors.load(MODEL_PATH + 'small_model')

In [5]:
random.seed(SEED)

data = (
    pd.DataFrame
    .from_records(new_resumes)
    .assign(
        one_name=lambda df: df['name'].apply(lambda txt: re.split('[,/.]', txt)[0].strip('./!? '))
    )
    .reset_index()
    .rename(columns={'index': 'id'})
)
proc = Preprocessing()
data['tokens_for_clustering'] = proc.process_texts(data, 'one_name')
clustered_data = (
    data
    .loc[data['tokens_for_clustering'].apply(lambda x: len(x) != 0)]
    .assign(
        ft_vectors=lambda df: df['tokens_for_clustering'].apply(
            lambda txt: np.array([VECTORIZER[token] for token in txt]).mean(axis=0)
        )
    )
)[['id', 'one_name', 'ft_vectors']]

ft_vectors = np.concatenate(
    clustered_data['ft_vectors'].values
).reshape(clustered_data.shape[0], -1)

In [12]:
ALGORITHMS = [
    'K-means',
    'Affinity',
    'Spectral',
    'Agglomerative',
    'DBSCAN',
    'MeanShift'
]


def get_scores(algo_names: List[str], vectors: np.ndarray):
    algorithms = []
    
    if 'K-means' in set(algo_names):
        algorithms.append(cluster.KMeans(n_init=10, random_state=SEED))
    if 'Affinity' in set(algo_names):
        algorithms.append(cluster.AffinityPropagation(random_state=SEED))
    if 'Spectral'in set(algo_names):
        algorithms.append(cluster.SpectralClustering(random_state=SEED))
    if 'Agglomerative' in set(algo_names):
        algorithms.append(cluster.AgglomerativeClustering())
    if 'DBSCAN' in set(algo_names):
        algorithms.append(cluster.DBSCAN())
    if 'MeanShift' in set(algo_names):
        algorithms.append(cluster.MeanShift())
        
    scores = []
    for algo, name in zip(algorithms, algorithms):
        algo.fit(vectors)
        scores.append(({
            'Silhouette': silhouette_score(vectors, algo.labels_)}))

    results = pd.DataFrame(data=scores, columns=['Silhouette'],
                           index=algorithms)
    return results

In [13]:
get_scores(ALGORITHMS, ft_vectors)



Unnamed: 0,Silhouette
"KMeans(n_init=10, random_state=2023)",0.548305
AffinityPropagation(random_state=2023),0.447761
SpectralClustering(random_state=2023),0.517995
AgglomerativeClustering(),0.346512
DBSCAN(),0.478731
MeanShift(),0.300643
