## Tweakable Parameters

In [31]:
PARAMS = dict(
    
    MAX_DOCS=None,                    # for quick code testing - int or None (all docs)
    MIN_LEN=3,                        # words less than this length will be filtered
    MAX_LEN=100,                      # words more than this length will be filtered
    
    DISALLOWED_NERS=[                 # Named-entities to filter out
                                      # See https://github.com/explosion/spaCy/blob/b7ba7f78a28ef71fca60415d0165e27a058d1946/spacy/glossary.py#L318
        'PERSON',
        'GPE',
        'ORG'
    ],
    BIGRAM=False,                     # Form bigrams before creating corpus?
    BIGRAM_MIN_PMI=5,                 # Min. PMI in order to create bigrams (determine by manual inspection of generated bigrams.txt)
    BIGRAM_MIN_FREQ=20,               # Min. freq of co-occurring tokens before they can be considered a bigram

    COMMON_WORDS_MAX_FREQUENCY=10000, # For root words, the max. frequency beyond which they're not useful
    COMMON_WORDS_MAX_DOCS=0.5,        # For root words, max docs (absolute or relative) beyond which they're not useful
    COMMON_WORDS_MIN_DOCS=5,          # For root words, min docs (absolute or relative) beyond which they're not useful

    KEEP_TOKENS=[],                   # Root words to preserve in the vocabulary regardless of their frequency (high or low)


    WORD2VEC_VECTOR_SIZE=200,
    WORD2VEC_WINDOW=10,
    WORD2VEC_EPOCHS=30,

    AFFINITY_N_DOCS=None,
    AFFINITY_DAMPING=0.7,
    AFFINITY_MAX_ITER=400,

    # Parameters that determine initial preference values

    # How many tags to consider (most common to least common)
    AFFINITY_PREFERENCE_N_TAGS=100,

    # How many clusters to form based solely on the most common descriptions (before invoking AffinityModel)
    N_PRECLUSTERS=30,

    # How many documents with common descriptions to use as 'seed' documents for AffinityModel clustering?
    AFFINITY_PREFERENCE_N_COMMON_DESCRIPTIONS=20,

    # Default preference value for non-exemplar docs - None to auto-determine
    AFFINITY_PREFERENCE_DEFAULT=None,

    # Default preference for exemplar docs - None to auto-determine
    AFFINITY_PREFERENCE_EXEMPLAR=0,
)

In [32]:
import sys
from collections import Counter
import textwrap
from glob import glob
from pprint import pprint
import os
import shutil
import logging
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import cycle

import nltk
import pyLDAvis
import pyLDAvis.gensim_models
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Word2Vec
import spacy
from gensim.models.ldamodel import LdaModel
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

from sklearn.cluster import AffinityPropagation
from sklearn import metrics

from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA

from ldamallet import LdaMallet


logging.basicConfig(level=os.environ.get('LOGLEVEL', 'INFO'))
logger = logging.getLogger(__name__)
sns.set()

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


### Useful functions

In [33]:
def process_words(texts, stop_words, disallowed_ners=None, min_len=3, max_len=30):
    # python -m spacy download en_core_web_sm
    # English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
    # Other pipelines at https://spacy.io/models/en
    nlp = spacy.load('en_core_web_sm')

    texts_out = []

    # implement lemmatization and filter out unwanted part of speech tags
    for i, sentence in enumerate(texts):
        doc = nlp(sentence)
        doctext = doc.text
        ents = list(doc.ents)

        if disallowed_ners is not None:
            # Filtering out disallowed NERs should be done prior to splitting the sentence using whitespace.
            disallowed_tokens = []
            for ent in ents:
                if ent.label_ in disallowed_ners:
                    disallowed_tokens.append(ent.text.lower())

        tokens = [token.lemma_ for token in doc]
        # simple_preprocess => lowercase; ignore tokens that are too short or too long
        tokens = [t for t in simple_preprocess(' '.join(tokens), deacc=False, min_len=min_len, max_len=max_len)
                  if t not in stop_words and t not in disallowed_tokens]
        texts_out.append(tokens)

    return texts_out


def plot_word2vec_model(model):
    words = list(model.wv.key_to_index)
    X = model.wv[words]
    pca = PCA(n_components=2)
    result = pca.fit_transform(X)
    plt.scatter(result[:, 0], result[:, 1])
    for i, word in enumerate(words):
        plt.annotate(word, xy=(result[i, 0], result[i, 1]))


def vectorize(list_of_docs, model):
    features = np.zeros((len(list_of_docs), model.vector_size))
    for i, tokens in enumerate(list_of_docs):
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features[i, :] = np.mean(vectors, axis=0)

    return features

def file_to_set(filepath, split_lines=True):
    s = set() 
    lines = open(filepath, 'r').read().splitlines()
    for line in lines:
        line = line.strip().lower()
        if line and not line.startswith('#'):
            if split_lines:
                for t in line.split():
                    s.add(t)
            else:
                s.add(line) 
    return s


def generate_html(docs_df, wv, af, X, output_dir, filename):
    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    logger.info(f'Estimated number of clusters: {n_clusters_}')

    n_preclusters = len(docs_df[docs_df.cluster_id != -1].cluster_id.unique())
    # The dataframe representing the docs that af/X see
    model_docs_df = docs_df[docs_df.cluster_id == -1]

    # Shift label ids identified by the AffinityModel up
    labels = n_preclusters + af.labels_

    with open(os.path.join(output_dir, filename), 'w') as f:
        for i in range(n_preclusters):
            _df = docs_df[docs_df.cluster_id == i]
            f.write(f'<hr/><b>Cluster {i} ({len(_df)} docs)</b><hr/>')
            f.write(f'<i>Pre-cluster based on description:</i> {_df.iloc[0].description}<hr/>')
            for _, row in _df.iterrows():
                f.write(f'<a target="_blank" href="{row.url}">{row.pgpid}</a><br/>')

        for label in np.unique(labels):
            docs = np.where(labels == label)[0]
            docs_mean_vector = X[docs].mean(axis=0)
            terms = ', '.join([term for (term, _) in wv.most_similar(docs_mean_vector)])
            f.write(f'<hr/><b>Cluster {label} ({len(docs)} docs)</b><hr/>')
            f.write(f'<i>{terms}</i><hr/>')
            for doc in docs:
                row = model_docs_df.iloc[doc]  # important to use iloc here, not loc
                f.write(f'<a target="_blank" href="{row.url}">{row.pgpid}</a><br/>')
                f.write('Tags: <i>' + str(row.tags) + '</i>')
                if row['is_exemplar']:
                    f.write('<p style="color:red;">' + str(row.description) + '</p>')
                else:
                    f.write('<p>' + str(row.description) + '</p>')

## Unique Run ID

The following logic creates a unique Run ID. Parameter values, input and output files are copied to the `results/<run_ID>` folder. You can manually override the Run ID here if you wish to overwrite results.

In [34]:
existing_runs = sorted([d for d in os.listdir('results') if os.path.isdir(f'results/{d}')])
if existing_runs:
    run_id = int(existing_runs[-1]) + 1
else:
    run_id = 1
# run_id = 42  // Uncomment and specify an explicit Run ID here
run_id = f'{run_id:04}'
logger.info(f'Using Run ID {run_id}. Results will be stored in the results/{run_id} folder.')

output_dir = os.path.join('results', run_id)
os.makedirs(output_dir, exist_ok=True)
logger.addHandler(logging.FileHandler(os.path.join(output_dir, 'log.txt')))
with open(os.path.join(output_dir, 'params.txt'), 'w') as f:
    pprint(PARAMS, f)

_stop_words = []
os.makedirs(f'{output_dir}/stopwords', exist_ok=True)
for filename in glob('stopwords/*.txt'):
    _stop_words.extend(list(file_to_set(filename)))
    shutil.copy(filename, os.path.join(output_dir, filename))
stop_words = _stop_words + nltk.corpus.stopwords.words('english')

_bad_tags = []
os.makedirs(f'{output_dir}/stoptags', exist_ok=True)
for filename in glob('stoptags/*.txt'):
    _bad_tags.extend(list(file_to_set(filename)))
    shutil.copy(filename, os.path.join(output_dir, filename))

INFO:__main__:Using Run ID 0002. Results will be stored in the results/0002 folder.


### Read Descriptions

Add any calculated columns to the Dataframe here.

In [35]:
df = pd.read_csv('descriptions.csv', dtype={'tags': str})[:PARAMS['MAX_DOCS']]
logger.info(f'No. of records = {len(df)}')
df = df.dropna(subset=['description'])
logger.info(f'After dropping records with missing description, no. of records = {len(df)}')
df['tags'] = df['tags'].str.lower()
df['tags'].fillna('', inplace=True)

# -------------- Add additional columns to Dataframe --------------- #
df['cluster_id'] = -1  # will be populated by this script
df['preference'] = 0
df['is_exemplar'] = False
# -------------- Add additional columns to Dataframe --------------- #

INFO:__main__:No. of records = 5000
INFO:__main__:After dropping records with missing description, no. of records = 4967


### Find common descriptions

Very common descriptions should form their own individual clusters ("pre-clusters"). These mostly have descriptions of the form `image missing` / `see Gotein index` etc.

In [36]:
descriptions = Counter()
for i, row in df.iterrows():
    descriptions.update([row.description])

common_descriptions_with_counts = descriptions.most_common(PARAMS['N_PRECLUSTERS'] + PARAMS['AFFINITY_PREFERENCE_N_COMMON_DESCRIPTIONS'])
common_descriptions = [t[0] for t in common_descriptions_with_counts]
for i, desc in enumerate(common_descriptions[:PARAMS['N_PRECLUSTERS']]):
    df.loc[df[df.description == desc].index, 'cluster_id'] = i

df_complete = df.copy()
# Filter these out from further processing
df = df[df.cluster_id == -1]
logger.info(f'After filtering out {PARAMS["N_PRECLUSTERS"]} most common descriptions, no. of records = {len(df)}')

INFO:__main__:After filtering out 30 most common descriptions, no. of records = 4865


### Find common tags

In [37]:
tags = Counter()
tag_dict = {}
for i, row in df.iterrows():
    try:
        _tags = row.tags.split(',')
    except:
        continue
    else:
        for tag in _tags:
            tag = tag.strip().lower().replace(':', '').replace('.', '').replace(';', '').replace('(', '').replace(')', '')
            if tag not in _bad_tags:
                tags.update([tag])

common_tags = [t[0] for t in tags.most_common(PARAMS['AFFINITY_PREFERENCE_N_TAGS'])]

### Create a Word2Vec model

After filtering words using POS tagging, lemmatization etc, we create a Word2Vec model with a given vector size, window length etc.

In [38]:
data_pkl_file = os.path.join(output_dir, 'data.pik')
if not os.path.exists(data_pkl_file):
    data = list(df.description)
    data = process_words(
        data,
        stop_words=stop_words,
        disallowed_ners=PARAMS['DISALLOWED_NERS'],
        min_len=PARAMS['MIN_LEN'],
        max_len=PARAMS['MAX_LEN']
    )
    logger.info(f'After filtering stopwords/short words/lemmatization, no. of records = {len(data)}')

    with open(data_pkl_file, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
else:
    with open(data_pkl_file, 'rb') as f:
        data = pickle.load(f)

if not os.path.exists(os.path.join(output_dir, 'wvmodel.bin')):
    model = Word2Vec(
        sentences=data,
        vector_size=PARAMS['WORD2VEC_VECTOR_SIZE'],
        workers=8,
        sg=1,
        window=PARAMS['WORD2VEC_WINDOW'],
        epochs=PARAMS['WORD2VEC_EPOCHS']
    )
    model.save(os.path.join(output_dir, 'wvmodel.bin'))
    with open(os.path.join(output_dir, 'wvmodel_keys.txt'), 'w') as f:
        f.write('\n'.join(model.wv.index_to_key))
else:
    model = Word2Vec.load(os.path.join(output_dir, 'wvmodel.bin'))

wv = model.wv

INFO:__main__:After filtering stopwords/short words/lemmatization, no. of records = 4865
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:collected 9791 word types from a corpus of 112458 raw words and 4865 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 2503 unique words (25.564293739148198%% of original 9791, drops 7288)', 'datetime': '2022-06-07T08:46:06.826728', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]', 'platform': 'Linux-4.18.0-348.23.1.el8_5.x86_64-x86_64-with-glibc2.17', 'event': 'prepare_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 101359 word corpus (90.13053762293478%% of original 112458, drops 11099)', 'datetime': '2022-06-07T08:46:06.829639', 'gensim': 

INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 5 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 4 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:EPOCH 7 - PROGRESS: at 85.24% examples, 58104 words/s, in_qsize 2, out_qsize 1
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 7 : training on 112458 raw words (84795 effective words) took 1.2s, 69215 effective words/s
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 7 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 6 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 5 more

INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 6 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 5 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 4 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:EPOCH 16 - PROGRESS: at 85.24% examples, 56962 words/s, in_qsize 2, out_qsize 1
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 16 : training on 112458 raw words (84931 effective words) took 1.2s, 68390 effective words/s
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 7 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 6 mo

INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 7 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 6 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 5 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 4 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads
INFO:gensim.models.word2vec:EPOCH 25 - PROGRESS: at 85.24% examples, 58571 words/s, in_qsize 2, out_qsize 1
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 2 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 1 more threads
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 0 more threads
INFO:gensim.models.word2vec:EPOCH - 25 : training on 112458 raw words (84938 effective words) took 1.2s, 69614 effective words/s
INFO:gensim.models.word2vec:worker thread finished; awaiting finish of 7 mo

### Run AffinityPropagation (first pass)

We run AffinityPropagation once to determine the final converged preference values. This will allow us to set the *initial preference* of the documents and the exemplars to suitable values such that exemplars are likely to end up as centers of clusters.

For a detailed explanation, see http://genes.toronto.edu/affinitypropagation/faq.html#clusters

In [39]:
n_docs = PARAMS['AFFINITY_N_DOCS']
X = vectorize(data[:n_docs], model=model)
logger.info(f'Vectorization of {n_docs} documents done.')

_default_preference = PARAMS['AFFINITY_PREFERENCE_DEFAULT']
_exemplar_preference = PARAMS['AFFINITY_PREFERENCE_EXEMPLAR']

if _default_preference is None and _exemplar_preference is None:
    logger.info('Fitting AffinityPropagation model to documents..')
    af = AffinityPropagation(verbose=True, damping=PARAMS['AFFINITY_DAMPING']).fit(X)
    generate_html(df_complete, wv, af, X, output_dir, 'affinity_clustering.html')

    # If we didn't specify starting preference values, it would have been the median (for all data points):
    median_preference = np.median(af.affinity_matrix_)
    logger.info(f'Median Preference Value of AF model: {median_preference}')
    min_preference = np.min(af.affinity_matrix_)
    logger.info(f'Min Preference Value of AF model: {min_preference}')
    max_preference = np.max(af.affinity_matrix_)
    logger.info(f'Max Preference Value of AF model: {max_preference}')

    exemplar_preference = max_preference
    df['preference'] = min_preference - (max_preference - min_preference)
else:
    exemplar_preference = _exemplar_preference
    df['preference'] = _default_preference

for common_tag in common_tags:
    matching_indices = np.where(df.tags.str.startswith(common_tag))[0]
    if len(matching_indices) > 0:
        randomly_selected_doc_index = np.random.choice(matching_indices, 1)[0]
        logger.info(f'Using doc id {randomly_selected_doc_index} as exemplar based on commonly found tag = {common_tag}')
        df.at[randomly_selected_doc_index, 'preference'] = exemplar_preference
        df.at[randomly_selected_doc_index, 'is_exemplar'] = True

for common_description in common_descriptions:
    matching_indices = np.where(df.description == common_description)[0]
    if len(matching_indices) > 0:
        randomly_selected_doc_index = np.random.choice(matching_indices, 1)[0]
        logger.info(f'Using doc id {randomly_selected_doc_index} as exemplar based on commonly found description = {common_description}')
        df.at[randomly_selected_doc_index, 'preference'] = exemplar_preference
        df.at[randomly_selected_doc_index, 'is_exemplar'] = True

logger.info(f'Total no. of exemplars set = {len(df[df.is_exemplar==True])}')

INFO:__main__:Vectorization of None documents done.
INFO:__main__:Using doc id 3444 as exemplar based on commonly found tag = 
INFO:__main__:Using doc id 1076 as exemplar based on commonly found tag = communal
INFO:__main__:Using doc id 2612 as exemplar based on commonly found tag = trade
INFO:__main__:Using doc id 2167 as exemplar based on commonly found tag = illness letter 969–1517
INFO:__main__:Using doc id 1013 as exemplar based on commonly found tag = 11th c
INFO:__main__:Using doc id 798 as exemplar based on commonly found tag = account
INFO:__main__:Using doc id 4043 as exemplar based on commonly found tag = india
INFO:__main__:Using doc id 1512 as exemplar based on commonly found tag = marriage
INFO:__main__:Using doc id 3363 as exemplar based on commonly found tag = ketubba
INFO:__main__:Using doc id 2104 as exemplar based on commonly found tag = petition
INFO:__main__:Using doc id 1401 as exemplar based on commonly found tag = poverty
INFO:__main__:Using doc id 133 as exempl

INFO:__main__:Using doc id 4707 as exemplar based on commonly found description = India Book 4 (Hebrew description below; English to come)
INFO:__main__:Using doc id 4025 as exemplar based on commonly found description = Awaiting description - see Goitein notes linked below.
INFO:__main__:Using doc id 4035 as exemplar based on commonly found description = The widow of Abu al-Barakat, the son of Yosef Lebdi, buys a sixth of two adjacent stores for 53 and three/fourths dinars. The widow was Sitt al-Sada, daughter of Abu Nasr al-Tinnisi. This same share of the two stores was bought at an earlier date by Abu al-Fadil, a physician, from his two nephews. These nephews retained the right to buy.
INFO:__main__:Using doc id 4080 as exemplar based on commonly found description = Panegyric in Honor of Madmun II Cairo, after 1186.
INFO:__main__:Using doc id 4121 as exemplar based on commonly found description = Awaiting description.
INFO:__main__:Using doc id 4128 as exemplar based on commonly fou

### Re-run AffinityPropagation after setting initial preference values

In [40]:
logger.info(f'Recreating AffinityPropagation after setting preference={exemplar_preference} for exemplars')

af = AffinityPropagation(
    verbose=True,
    preference=df['preference'].to_numpy(),
    damping=PARAMS['AFFINITY_DAMPING'],
    max_iter=PARAMS['AFFINITY_MAX_ITER']
).fit(X)

generate_html(df_complete, wv, af, X, output_dir, 'affinity_clustering_with_preferences.html')
with open(os.path.join(output_dir, 'labels.pik'), 'wb') as f:
    pickle.dump(af.labels_, f, pickle.HIGHEST_PROTOCOL)

INFO:__main__:Recreating AffinityPropagation after setting preference=0 for exemplars
INFO:__main__:Estimated number of clusters: 0


Did not converge


### Analyze clustering results

In [41]:
from IPython.display import Markdown as md

md(f"The resulting clusters are saved in the folder [results/{run_id}](results/{run_id}/affinity_clustering_with_preferences.html)")

The resulting clusters are saved in the folder [results/0002](results/0002/affinity_clustering_with_preferences.html)