In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from clustering_traditional import TraditionalClustering
from clustering_statistics import get_cluster_statistics
from clustering_word2vec import Word2VecModel

In [None]:
# An example of how to perform traditional list-based clustering of a semantic VFT.
# The cell loads the animal word lists, reads one word per line from the example
# file, assigns cluster ids and displays both the annotated words and the
# cluster statistics.

# create the clustering object
cl = TraditionalClustering()

# initialize by loading the animal word lists
cl.initialize_semantic_list(filename="database/de/animal_categories.csv")

# read the words from a semantic VFT (one word per line); the context manager
# closes the file even if reading fails
# NOTE(review): open() uses the platform default encoding -- pass an explicit
# encoding if the word list contains non-ASCII characters
testfile: str = "example/de/semantic.txt"
with open(testfile) as f:
    # remove the trailing \n from every word
    words = [line.rstrip("\n") for line in f]

# convert to a pd.DataFrame with a single "word" column
words = pd.DataFrame({"word": words})

# find clusters
words = cl.calculate_clusterids_semantic(words)

# print the output
display(words)

# analyze cluster characteristics
display(get_cluster_statistics(words))

# clean up (f is already closed here; del only removes the name)
del cl, testfile, f, words


In [None]:
# An example of how to perform traditional rule-based clustering of a phonematic VFT.
# The cell loads the list of phonematic word pairs, reads one word per line from
# the example file, assigns cluster ids and displays both the annotated words
# and the cluster statistics.

# create the clustering object
cl = TraditionalClustering()

# initialize by loading the list of phonematic word pairs
cl.initialize_phonematic_list(filename="database/de/phonematic_pairs.csv")

# read the words from a phonematic VFT (one word per line); the context manager
# closes the file even if reading fails
# NOTE(review): open() uses the platform default encoding -- pass an explicit
# encoding if the word list contains non-ASCII characters
testfile: str = "example/de/phonematic.txt"
with open(testfile) as f:
    # remove the trailing \n from every word
    words = [line.rstrip("\n") for line in f]

# convert to a pd.DataFrame with a single "word" column
words = pd.DataFrame({"word": words})

# find clusters
words = cl.calculate_clusterids_phonematic(words)

# print the output
display(words)

# analyze cluster characteristics
display(get_cluster_statistics(words))

# clean up (f is already closed here; del only removes the name)
del cl, testfile, f, words


In [None]:
# An example of how to perform word2vec-based clustering; you MUST replace the file name in the model.load_wv line with a correct model file!
# The cell loads the word vectors, reads one word per line from the example
# file, assigns cluster ids, displays the cluster statistics and plots the
# relatedness of every pair of consecutive words against the cluster threshold.

# create and initialize the Word2Vec Model
Word2VecModel.set_logging_info()
model = Word2VecModel()
model.load_wv("eswiki-20221020-pages-articles.kv")  # <-- change model file here

# semantic relatedness threshold used for identifying clusters, use threshold 0.4 for semantic VFT and 0.3 for phonematic VFT
threshold = 0.4

# read the words from a semantic VFT (one word per line); the context manager
# closes the file even if reading fails
# NOTE(review): open() uses the platform default encoding -- pass an explicit
# encoding if the word list contains non-ASCII characters
testfile: str = "example/es/semantic.txt"
with open(testfile) as f:
    # remove the trailing \n from every word
    words = [line.rstrip("\n") for line in f]

# convert to a pd.DataFrame with a single "word" column
words = pd.DataFrame({"word": words})

# find clusters
words["cluster"], words["relatedness_before"] = model.calculate_clusterids(words, clustering_type="fixed_chain", sim_threshold=threshold)

# print the output
display(words)

# analyze cluster characteristics
display(get_cluster_statistics(words))
display("mean sequential relatedness: " + str(model.calculate_mean_seqrel_total(words["word"])))

# plot the pairwise semantic relatedness
# create the figure and axis
fig, ax = plt.subplots(1, 1, figsize=(15, 5))

# create a DataFrame which stores all pairs of consecutive words with their
# semantic relatedness; the records are collected first and the frame is built
# once, instead of growing it row by row with pd.concat
word_list = words["word"].tolist()
pairs = pd.DataFrame(
    [
        {
            "pair": first + " & " + second,
            "cosine": round(model.cosine_similarity(first, second), 2),
        }
        for first, second in zip(word_list, word_list[1:])
    ],
    # explicit columns keep the frame well-formed when there are fewer than two words
    columns=["pair", "cosine"],
)

# now plot as scatter + line
ax.scatter(x=pairs.index, y=pairs["cosine"], label="pairwise semantic relatedness")
ax.plot(pairs.index, pairs["cosine"])

# right edge shared by the threshold line and the x-axis limit
x_right = words.index.max() + 2

# horizontal line for clustering threshold
ax.hlines(threshold, -0.1, x_right, colors="black", linestyles="dashed", label="cluster threshold")

# some formatting
ax.set_xlim(-0.1, x_right)
ax.set_ylabel("semantic relatedness")
ax.set_xlabel("pair index")
ax.legend()

# annotate the plot with all words, slightly offset from each data point
for row in pairs.itertuples():
    ax.annotate(text=row.pair, xy=(row.Index + 0.1, row.cosine + 0.01), color="black")

# clean up; only names that are always bound are deleted -- the loop variable
# `row` is left out because it does not exist when there are no pairs
del model.model.wv, model.model, model, testfile, f, words, word_list, fig, ax, pairs, x_right, threshold
