In [None]:
import os

import numpy as np
import pandas as pd
import umap.umap_ as umap

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
def cluster_vad_words(df: pd.DataFrame,
                      language: str,
                      result_type: str,
                      write_dir: str = 'human_results_clustering',
                      min_clusters: int = 5,
                      max_clusters: int = 10) -> None:
    """
    Cluster words with k-means on their feature rows and write the results to file.

    The number of clusters is picked from ``min_clusters``..``max_clusters``
    (inclusive) as the one with the highest cosine silhouette score. Two files
    are written into ``write_dir`` (created if missing):

    * ``{language}_{result_type}_clusters.txt`` - size and members of each cluster
    * ``{language}_{result_type}_clusters.tsv`` - word, 2-D UMAP coordinates, cluster label

    Args:
        df: Frame whose index holds the words and whose rows hold the numeric
            features to cluster on.
        language: Used as the first part of the output file names.
        result_type: Used as the second part of the output file names.
        write_dir: Directory the output files are written to.
        min_clusters: Smallest number of clusters to try.
        max_clusters: Largest number of clusters to try.

    Raises:
        ValueError: If there are too few words to evaluate any candidate
            number of clusters.
    """

    # Get target words and embeddings (row order of both stays aligned,
    # also when the index contains repeated labels)
    words = df.index.values
    embeddings = df.to_numpy()

    # The silhouette score is only defined for 2 <= n_clusters <= n_samples - 1
    candidates = range(max(2, min_clusters), min(max_clusters, len(words) - 1) + 1)
    if len(candidates) == 0:
        raise ValueError(
            f'Cannot cluster {len(words)} words into between '
            f'{min_clusters} and {max_clusters} clusters'
        )

    # Examine best number of clusters. Scores may be negative, so start below
    # any possible value, and keep the fitted model itself so that the
    # clustering written out is exactly the one that was scored.
    best_score, best_kmeans = -np.inf, None

    for n_clusters in candidates:

        kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto").fit(embeddings)
        score = silhouette_score(embeddings, kmeans.labels_, metric='cosine')

        if best_kmeans is None or score > best_score:
            best_score, best_kmeans = score, kmeans

    labels = best_kmeans.labels_
    best_n_clusters = best_kmeans.n_clusters

    # Collect the report line by line: header, then each cluster with its
    # share of all words followed by its members (1-based position: word)
    report_lines = [f'{language}_{result_type}: {best_n_clusters} clusters']

    for cluster in range(best_n_clusters):
        member_positions = np.flatnonzero(labels == cluster)
        report_lines.append(f'Cluster {cluster}: {len(member_positions)/len(labels)*100:.2f}%')
        report_lines.extend(f'\t{j+1}: {words[j]}' for j in member_positions)

    # Write to file
    if write_dir:
        os.makedirs(write_dir, exist_ok=True)

    with open(f'{write_dir}/{language}_{result_type}_clusters.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(report_lines))

    # UMAP for dimensionality reduction
    reducer = umap.UMAP(n_components=2, metric='cosine', random_state=0)
    umap_embedding = reducer.fit_transform(embeddings)

    # Create tsv with word, dim1, dim2, cluster
    tsv_rows = ['word\tdim1\tdim2\tcluster\n']
    tsv_rows.extend(
        f'{word}\t{dim1}\t{dim2}\t{label}\n'
        for word, (dim1, dim2), label in zip(words, umap_embedding, labels)
    )

    with open(f'{write_dir}/{language}_{result_type}_clusters.tsv', 'w', encoding='utf-8') as f:
        f.write(''.join(tsv_rows))

In [None]:
def preprocess_wordlist(human_df: pd.DataFrame,
                        exclude_words: list,
                        include_words: list) -> list:
    """
    Build a de-duplicated, alphabetically sorted word list from every cell of a dataframe.

    String cells are lower-cased, stripped and have their spaces removed;
    non-string cells (e.g. NaN or numbers) are skipped. Entries that contain
    a '/' or appear in ``exclude_words`` are dropped, then ``include_words``
    are added as given.

    Args:
        human_df: Frame whose cells hold the raw entries (all columns are used).
        exclude_words: Normalised entries to drop.
        include_words: Words to add to the result unchanged.

    Returns:
        The unique words in sorted order. Sorting keeps the result identical
        across interpreter restarts; plain set order varies with string hash
        randomisation and would reshuffle everything computed from this list.
    """

    excluded = set(exclude_words)
    unique_words = set()

    for col in human_df.columns:
        for entry in human_df[col].values:
            if not isinstance(entry, str):
                continue

            word = entry.lower().strip().replace(' ', '')
            if '/' in word or word in excluded:
                continue

            unique_words.add(word)

    unique_words.update(include_words)

    return sorted(unique_words)

In [None]:
# Read in VAD word norms - see https://saifmohammad.com/WebPages/nrc-vad.html
# keep_default_na=False: by default pandas parses entries such as "null", "nan"
# or "NA" as missing values, which would turn those words into NaN index labels
# that can never be looked up.
# NOTE(review): assumes the score columns contain no blank fields - confirm.
word_norms = pd.read_csv(
    './NRC-VAD-Lexicon.txt',
    sep='\t',
    header=None,
    names=['Word', 'Valence', 'Arousal', 'Dominance'],
    keep_default_na=False,
)
word_norms = word_norms.set_index('Word')
word_norms.head()

In [None]:
# One record per human-data file:
# (file name, entries to drop from its word list, words to add to its word list)
source_specs = [
    (
        'english_humans_most_associated.csv',
        ['maluable', 'excersise', 'influentiable', 'judgemental'],
        ['family', 'parents', 'malleable', 'exercise', 'influenceable', 'judgmental'],
    ),
    (
        'english_humans_uniquely_associated.csv',
        ["party's", 'college-admissons', 'self-concious', 'adolscence'],
        ['parties', 'college-admissions', 'self-conscious', 'adolescence'],
    ),
    ('nepali_humans_most_associated.csv', [], []),
    ('nepali_humans_uniquely_associated.csv', [], []),
]

# Parallel lists, index-aligned with each other
data_sources = [spec[0] for spec in source_specs]
exclude_lists = [spec[1] for spec in source_specs]
include_lists = [spec[2] for spec in source_specs]

In [None]:
# Cluster the words of every human-data file and write the per-source results
for i, data_source in enumerate(data_sources):

    # The 1st and 3rd '_'-separated parts of the file name label the outputs
    name_parts = data_source.split('_')
    language, result_type = name_parts[0], name_parts[2]

    human_data_df = pd.read_csv(f'./human_data/{data_source}')

    # Keep only the cleaned words that have an entry in the word norms
    words = [
        word
        for word in preprocess_wordlist(human_data_df, exclude_lists[i], include_lists[i])
        if word in word_norms.index
    ]
    word_features = word_norms.loc[words]

    cluster_vad_words(word_features, language, result_type)