In [None]:
import pandas as pd
import numpy as np
import torchtext
from scipy.stats import pearsonr

In [None]:
# Load the human data - English
# Each header is reduced to the text after its first '[' with the final
# character dropped (headers presumably look like "... [word]" -- confirm
# against the CSV).
english_ratings = (
    pd.read_csv('./human_data/english_word_ratings.csv')
    .rename(columns=lambda header: header.split('[')[1][:-1])
)
english_ratings.head()

In [None]:
# Mean human rating per word, in the same order as the columns of
# `english_ratings`. The result is indexed positionally (0..n-1), not by word.
english_means = pd.Series([ratings.mean() for _, ratings in english_ratings.items()])

In [None]:
# FastText 2M embeddings - English
# Compute cosine similarities between 'teenager' and the words rated by humans
vectors = torchtext.vocab.Vectors(name='crawl-300d-2M.vec', cache='.vector_cache')
teen_emb = vectors.get_vecs_by_tokens('teenager').numpy()
rated_words = [word.lower() for word in english_ratings.columns]

# Report rated words that have no entry in the embedding vocabulary
for word in rated_words:
    if word not in vectors.stoi:
        print(word)

model_word_embs = np.stack([vectors.get_vecs_by_tokens(word).numpy() for word in rated_words])
# A token without a vector yields NaN here (torchtext's default for unknown
# tokens is a zero vector, so the ratio becomes 0/0).
teen_model_cos_sims = np.dot(model_word_embs, teen_emb) / (np.linalg.norm(model_word_embs, axis=1) * np.linalg.norm(teen_emb))

# Remove embedding NaN values so they cannot reach the TSV or the correlation.
# The filtered means get their own name: `english_means` stays full length for
# the other comparisons and this cell can be re-run safely.
valid_mask = ~np.isnan(teen_model_cos_sims)
english_means_valid = english_means[valid_mask]
teen_model_cos_sims = teen_model_cos_sims[valid_mask]

# Write human means and model cosine similarities to a tsv file
df = pd.DataFrame({'human_means': english_means_valid, 'teen_model_cos_sims': teen_model_cos_sims})
df.to_csv('human_ai_comparison_results/english_ft_human_ai_sims.tsv', sep='\t', index=False)

# Compute Pearson correlation between human means and model cosine similarities
corr, p = pearsonr(english_means_valid, teen_model_cos_sims)
print(f'Pearson correlation: {corr:.2f}, p-value: {p:.2f}')

In [None]:
# GloVe 840B embeddings - English
# Compute cosine similarities between 'teenager' and the words rated by humans
vectors = torchtext.vocab.Vectors(name='glove.840B.300d.txt', cache='.vector_cache')
teen_emb = vectors.get_vecs_by_tokens('teenager').numpy()
rated_words = [word.lower() for word in english_ratings.columns]

# Report rated words that have no entry in the embedding vocabulary
for word in rated_words:
    if word not in vectors.stoi:
        print(word)

model_word_embs = np.stack([vectors.get_vecs_by_tokens(word).numpy() for word in rated_words])
# A token without a vector yields NaN here (torchtext's default for unknown
# tokens is a zero vector, so the ratio becomes 0/0).
teen_model_cos_sims = np.dot(model_word_embs, teen_emb) / (np.linalg.norm(model_word_embs, axis=1) * np.linalg.norm(teen_emb))

# Remove embedding NaN values so they cannot reach the TSV or the correlation.
# The filtered means get their own name: `english_means` stays full length for
# the other comparisons and this cell can be re-run safely.
valid_mask = ~np.isnan(teen_model_cos_sims)
english_means_valid = english_means[valid_mask]
teen_model_cos_sims = teen_model_cos_sims[valid_mask]

# Write human means and model cosine similarities to a tsv file
df = pd.DataFrame({'human_means': english_means_valid, 'teen_model_cos_sims': teen_model_cos_sims})
df.to_csv('human_ai_comparison_results/english_glove_human_ai_sims.tsv', sep='\t', index=False)

# Compute Pearson correlation between human means and model cosine similarities
corr, p = pearsonr(english_means_valid, teen_model_cos_sims)
print(f'Pearson correlation: {corr:.2f}, p-value: {p:.2f}')

In [None]:
# Load the human data - Nepali
# Each header is reduced to the text after its first '[' with the final
# character dropped (headers presumably look like "... [word]" -- confirm
# against the CSV).
nepali_ratings = (
    pd.read_csv('./human_data/nepali_word_ratings.csv')
    .rename(columns=lambda header: header.split('[')[1][:-1])
)
nepali_ratings.head()

In [None]:
# Mean human rating per word, in the same order as the columns of
# `nepali_ratings`. The result is indexed positionally (0..n-1), not by word.
nepali_means = pd.Series([ratings.mean() for _, ratings in nepali_ratings.items()])

In [None]:
# Use translations of the English words to Nepali, rather than spreadsheet English
# Maps each lowercase English word to the Nepali token that is looked up in the
# Nepali embedding vocabularies. The embedding cells index this dict with the
# lowercased column names of `nepali_ratings`, so every rated word needs a key.
nepali_translation_dict = {
    'rebellious': 'विद्रोही',
    'energetic': 'ऊर्जावान',
    'moody': 'मुडी',
    'curious': 'उत्सुक',
    'independent': 'स्वतन्त्र',
    'social': 'सामाजिक',
    'impulsive': 'आकस्मिक',
    'creative': 'सृजनात्मक',
    'confident': 'आत्मविश्वासी',
    'thoughtful': 'विचारशील',
    'carefree': 'चिन्तामुक्त',
    'adventurous': 'साहसी',
    'inquisitive': 'जिज्ञासु',
    'emotional': 'भावनात्मक',
    'restless': 'बेचैन',
    'innovative': 'अभिनव',
    'idealistic': 'आदर्शवादी',
    # NOTE(review): same Nepali string as 'thoughtful' above, so both words get
    # the identical embedding and cosine similarity -- confirm this is intended.
    'opinionated': 'विचारशील',
    # NOTE(review): this looks like the noun "resource" rather than an
    # adjective -- confirm the intended translation.
    'resourceful': 'संसाधन',
    'influential': 'प्रभावशाली'
}

In [None]:
# FastText CC embeddings - Nepali
# Compute cosine similarities between the target word 'किशोर' and the Nepali
# translation of every word rated by humans
vectors = torchtext.vocab.Vectors(name='cc.ne.300.vec', cache='.vector_cache')
teen_emb = vectors.get_vecs_by_tokens('किशोर').numpy()

# Report (by English key) every translation missing from the embedding vocabulary
for engl, word in nepali_translation_dict.items():
    if word not in vectors.stoi:
        print(engl)

model_word_embs = np.stack([vectors.get_vecs_by_tokens(nepali_translation_dict[word.lower()]).numpy() for word in nepali_ratings.columns])
# A token without a vector yields NaN here (torchtext's default for unknown
# tokens is a zero vector, so the ratio becomes 0/0).
teen_model_cos_sims = np.dot(model_word_embs, teen_emb) / (np.linalg.norm(model_word_embs, axis=1) * np.linalg.norm(teen_emb))

# Remove embedding NaN values.
# The mask is computed once and the filtered means get their own name, so the
# shared `nepali_means` keeps one entry per rated word. Overwriting it made
# this cell fail on a second run, because the mask no longer matched its length.
valid_mask = ~np.isnan(teen_model_cos_sims)
nepali_means_valid = nepali_means[valid_mask]
teen_model_cos_sims = teen_model_cos_sims[valid_mask]

# Write human means and model cosine similarities to a tsv file
df = pd.DataFrame({'human_means': nepali_means_valid, 'teen_model_cos_sims': teen_model_cos_sims})
df.to_csv('human_ai_comparison_results/nepali_ftcc_human_ai_sims.tsv', sep='\t', index=False)

# Compute Pearson correlation between human means and model cosine similarities
corr, p = pearsonr(nepali_means_valid, teen_model_cos_sims)
print(f'Pearson correlation: {corr:.2f}, p-value: {p:.2f}')

In [None]:
# Reset Nepali means
# Recompute the full per-word means so the next comparison starts from every
# rated word, regardless of any filtering applied to `nepali_means` earlier.
nepali_means = pd.Series([ratings.mean() for _, ratings in nepali_ratings.items()])

In [None]:
# GloVe embeddings - Nepali (loaded from nepali_glove_vectors.txt)
# NOTE(review): this header used to say "GloVe 840B", which looks copied from
# the English cell -- confirm the provenance of this vector file.
# Compute cosine similarities between the target word 'किशोर' and the Nepali
# translation of every word rated by humans
vectors = torchtext.vocab.Vectors(name='nepali_glove_vectors.txt', cache='.vector_cache')
teen_emb = vectors.get_vecs_by_tokens('किशोर').numpy()

# Report (by English key) every translation missing from the embedding vocabulary
for engl, word in nepali_translation_dict.items():
    if word not in vectors.stoi:
        print(engl)

model_word_embs = np.stack([vectors.get_vecs_by_tokens(nepali_translation_dict[word.lower()]).numpy() for word in nepali_ratings.columns])
# A token without a vector yields NaN here (torchtext's default for unknown
# tokens is a zero vector, so the ratio becomes 0/0).
teen_model_cos_sims = np.dot(model_word_embs, teen_emb) / (np.linalg.norm(model_word_embs, axis=1) * np.linalg.norm(teen_emb))

# Remove embedding NaN values.
# The mask is computed once and the filtered means get their own name, so the
# shared `nepali_means` keeps one entry per rated word. Overwriting it made
# this cell fail on a second run, because the mask no longer matched its length.
valid_mask = ~np.isnan(teen_model_cos_sims)
nepali_means_valid = nepali_means[valid_mask]
teen_model_cos_sims = teen_model_cos_sims[valid_mask]

# Write human means and model cosine similarities to a tsv file
df = pd.DataFrame({'human_means': nepali_means_valid, 'teen_model_cos_sims': teen_model_cos_sims})
df.to_csv('human_ai_comparison_results/nepali_glove_human_ai_sims.tsv', sep='\t', index=False)

# Compute Pearson correlation between human means and model cosine similarities
corr, p = pearsonr(nepali_means_valid, teen_model_cos_sims)
print(f'Pearson correlation: {corr:.2f}, p-value: {p:.2f}')