In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from itertools import chain
import matplotlib.pyplot as plt
import warnings
from sklearn.manifold import TSNE
import seaborn as sns
import random
import os
warnings.filterwarnings("ignore")

## Read Data and Functions

In [None]:
## read occupation and map
## Load the occupation map, keep only the census title and its single-word
## rendering, and drop titles that have no single-word rendering. The index is
## rebuilt so rows are numbered 0..n-1 after the drop.
occ = (
    pd.read_csv("devaluation-main/data/dimension and mapping/occupation_map.csv")
    .loc[:, ['Occupation, 1950 basis', 'Single words']]
    .dropna(subset=['Single words'])
    .reset_index(drop=True)
)

In [None]:
## read embeddings from class
## Switch the working directory to the measures folder and execute the two
## helper scripts there with IPython's %run magic.
## NOTE(review): presumably these scripts define the helpers used by later
## cells (Embeddings, load_dimension, get_dimension, cosine_sim_average,
## cosine_sim_average_weight, get_closest_words, occ_freq) -- confirm against
## the scripts themselves.
## NOTE(review): os.chdir changes the process-wide working directory, so every
## relative path used in later cells is resolved from wherever the most recent
## chdir left off.
os.chdir('devaluation-main/code/measures')
%run embeddings.py
%run 3CosAdd.py

## Generate Embedding-based Cultural Measures

### Ngram

In [None]:
## get centroid difference
## For every decade, load that decade's embeddings, compute one measure per
## cultural dimension for every occupation, and stack the per-decade tables
## into `data` (one row per occupation per decade).
os.chdir('directory of your Ngram embeddings')

## index embeddings by a list (does not change between decades)
occupation = occ["Single words"]

## load dimension words once -- the file is the same for every decade
## NOTE(review): this relative path is resolved from the embeddings directory
## entered above; confirm it is reachable from there.
dimension = load_dimension("devaluation-main/data/dimension and mapping/dimension-words.txt")

## dimensions to measure, in the order the output columns are created
dimension_names = ["gender", "prestige", "education", "income", "evaluation", "potency", "activity"]

## iterate over years
yearly_frames = []
for year in range(1900,2010,10):

    ## load embeddings from a given year
    embeddings = Embeddings.from_file(year,self_train=True)

    ## assign distances: one column per dimension
    measures = {
        dimension_name: cosine_sim_average(get_dimension(embeddings,dimension,category = dimension_name),embeddings,occupation)
        for dimension_name in dimension_names
    }

    ## build a fresh per-year table instead of writing the columns into `occ`,
    ## so the shared occupation list is not mutated by this cell
    yearly_frames.append(occ.assign(year=year, **measures))

    ## monitor progress
    print(str(year) + " is done!")

## DataFrame.append was removed in pandas 2.0; concatenating once at the end
## gives the same stacked table without re-copying it on every iteration
data = pd.concat(yearly_frames)

In [None]:
## save results
## Move into the output folder, then write `data` to ngram.csv without the
## index column.
## NOTE(review): the chdir target is a relative path, so it is resolved from
## the working directory left by the previous cell -- confirm it exists there.
os.chdir('devaluation-main/data/census and merged')
data.to_csv("ngram.csv", index=False)

### COCHA

In [None]:
## get centroid difference
## For every decade, load that decade's embeddings, compute one measure per
## cultural dimension for every occupation, and stack the per-decade tables
## into `data` (one row per occupation per decade).
os.chdir('directory of your COCHA embeddings')

## index embeddings by a list (does not change between decades)
occupation = occ["Single words"]

## load dimension words once -- the file is the same for every decade
## NOTE(review): this relative path is resolved from the embeddings directory
## entered above; confirm it is reachable from there.
dimension = load_dimension("devaluation-main/data/dimension and mapping/dimension-words.txt")

## dimensions to measure, in the order the output columns are created
dimension_names = ["gender", "prestige", "education", "income", "evaluation", "potency", "activity"]

## iterate over years
yearly_frames = []
for year in range(1900,2020,10):

    ## load embeddings from a given year
    embeddings = Embeddings.from_file(year,self_train=True)

    ## assign distances: one column per dimension
    measures = {
        dimension_name: cosine_sim_average(get_dimension(embeddings,dimension,category = dimension_name),embeddings,occupation)
        for dimension_name in dimension_names
    }

    ## build a fresh per-year table instead of writing the columns into `occ`,
    ## so the shared occupation list is not mutated by this cell
    yearly_frames.append(occ.assign(year=year, **measures))

    ## monitor progress
    print(str(year) + " is done!")

## DataFrame.append was removed in pandas 2.0; concatenating once at the end
## gives the same stacked table without re-copying it on every iteration
data = pd.concat(yearly_frames)

In [None]:
## save results
## Move into the output folder, then write `data` to COCHA.csv without the
## index column.
## NOTE(review): the chdir target is a relative path, so it is resolved from
## the working directory left by the previous cell -- confirm it exists there.
os.chdir('devaluation-main/data/census and merged')
data.to_csv("COCHA.csv", index=False)

### t-SNE Visualization

In [None]:
## read data
## Enter the Ngram embeddings folder, load the embeddings for the year 2000,
## take the single-word occupation column, and read the occupation-category
## table.
## NOTE(review): the category CSV path is relative and is read after the chdir
## above it -- confirm it is reachable from the embeddings folder.
os.chdir('directory of your Ngram embeddings')
embeddings = Embeddings.from_file(2000,self_train=True)
occupation = occ["Single words"]
category = pd.read_csv('devaluation-main/data/miscellaneous/occ_category.csv')

In [None]:
## merge category with the main occupation list
## Keep only rows that carry a category label, one row per single-word
## occupation, so the join below can match each occupation at most once.
labelled_categories = category[category["category"].notna()].drop_duplicates("occupation_single")

## Left join onto the occupation list: every occupation keeps its row, in the
## original order, and occupations without a match get missing values.
category = occ[["Single words"]].merge(
    labelled_categories,
    how="left",
    left_on="Single words",
    right_on="occupation_single",
)

In [None]:
## load embeddings for the occupations
## Split each comma-separated entry once and take the first four word positions.
word_columns = occupation.str.split(", ",expand=True)
occupation_1, occupation_2, occupation_3, occupation_4 = (word_columns[position] for position in range(4))

## Look up the vectors for every word position and stack them on a new
## leading axis.
## NOTE(review): presumably absent words come back as NaN rows from the
## Embeddings lookup -- confirm in the Embeddings class.
word_vectors = np.array([embeddings[words] for words in (occupation_1, occupation_2, occupation_3, occupation_4)])

## Average over the word-position axis, ignoring NaN entries.
occupation = np.nanmean(word_vectors,axis=0)

In [None]:
## some cleanings
## Step 1: keep only the rows whose averaged vector has no NaN component.
## The mask is computed once, before `occupation` is overwritten.
vector_complete = np.isnan(occupation).sum(axis=1)==0
occupation_single = category['occupation_single'][vector_complete]
occupation_origin = category['occupation_origin'][vector_complete]
category = category['category'][vector_complete]
occupation = occupation[vector_complete]

## Step 2: of those, keep only the rows that carry a category label.
## The mask is computed once, before `category` is overwritten.
has_category = ~category.isna()
occupation = occupation[has_category]
occupation_origin = occupation_origin[has_category]
occupation_single = occupation_single[has_category]
category = category[has_category]

In [None]:
## create tSNE measures on two dimensions
## The random seed is fixed in the TSNE constructor.
n_components = 2
tsne = TSNE(n_components=n_components,perplexity=10,random_state=2023)
tsne_result = tsne.fit_transform(occupation)

## One row per occupation: the two t-SNE coordinates plus its category label
## and both occupation names.
tsne_columns = {
    'tsne_1': tsne_result[:,0],
    'tsne_2': tsne_result[:,1],
    'label': category,
    'occupation_origin': occupation_origin,
    'occupation_single': occupation_single,
}
tsne_result_df_2000 = pd.DataFrame(tsne_columns)

In [None]:
## NOTE: save `tsne_result_df_2000` to a CSV file. Its two t-SNE dimensions
## (`tsne_1`, `tsne_2`) can then be drawn as a scatter plot with labels.

### 3CosAdd Validation

In [None]:
## Enter the Ngram embeddings folder for the 3CosAdd validation.
## NOTE(review): the path below is a placeholder and is resolved from the
## working directory left by earlier cells -- replace it before running.
os.chdir('directory of your Ngram embeddings')

## load embeddings from a given year
embeddings = Embeddings.from_file(2000,self_train=True)

In [262]:
# Find the neighbors of doctor - man + woman
vecs = embeddings["doctor","man","woman"]

# Slices (not scalar indices) keep each vector as a one-row 2-D array.
doctor_vec, man_vec, woman_vec = vecs[0:1], vecs[1:2], vecs[2:3]
analogy_query = doctor_vec - man_vec + woman_vec

# Print the 4 closest words to the query vector.
print(get_closest_words(embeddings, analogy_query, k=4))

[['doctor', 'gynecologist', 'nurse', 'physician']]


## Measures with Weights Proportional to Occupation Occurrences

### Ngram

In [69]:
import requests
import json
from tqdm import tqdm
import time
from statistics import mode

In [None]:
## calculate weights in 2000-2009
## Split each comma-separated entry once and take the first four word positions.
occupation = occ["Single words"]
word_columns = occupation.str.split(", ",expand=True)
occupation_1, occupation_2, occupation_3, occupation_4 = (word_columns[position] for position in range(4))

## One occ_freq lookup per occupation for each word position, with a progress
## bar per position. The Series are iterated in row order, which matches
## positional access because `occ` carries a 0..n-1 index.
weight_1 = [occ_freq(word,2000,2009) for word in tqdm(occupation_1)]
weight_2 = [occ_freq(word,2000,2009) for word in tqdm(occupation_2)]
weight_3 = [occ_freq(word,2000,2009) for word in tqdm(occupation_3)]
weight_4 = [occ_freq(word,2000,2009) for word in tqdm(occupation_4)]

In [None]:
## get centroid difference
## For every decade, load that decade's embeddings, compute one weighted
## measure per cultural dimension for every occupation (using the per-word
## weights weight_1..weight_4), and stack the per-decade tables into `data`.
os.chdir('directory of your Ngram embeddings')

## index embeddings by a list (does not change between decades)
occupation = occ["Single words"]

## load dimension words once -- the file is the same for every decade
## NOTE(review): this relative path is resolved from the embeddings directory
## entered above; confirm it is reachable from there.
dimension = load_dimension("devaluation-main/data/dimension and mapping/dimension-words.txt")

## dimensions to measure, in the order the output columns are created
dimension_names = ["gender", "prestige", "education", "income", "evaluation", "potency", "activity"]

## iterate over years
yearly_frames = []
for year in range(1900,2010,10):

    ## load embeddings from a given year
    embeddings = Embeddings.from_file(year,self_train=True)

    ## assign distances: one column per dimension
    measures = {
        dimension_name: cosine_sim_average_weight(get_dimension(embeddings,dimension,category = dimension_name),embeddings,occupation,weight_1,weight_2,weight_3,weight_4)
        for dimension_name in dimension_names
    }

    ## build a fresh per-year table instead of writing the columns into `occ`,
    ## so the shared occupation list is not mutated by this cell
    yearly_frames.append(occ.assign(year=year, **measures))

    ## monitor progress
    print(str(year) + " is done!")

## DataFrame.append was removed in pandas 2.0; concatenating once at the end
## gives the same stacked table without re-copying it on every iteration
data = pd.concat(yearly_frames)

In [None]:
## Move into the output folder, then write `data` to ngram_weight.csv without
## the index column.
## NOTE(review): the chdir target is a relative path, so it is resolved from
## the working directory left by the previous cell -- confirm it exists there.
os.chdir('devaluation-main/data/census and merged')
data.to_csv("ngram_weight.csv", index=False)