In [1]:
from collections import Counter
from itertools import combinations, product

import numpy as np
import pandas as pd
from tqdm import tqdm

# Turn off pandas' chained-assignment check for the whole notebook, so assigning
# into a filtered frame no longer emits a warning.
pd.options.mode.chained_assignment = None

First, we load the full dataset into `df` and build `w_age_df`, the subset of rows in `df` that have a non-null value for `age_of_acquisition`.

In [5]:
# Load the raw table and normalise every column name to lower case.
df = pd.read_csv("data/df_all_raw.csv")
df.columns = df.columns.str.lower()

# Discard the columns that the rest of the notebook never reads.
df = df.drop(
    columns=[
        'dataset_id',
        'form_id',
        'form',
        'gloss_in_source',
        'iso639p3code',
        'mrc_word',
        'kucera_francis_frequency',
    ]
)

# Keep only the rows for which an age of acquisition is available.
w_age_df = df[df['age_of_acquisition'].notna()]

# --- Summary of the full table ---
print(f"Number of rows in the original df: {len(df)}")
num_languages = df['variety'].nunique()
print(f"Number of unique languages in the original df: {num_languages}")

print("=====================================================================================================")

# --- Summary of the age-of-acquisition subset ---
print(f"Number of rows in the df with age of acquisition: {len(w_age_df)}")
num_languages = w_age_df['variety'].nunique()
print(f"Number of unique languages in the df with age of acquisition: {num_languages}")

# One row per concept, carrying the first age-of-acquisition value seen for it.
age_of_acquisition_per_concept = w_age_df.groupby(
    'concepticon_gloss', as_index=False
)['age_of_acquisition'].first()
print('Number of concepts in the df with age of acquisition:', len(age_of_acquisition_per_concept))

  df = pd.read_csv("data/df_all_raw.csv")


Number of rows in the original df: 1390594
Number of unique languages in the original df: 3050
Number of rows in the df with age of acquisition: 330160
Number of unique languages in the df with age of acquisition: 3016
Number of concepts in the df with age of acquisition: 486


Second, we calculate the colexification count for each pair of concepts.

In [6]:
def per_lang_colexification(curr_df):
    """
    Count how often each pair of concepts is colexified within one language.

    Parameters
    ----------
    curr_df : pd.DataFrame
        Frame with a 'concepticon_gloss' column; each row holds the collection
        of concepts associated with one word.

    Returns
    -------
    pd.DataFrame
        Columns 'concept_1', 'concept_2' and 'colexification_count'. The two
        concepts of a pair are distinct and alphabetically ordered, and the
        count is the number of rows whose concepts include both. When no row
        holds two distinct concepts, an empty frame with the same three
        columns is returned.
    """
    pair_counts = Counter()
    for concepts in curr_df['concepticon_gloss']:
        # Deduplicate and sort first so that combinations() emits every
        # unordered pair of distinct concepts exactly once, already ordered.
        pair_counts.update(combinations(sorted(set(concepts)), 2))

    records = [
        (first, second, count)
        for (first, second), count in pair_counts.items()
    ]
    per_lang = pd.DataFrame(
        records, columns=['concept_1', 'concept_2', 'colexification_count']
    )
    # Keep an integer dtype even when the frame is empty, so that concatenating
    # and summing results from several languages stays numeric.
    per_lang['colexification_count'] = per_lang['colexification_count'].astype('int64')
    return per_lang

def main(source_df=None):
    """
    Sum colexification counts for every concept pair across all languages.

    Parameters
    ----------
    source_df : pd.DataFrame, optional
        Frame with 'variety', 'clics_form' and 'concepticon_gloss' columns.
        Defaults to the notebook-level ``w_age_df``.

    Returns
    -------
    pd.DataFrame
        Columns 'concept_1', 'concept_2' and 'colexification_count', one row
        per concept pair, with counts summed over all languages. Empty (same
        columns) when no language colexifies any pair.
    """
    if source_df is None:
        # Preserve the original zero-argument behaviour.
        source_df = w_age_df

    all_dfs = []
    # A single groupby pass yields each language's rows without rescanning the
    # whole frame per variety; sort=False keeps first-appearance order.
    for _, sub in tqdm(source_df.groupby('variety', sort=False)):
        # Collect the distinct concepts expressed by each word form.
        concepts_per_form = sub.groupby('clics_form')['concepticon_gloss'].agg(list)
        unique_concepts = concepts_per_form.apply(lambda glosses: sorted(set(glosses)))
        # A form is colexified only if it covers more than one concept.
        colexified = unique_concepts[unique_concepts.apply(len) > 1]
        # We skip any language where no concepts are colexified
        if colexified.empty:
            continue
        all_dfs.append(per_lang_colexification(colexified.to_frame()))

    if not all_dfs:
        # pd.concat refuses an empty list; return an empty result instead.
        return pd.DataFrame(columns=['concept_1', 'concept_2', 'colexification_count'])

    mega = pd.concat(all_dfs)
    colex_counts = mega.groupby(['concept_1', 'concept_2']).sum().reset_index()
    return colex_counts

In [7]:
# Aggregate colexification counts over every language in w_age_df.
# The saved run below took roughly two and a half minutes.
colex_counts = main()

100%|██████████████████████████████████████████████████████████████████████████████| 3016/3016 [02:22<00:00, 21.19it/s]


In [9]:
# Inner-joined view: concept pairs where both concepts have an age of acquisition,
# with the two ages in 'age_of_acquisition_1' / 'age_of_acquisition_2'.
# NOTE(review): not used by the cells visible here; kept in case a later cell reads it.
colex_with_age = pd.merge(colex_counts, age_of_acquisition_per_concept, left_on='concept_1', right_on='concepticon_gloss')
colex_with_age = pd.merge(colex_with_age, age_of_acquisition_per_concept, left_on='concept_2', right_on='concepticon_gloss', suffixes=('_1', '_2'))

# Lookup table: concept gloss -> age of acquisition.
age_by_concept = age_of_acquisition_per_concept.set_index('concepticon_gloss')['age_of_acquisition']

# Absolute difference between the two concepts' ages of acquisition. A concept
# missing from the lookup yields NaN, as a left join would.
merged_data = colex_counts.assign(
    age_acquisition_diff=(
        colex_counts['concept_1'].map(age_by_concept)
        - colex_counts['concept_2'].map(age_by_concept)
    ).abs()
)
merged_data.head()

Unnamed: 0,concept_1,concept_2,colexification_count,age_acquisition_diff
0,ADULTERY,DECEIT,1,64.0
1,ADULTERY,WALK,1,377.0
2,AFTERNOON,AUTUMN,1,3.0
3,AFTERNOON,COLD,1,98.0
4,AFTERNOON,EVENING,17,22.0


In [14]:
# Sanity check: column dtypes and non-null counts of the final table.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3662 entries, 0 to 3661
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   concept_1             3662 non-null   object 
 1   concept_2             3662 non-null   object 
 2   colexification_count  3662 non-null   int64  
 3   age_acquisition_diff  3662 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 114.6+ KB


In [15]:
# Write the concept-pair table to CSV. `index` is left at its pandas default,
# so the row index is written as a leading, unnamed column.
merged_data.to_csv('data/aoa_colex.csv')