# Language Selection

In [1]:
import pandas as pd
import scipy.stats as stats

In [2]:
# Load the language data table (relative path into ../data_collection).
lang_data = pd.read_csv('../data_collection/lang_data.csv')

In [4]:
# Keep only the rows that have a "ttr" value; rows where it is missing are dropped
ttr_langs = lang_data.dropna(subset=["ttr"])

In [5]:
# Save the filtered table to ttr_langs.csv without writing the row index as a column.
ttr_langs.to_csv("ttr_langs.csv", index=False)

In [6]:
# Distribution of "genus" values (number of rows per genus)
ttr_langs["genus"].value_counts()

Slavic             7
Romance            6
Germanic           4
Chinese            2
Finnic             2
Indic              2
Turkic             2
Greek              1
Basque             1
Semitic            1
Ugric              1
Malayo-Sumbawan    1
Japanese           1
Korean             1
Iranian            1
Vietic             1
Name: genus, dtype: int64

In [11]:
# Smallest and largest "ttr" value in the filtered table, as a (min, max) pair
ttr_langs.loc[:, "ttr"].min(), ttr_langs.loc[:, "ttr"].max()

(0.1668725, 0.5807387500000001)

In [12]:
# (min, max) of "ttr" over the unfiltered table, for comparison with the filtered one
lang_data.loc[:, "ttr"].min(), lang_data.loc[:, "ttr"].max()

(0.1668725, 0.5807387500000001)

## Comparison of TTR Values

In [2]:
# Reload the table from ttr_langs.csv (the file name the to_csv cell in the previous section writes to).
ttr_langs = pd.read_csv("ttr_langs.csv")

In [3]:
# Display the reloaded table for inspection.
ttr_langs

Unnamed: 0,lang,iso_code,wals_code,script,variety,name,family,genus,d_syn,train_frac_estimate,...,lh,lh_sd,is,is_sd,mfh,mfh_sd,-ia,-ia_sd,h_dep_avg,h_codep_avg
0,afr_Latn,afr,afr,Latn,afri1274,Afrikaans,Indo-European,Germanic,0.63,0.00378,...,8.775156,0.020283,3.0,0.0,3.867342,0.550169,-0.667044,0.0,0.562348,0.462374
1,bul_Cyrl,bul,bul,Cyrl,bulg1262,Bulgarian,Indo-European,Slavic,0.48,0.0,...,9.801146,0.025004,9.6,0.489898,3.755937,0.593355,-0.349068,0.0,0.474122,0.681034
2,cat_Latn,cat,ctl,Latn,stan1289,Catalan,Indo-European,Romance,0.53,0.0,...,8.749058,0.027353,6.0,0.0,3.552882,0.336858,-0.517017,0.0,0.39136,0.624815
3,ces_Latn,ces,cze,Latn,czec1258,Czech,Indo-European,Slavic,0.66,0.0,...,10.207946,0.027642,11.4,0.519963,3.953692,0.776319,-0.327369,0.0,0.567528,0.872996
4,cmn_Hans,cmn,mnd,Hans,beij1234,Mandarin Chinese (Standard Beijing),Sino-Tibetan,Chinese,0.55,0.0,...,10.77949,0.023547,1.0,0.0,2.74136,0.148256,-0.998999,0.0,,
5,cmn_Hant,cmn,mnd,Hant,taib1240,Mandarin Chinese (Taiwanese),Sino-Tibetan,Chinese,0.55,0.0,...,10.77949,0.023547,1.0,0.0,2.74136,0.148256,-0.998999,0.0,,
6,ekk_Latn,ekk,est,Latn,esto1258,Estonian,Uralic,Finnic,0.69,0.0,...,10.86535,0.034473,9.93,0.406325,3.704048,0.648834,-0.177009,0.0,,
7,ell_Grek,ell,grk,Grek,mode1248,Greek,Indo-European,Greek,0.52,0.0,...,8.351142,0.021881,9.0,0.0,3.668588,0.509442,-0.347045,0.0,0.415117,0.675441
8,eng_Latn,eng,eng,Latn,stan1293,English,Indo-European,Germanic,0.0,0.328954,...,9.191172,0.025512,6.1375,0.236381,3.724105,0.361524,-0.670247,0.0,0.361244,0.384444
9,eus_Latn,eus,bsq,Latn,basq1248,Basque,Basque,Basque,0.61,0.0,...,9.67849,0.023058,19.13,1.411772,3.768162,0.77692,-0.317018,0.0,0.652487,0.874256


In [5]:
# Check for normality of "ttr_flores" values (Shapiro-Wilk test)
# NOTE(review): only "ttr_flores" is tested here; "ttr" itself is not checked in this cell.
stats.shapiro(ttr_langs["ttr_flores"])

ShapiroResult(statistic=0.8109769225120544, pvalue=4.208420796203427e-05)

In [6]:
# Perform paired t-test on "ttr" and "ttr_flores" values
# NOTE(review): the Shapiro-Wilk output above reports p ~ 4.2e-05 for "ttr_flores", and the
# normality of the paired differences (which a paired t-test relies on) is not checked in this
# notebook. Consider testing the differences or using stats.wilcoxon instead — TODO confirm.
stats.ttest_rel(ttr_langs["ttr"], ttr_langs["ttr_flores"])

TtestResult(statistic=-2.6770029009751846, pvalue=0.011480873608365276, df=33)