In [18]:
import pandas as pd
from urllib.parse import urlparse
import langdetect

# Fix langdetect's seed so that language detection is deterministic,
# i.e. re-running the notebook yields the same detected languages.
langdetect.DetectorFactory.seed = 0

In [19]:
def detect_text_language(text):
    """Return the language langdetect detects for ``text``.

    Args:
        text: The text to classify. Values that are not strings (for
            example the float NaN pandas uses for a missing CSV cell, or
            ``None``) are accepted and treated as undetectable.

    Returns:
        The value returned by ``langdetect.detect(text)``, or the string
        ``"Unknown"`` when ``text`` is not a string, is empty or
        whitespace-only, or langdetect raises ``LangDetectException``.
    """
    # Guard before calling langdetect: only LangDetectException is caught
    # below, so any other error raised for a non-string value would escape
    # and abort a whole ``Series.map`` call. Blank text has nothing to
    # detect, so it maps to the same fallback.
    if not isinstance(text, str) or not text.strip():
        return "Unknown"

    try:
        return langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        # langdetect could not determine a language for this text.
        return "Unknown"

In [20]:
# Load the dataset; later cells read its "url" and "text" columns.
df = pd.read_csv("all-languages.csv")
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(69329, 6)

In [21]:
def extract_tld(url):
    """Return the last dot-separated label of a URL's hostname.

    Args:
        url: The URL to inspect.

    Returns:
        The final label of the hostname (e.g. ``"uk"`` for
        ``"https://example.co.uk/page"``), or ``None`` when ``url`` is not a
        string, cannot be parsed, or has no hostname.
    """
    if not isinstance(url, str):
        # Missing CSV cells are not strings and cannot be parsed as URLs.
        return None
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal).
        return None
    if not hostname:
        # URLs without a network location (no "//host" part) have no hostname.
        return None
    # A fully-qualified hostname may end with a dot ("example.com."); strip it
    # so the last label is the TLD rather than an empty string.
    return hostname.rstrip(".").rsplit(".", 1)[-1] or None


df["tld"] = df["url"].map(extract_tld)

In [22]:
# Detect the language of every row's text; the detector function is passed
# directly to ``map`` and is called once per value.
df["language"] = df["text"].map(detect_text_language)

In [23]:
# Minimum number of rows a TLD needs before its English share is reported.
MIN_PAGES_PER_TLD = 50

# Count, per TLD, the total rows and the rows detected as English in a single
# grouped pass, instead of re-querying the whole frame for every TLD and
# growing the result one row at a time.
tld_stats = (
    df.assign(is_english=df["language"].eq("en"))
    .groupby("tld", sort=False)["is_english"]
    .agg(en_count="sum", total="size")
)
tld_stats = tld_stats[tld_stats["total"] >= MIN_PAGES_PER_TLD]

# One row per sufficiently large TLD: tld, percent of English rows, row count.
# The previous loop prepended each new row, so rows ended up in reverse
# first-appearance order; the reversal keeps that order for downstream cells.
percents = (
    tld_stats.assign(percent=tld_stats["en_count"] / tld_stats["total"] * 100)
    .reset_index()[["tld", "percent", "total"]]
    .iloc[::-1]
    .reset_index(drop=True)
)

In [27]:
# Rank TLDs from the highest to the lowest share of English rows,
# renumbering the index to match the new order.
percents.sort_values(
    by="percent",
    ascending=False,
    ignore_index=True,
)

Unnamed: 0,tld,percent,total
0,ke,100.0,68
1,nz,97.101449,138
2,au,96.740741,675
3,uk,96.657682,1855
4,edu,96.471418,1417
5,gov,95.752896,518
6,us,94.270833,192
7,link,92.156863,51
8,top,91.666667,84
9,online,90.721649,97


In [29]:
# List the TLDs whose English share exceeds 85%, highest share first.
(
    percents
    .query("percent > 85")
    .sort_values(by="percent", ascending=False)
    .loc[:, "tld"]
    .tolist()
)

['ke',
 'nz',
 'au',
 'uk',
 'edu',
 'gov',
 'us',
 'link',
 'top',
 'online',
 'in',
 'cc',
 'za',
 'id',
 'app']