# Download non-English, English dictionaries

In [2]:
# Download bilingual dictionaries: https://cs.brown.edu/people/epavlick/data.html
# "High-confidence translations collected via crowdsourcing"
!rm dictionaries.tar.gz
!wget http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz
!tar -xvzf dictionaries.tar.gz
!rm dictionaries.tar.gz

--2019-04-28 15:53:18--  http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz
Resolving www.seas.upenn.edu (www.seas.upenn.edu)... 158.130.68.91, 2607:f470:8:64:5ea5::9
Connecting to www.seas.upenn.edu (www.seas.upenn.edu)|158.130.68.91|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz [following]
--2019-04-28 15:53:18--  https://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz
Connecting to www.seas.upenn.edu (www.seas.upenn.edu)|158.130.68.91|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6015653 (5.7M) [application/x-gzip]
Saving to: ‘dictionaries.tar.gz’


2019-04-28 15:53:19 (10.2 MB/s) - ‘dictionaries.tar.gz’ saved [6015653/6015653]

dictionaries/
dictionaries/dict.af
dictionaries/dict.am
dictionaries/dict.an
dictionaries/dict.ar
dictionaries/dict.ast
dictionaries/dict.az
dictionaries/di

In [0]:
# Download the English dictionaries, with words mapped to their MMID ID.
!rm -rf english-dictionaries
!mkdir english-dictionaries
!for i in {1..9}; do wget https://s3.amazonaws.com/mmid-pds/language_index_files/index-english-0$i-package.tsv; done
!for i in {10..27}; do wget https://s3.amazonaws.com/mmid-pds/language_index_files/index-english-$i-package.tsv; done
!mv index* english-dictionaries/

In [3]:
!rm Classes.txt
!wget "https://raw.githubusercontent.com/yoninachmany/debiasing-imagenet-with-mmid-data/master/Classes.txt"

--2019-04-28 16:10:56--  https://raw.githubusercontent.com/yoninachmany/debiasing-imagenet-with-mmid-data/master/Classes.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1435 (1.4K) [text/plain]
Saving to: ‘Classes.txt’


2019-04-28 16:10:57 (257 MB/s) - ‘Classes.txt’ saved [1435/1435]



In [0]:
label_to_synonyms = {}
with open("Classes.txt") as f:
  data = f.read().split("\n")
  for line in data:
    label = line.split(",")[0].strip()
    synonyms = [synonym.strip().lower() for synonym in line.split(",")[1:]]
    label_to_synonyms[label] = synonyms
synonyms_to_label = dict([[val, key] for key, vals in label_to_synonyms.items() for val in vals])

In [0]:
import os
import pandas as pd

from tqdm import tqdm
from collections import defaultdict

In [0]:
languages_to_translations_to_ids = {}

for dict_fn in tqdm(sorted(os.listdir('dictionaries'))[1:]):
    language = dict_fn.split('.')[1]
    translations_to_ids = defaultdict(list)
    with open(f'dictionaries/{dict_fn}') as f:
        lines = f.read().splitlines()
        for i, line in enumerate(lines):
            english_translations = line.split('\t')[1:]
            for translation in english_translations:
                if translation in words_to_label:
                    translations_to_ids[translation].append(i+1)
    languages_to_translations_to_ids[language] = translations_to_ids

100%|██████████| 93/93 [00:00<00:00, 109.87it/s]


In [0]:
for dict_fn in tqdm(sorted(os.listdir('english-dictionaries'))[1:]):
    number = dict_fn.split("-")[2]
    english_translations_to_ids = defaultdict(list)
    with open(f'english-dictionaries/{dict_fn}') as f:
        lines = f.read().splitlines()
        for i, line in enumerate(lines):
            data = line.split('\t')
            if len(data) == 2:
                english_translation, folder_id = data
                if english_translation in words_to_label:
                    english_translations_to_ids[english_translation].append(folder_id)
    languages_to_translations_to_ids[f'english-{number}'] = english_translations_to_ids

 38%|███▊      | 10/26 [00:00<00:00, 91.04it/s]

index-english-02-package.tsv
index-english-03-package.tsv
index-english-04-package.tsv
index-english-05-package.tsv
index-english-06-package.tsv
index-english-07-package.tsv
index-english-08-package.tsv
index-english-09-package.tsv
index-english-10-package.tsv
index-english-11-package.tsv
index-english-12-package.tsv
index-english-13-package.tsv
index-english-14-package.tsv
index-english-15-package.tsv
index-english-16-package.tsv
index-english-17-package.tsv
index-english-18-package.tsv
index-english-19-package.tsv
index-english-20-package.tsv


100%|██████████| 26/26 [00:00<00:00, 91.55it/s]

index-english-21-package.tsv
index-english-22-package.tsv
index-english-23-package.tsv
index-english-24-package.tsv
index-english-25-package.tsv
index-english-26-package.tsv
index-english-27-package.tsv





In [0]:
df = pd.DataFrame(languages_to_translations_to_ids)
df.head()

Unnamed: 0,af,am,an,ar,ast,az,bcl,be,bg,bn,...,english-18,english-19,english-20,english-21,english-22,english-23,english-24,english-25,english-26,english-27
acre farmers,[293],,,,,,,,,,...,,,,,,,,,,
agriculturist,,,,,,,,,,,...,,,,[6614],,,,,,
anglican,[422],,,,,,,,,,...,,,,,[5425],,,,,
animal farmer,,,,,,,,,,,...,,,,,,,,,,
archbishop,[142],,[164],,,,"[418, 419]",,[285],,...,,,,,,,,,,


In [0]:
# # https://stackoverflow.com/questions/21164910/how-do-i-delete-a-column-that-contains-only-zeros-in-pandas
# df = df.loc[:, (df != 0.0).any(axis=0)]

In [0]:
# df['all'] = df.sum(axis=1)
# s = df.sum(axis=0)
# s = s.rename('all')
# df = df.append(s)
# df

In [0]:
# df = df.sort_values('all', ascending=False)
# df

In [0]:
# df = df.sort_values('all', axis=1, ascending=False)
# df
# # Top languages: English, Afrikaans, Polish, Portuguese, Turkish, Galician, Catalan, Spanish, Malayalam
# # Top professions: historians, actors, models, producers, singers, astronomers, clergy, infantry, editors, physicists

In [0]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# all_lang_counts = list(df.loc['all', :])
# all_lang_counts = all_lang_counts[1:]
# plt.hist(all_lang_counts, bins=30);

In [0]:
top_lang_df = df[['english-02', 'english-03', 'english-04', 'english-05', 'english-06', 'english-07', 'english-08', 'english-09', 'english-10', 'english-11', 'english-12', 'english-13', 'english-14', 'english-15', 'english-16', 'english-17', 'english-18', 'english-19', 'english-20', 'english-21', 'english-22', 'english-23', 'english-24', 'english-25', 'english-26', 'english-27', 'bn', 'bpy', 'gu', 'hi', 'kn', 'ml', 'mr', 'pa',  'ta', 'te']] 
# top_lang_df['all'] = top_lang_df.sum(axis=1)
# top_lang_df = top_lang_df.sort_values('all', ascending=False)
# top_lang_df = top_lang_df.query('all>1')#.query('en>1')
# print(top_lang_df.shape)
top_lang_df

Unnamed: 0,english-02,english-03,english-04,english-05,english-06,english-07,english-08,english-09,english-10,english-11,...,bn,bpy,gu,hi,kn,ml,mr,pa,ta,te
acre farmers,,,,,,,,[8541],,,...,,,,,,,,,,
agriculturist,,,,,,,,,,,...,,,,,,,,,,[8417]
anglican,,,,,,,,,,,...,,,,,,[678],,,,
animal farmer,,,[455],,,,,,,,...,,,,,,,,,,
archbishop,,,,,,,,[5246],,,...,,,,,,,,,,
army,,,,,[1423],,,,,,...,"[841, 9143, 9146, 9149, 9150, 9182, 9185]",,"[5402, 5420, 7286, 7290, 9204, 9207, 9235]","[739, 3469, 8512, 8529]","[779, 4555, 4558, 7919, 8804, 8806, 8807, 8810...","[972, 5301, 9738, 9739, 9741, 9754, 9757]","[742, 1687, 5370, 5371, 5372, 9121, 9125, 9145...","[722, 2953, 5433, 5495, 5496, 5567, 5568, 9079...","[1412, 6209, 6222, 6223, 8381, 8382]","[770, 9398, 9399, 9402, 9421, 9425, 9428]"
athlete,,,,,,,,,,,...,,,,,,,,,,
athletes,,,,,,,,[429],,,...,,,,,[2390],,,,,
athletics,,,,,[1399],,,,,,...,,,,,,,,,,
babies,,,,,,,,,,,...,[8229],,,[7558],,[2611],,,,


In [0]:
top_lang_df['label'] = top_lang_df.index.map(lambda word: words_to_label[word])
top_lang_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,english-02,english-03,english-04,english-05,english-06,english-07,english-08,english-09,english-10,english-11,...,bpy,gu,hi,kn,ml,mr,pa,ta,te,label
acre farmers,,,,,,,,[8541],,,...,,,,,,,,,,Farmer
agriculturist,,,,,,,,,,,...,,,,,,,,,[8417],Farmer
anglican,,,,,,,,,,,...,,,,,[678],,,,,Religious? Priest
animal farmer,,,[455],,,,,,,,...,,,,,,,,,,Farmer
archbishop,,,,,,,,[5246],,,...,,,,,,,,,,Religious? Priest
army,,,,,[1423],,,,,,...,,"[5402, 5420, 7286, 7290, 9204, 9207, 9235]","[739, 3469, 8512, 8529]","[779, 4555, 4558, 7919, 8804, 8806, 8807, 8810...","[972, 5301, 9738, 9739, 9741, 9754, 9757]","[742, 1687, 5370, 5371, 5372, 9121, 9125, 9145...","[722, 2953, 5433, 5495, 5496, 5567, 5568, 9079...","[1412, 6209, 6222, 6223, 8381, 8382]","[770, 9398, 9399, 9402, 9421, 9425, 9428]",Soldier
athlete,,,,,,,,,,,...,,,,,,,,,,Athlete
athletes,,,,,,,,[429],,,...,,,,[2390],,,,,,Athlete
athletics,,,,,[1399],,,,,,...,,,,,,,,,,Athlete
babies,,,,,,,,,,,...,,,[7558],,[2611],,,,,Children/Toddler/


In [0]:
# grouped_df = top_lang_df.groupby('label').sum()
# grouped_df = grouped_df.sort_values('all', ascending=False)
# grouped_df = grouped_df.sort_values('all', axis=1, ascending=False)
# grouped_df

Unnamed: 0_level_0,en,all
label,Unnamed: 1_level_1,Unnamed: 2_level_1
all,108.0,108.0
Religious? Priest,23.0,23.0
Police,16.0,16.0
Athlete,12.0,12.0
King/Queen/Prince/Princess,11.0,11.0
Children/Toddler/,8.0,8.0
Doctor,8.0,8.0
Soldier,8.0,8.0
Wedding,8.0,8.0
Farmer,7.0,7.0


In [0]:
top_lang_df.to_csv('our_langs_ids.csv')

In [0]:
from google.colab import files
files.download('our_langs_ids.csv') 