## Loading Data

In [9]:
import glob

langs = ['en', 'de', 'fi', 'ja']

tasks = ['negation1', 'negation2', 
         'numerals1', 'numerals2', 
         'spatial1', 'spatial2', 
         'temporal1', 'temporal2', 
         'comparative1', 'comparative2',
         'zh_ja_counters']

files = [{'task': task, 'lang': lang, 'path': file} for lang in langs for task in tasks for file in glob.glob(f'/home/norrman/GitHub/multi-morph-checklist/export/{task}*/{lang}.txt')]

files

[{'task': 'negation1',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/negation1/en.txt'},
 {'task': 'negation2',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/negation2/en.txt'},
 {'task': 'numerals1',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/numerals1/en.txt'},
 {'task': 'numerals2',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/numerals2/en.txt'},
 {'task': 'spatial1',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/spatial1/en.txt'},
 {'task': 'spatial2',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/spatial2/en.txt'},
 {'task': 'temporal1',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/temporal1/en.txt'},
 {'task': 'temporal2',
  'lang': 'en',
  'path': '/home/norrman/GitHub/multi-morph-checklist/export/temporal2/en.txt'},
 {'task': 'comparative1',
  'lang': 'en',
  

In [10]:
import json
import os
path = '/home/norrman/GitHub/multi-morph-checklist/export/M2C.json'

if not os.path.isfile(path):
    data = {}
    ind = 0
    for doc in files:
        with open(doc['path'], 'r') as f:
            for line in f:
                context, question, answer = line.strip().replace('C: ', '').replace('Q: ', '|').replace('A: ', '|').split('|')
                prefix = {'ja': '日本語で答えてください。',
                          'en': 'Please answer the question in English.',
                          'de': 'Bitte beantworten Sie die Frage auf Deutsch.',
                          'fi': 'Vastaa kysymykseen suomeksi.'}
                data[ind] = {
                              'task': doc['task'],
                              'lang': doc['lang'],
                              'prefix': prefix[doc['lang']],
                              'context': context,
                              'question': question,
                              'answer': answer,
                              'prompt': f'{prefix[doc["lang"]]} C: {context} Q: {question} A:'
                            }
                ind += 1


    json.dump(data, open(path, 'w'))
else:
    data = json.load(open('/home/norrman/GitHub/multi-morph-checklist/export/M2C.json', 'r'))

## Running Model

In [11]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

XGLM_tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M", padding_side='left')
XGLM_model = AutoModelForCausalLM.from_pretrained("facebook/xglm-564M")
XGLM_model = XGLM_model.to(device='cuda')

XGLM_model

  from .autonotebook import tqdm as notebook_tqdm


XGLMForCausalLM(
  (model): XGLMModel(
    (embed_tokens): Embedding(256008, 1024, padding_idx=1)
    (embed_positions): XGLMSinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-23): 24 x XGLMDecoderLayer(
        (self_attn): XGLMAttention(
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (activation_fn): GELUActivation()
        (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (fc1): Linear(in_features=1024, out_features=4096, bias=True)
        (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine

In [12]:
from collections import Counter

counter = Counter()
for item in data.values():
    counter[(item['task'], item['lang'])] += 1

counter

len(data)

30000

In [13]:
import tqdm

for id, item in tqdm.tqdm(data.items(), total=len(data)):
    inputs = XGLM_tokenizer(item['prompt'], return_tensors='pt', padding=True)
    inputs = inputs.to(device='cuda')
    outputs = XGLM_model.generate(**inputs, 
                                max_new_tokens=15,
                                num_beams=2)

    data[id]['response'] = XGLM_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][len(item['prompt'])-1:]

  0%|          | 0/30000 [00:00<?, ?it/s]

100%|██████████| 30000/30000 [1:46:24<00:00,  4.70it/s]


In [14]:
data

{'0': {'task': 'negation1',
  'lang': 'en',
  'prefix': 'Please answer the question in English.',
  'context': 'Susan is not an architect, but Lola is. ',
  'question': 'Who is an architect? ',
  'answer': 'Lola.',
  'prompt': 'Please answer the question in English. C: Susan is not an architect, but Lola is.  Q: Who is an architect?  A:',
  'response': 'Susan is an architect, Lola is an architect. Q: What is the'},
 '1': {'task': 'negation1',
  'lang': 'en',
  'prefix': 'Please answer the question in English.',
  'context': 'John is not a lawyer, but Sophia and Charlotte are. ',
  'question': 'Who are lawyers? ',
  'answer': 'Sophia and Charlotte.',
  'prompt': 'Please answer the question in English. C: John is not a lawyer, but Sophia and Charlotte are.  Q: Who are lawyers?  A:',
  'response': 'John is a lawyer, but Sophia and Charlotte are lawyers. Q: What'},
 '2': {'task': 'negation1',
  'lang': 'en',
  'prefix': 'Please answer the question in English.',
  'context': 'Kirk is not a 

In [15]:
path = '/home/norrman/GitHub/multi-morph-checklist/export/M2C_XGLM.json'

json.dump(data, open(path, 'w'), indent=2)

In [6]:
import json
data = json.load(open('/home/norrman/GitHub/multi-morph-checklist/export/M2C_XGLM.json', 'r'))

In [7]:
japanese_data = {id: item for id, item in data.items() if item['lang'] == 'ja' and item['task'] == 'negation2'}

In [8]:
japanese_data

{'16500': {'task': 'negation2',
  'lang': 'ja',
  'prefix': '日本語で答えてください。',
  'context': '明梨と優花は弁護士、結衣と智子は研究員です。',
  'question': '弁護士でないのは誰ですか。',
  'answer': '結衣と智子です。',
  'prompt': '日本語で答えてください。 C: 明梨と優花は弁護士、結衣と智子は研究員です。 Q: 弁護士でないのは誰ですか。 A:',
  'response': ': 明梨と優花は弁護士です。 Q: 優'},
 '16501': {'task': 'negation2',
  'lang': 'ja',
  'prefix': '日本語で答えてください。',
  'context': '真由美と敬子は警官、良子と桜は監督です。',
  'question': '警官でないのは誰ですか。',
  'answer': '良子と桜です。',
  'prompt': '日本語で答えてください。 C: 真由美と敬子は警官、良子と桜は監督です。 Q: 警官でないのは誰ですか。 A:',
  'response': ': 警官は、警察官の役割を果たすために'},
 '16502': {'task': 'negation2',
  'lang': 'ja',
  'prefix': '日本語で答えてください。',
  'context': '笑子は教師、海斗は調理師です。',
  'question': '教師でないのは誰ですか。',
  'answer': '海斗です。',
  'prompt': '日本語で答えてください。 C: 笑子は教師、海斗は調理師です。 Q: 教師でないのは誰ですか。 A:',
  'response': ': 海斗です。 Q: 海斗は、教師ではありませんか'},
 '16503': {'task': 'negation2',
  'lang': 'ja',
  'prefix': '日本語で答えてください。',
  'context': '富美子は歌手、龍は弁護士です。',
  'question': '歌手でないのは誰ですか。',
  'answer': '龍です。',
  'prompt': '日本語