from transformers import AutoTokenizer, LlamaForCausalLM
import pandas as pd
import numpy as np
import random

In [2]:
# Fix the RNG seed so the sampled prompts are reproducible across runs.
random.seed(a=2024)

In [3]:
# Load the two raw tables: state -> capital, and a per-city table with populations.
c = pd.read_csv("us-state-capitals.csv")
d = pd.read_csv("uscities.csv")

# Keep only the state name and the column that is used as the capital.
c = c[['name', 'description']]
c.columns = ['state', 'capital']

# Remove a trailing literal "<br>" tag from the capital.
# NOTE: str.rstrip('<br>') must not be used here: its argument is a *set of
# characters*, so it would also eat trailing 'b'/'r' letters of the city name
# itself (e.g. "Denver<br>" -> "Denve"). Slice off the exact suffix instead.
_BR_TAG = '<br>'
c['capital'] = c['capital'].apply(
    lambda x: x[:-len(_BR_TAG)] if x.endswith(_BR_TAG) else x
)

# For every state pick the row of its most populous city.
d = d.loc[d.groupby('state_name')['population'].idxmax()][['state_name', 'city']].reset_index(drop = True)
d.columns = ['state', 'largest']

# One row per state present in both tables: state, capital, largest city.
joined = pd.merge(c, d, on = 'state')

In [4]:
# States whose capital is not their most populous city.
_capital_differs = joined['capital'] != joined['largest']
diff_cap_larg = joined.loc[_capital_differs, 'state'].tolist()

In [5]:
# Remaining states, i.e. those where the capital is also the largest city
# (original row order of `joined` is preserved).
others = [state for state in joined['state'] if not (state in diff_cap_larg)]

In [6]:
# Sizes of the two state groups, displayed as a tuple.
tuple(len(group) for group in (diff_cap_larg, others))

(33, 17)

In [7]:
# Lookup table: state name -> capital (later entries win on duplicate states).
cc_dict = dict(zip(joined['state'], joined['capital']))

In [8]:
import os

# Access token for the Hugging Face Hub. It is read from the environment so that
# no credential is stored in the notebook; when HF_TOKEN is unset, None is passed
# and the library falls back to its own credential lookup (e.g. a cached login).
# SECURITY: a literal token used to be committed here -- it must be treated as
# leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

# Load the Llama-2 7B base checkpoint and its matching tokenizer.
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf",
                                         token = hf_token)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf",
                                          token = hf_token)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Creating a prompt

Each prompt contains 6 in-context examples, each consisting of a U.S. state followed by its capital city.

In order to avoid ambiguity of capital vs largest city, exactly 3 of the examples are from diff_cap_larg.

Final (queried) state of each prompt: for 500 prompts it is randomly selected from diff_cap_larg, and for another 500 prompts it is randomly selected from others.

In [9]:
def create_prompt_and_answer(type_):
    """Build one few-shot "state capital" prompt and its expected answer.

    The prompt has the form ``"<state> <capital>, ... , <query state>"``: six
    in-context examples followed by the state whose capital is being asked.
    Three examples come from ``diff_cap_larg`` and three from ``others``; the
    queried state is drawn from the group named by ``type_`` and never appears
    among the examples.

    Reads the module-level ``diff_cap_larg``, ``others`` and ``cc_dict`` and
    consumes the global ``random`` state.

    Args:
        type_: ``"diff_cap_larg"`` or ``"others"`` -- the group the queried
            state is sampled from.

    Returns:
        A ``(prompt, answer)`` tuple, where ``answer`` is ``cc_dict`` looked up
        for the queried state.

    Raises:
        ValueError: if ``type_`` is not one of the two supported group names
            (previously this surfaced as an UnboundLocalError).
    """
    if type_ == "diff_cap_larg":
        query_pool, other_pool = diff_cap_larg, others
    elif type_ == "others":
        query_pool, other_pool = others, diff_cap_larg
    else:
        raise ValueError(
            "type_ must be 'diff_cap_larg' or 'others', got " + repr(type_)
        )

    # RNG calls are made in the same order as before (sample 4, sample 3,
    # shuffle) so that a fixed seed keeps producing the same prompts.
    drawn = random.sample(query_pool, 4)
    state_qn = drawn[0]
    states_prompt = drawn[1:] + random.sample(other_pool, 3)
    random.shuffle(states_prompt)

    # "<state> <capital>" pairs, then the bare queried state, comma-separated.
    pieces = [state + ' ' + cc_dict[state] for state in states_prompt]
    pieces.append(state_qn)
    sentence = ', '.join(pieces)

    return (sentence, cc_dict[state_qn])

In [10]:
###### DIFF_CAP_LARG QUESTIONS
# Generate completions for 500 prompts whose queried state is drawn from
# diff_cap_larg, keeping the prompt, the expected capital and the model output.

dcl_prompts, dcl_keys, dcl_outputs = [], [], []

for i in range(500):

    # Progress marker every 10th prompt.
    if not i % 10:
        print(f'Starting {i}')

    prompt, key = create_prompt_and_answer(type_="diff_cap_larg")
    inputs = tokenizer(prompt, return_tensors="pt")
    # Number of prompt tokens (second dimension of the input_ids tensor).
    length = inputs['input_ids'].shape[1]
    # Total length is capped at the prompt plus 12 tokens.
    generate_ids = model.generate(inputs.input_ids, max_length=length + 12)
    # Decode only what comes after the prompt tokens.
    output = tokenizer.batch_decode(
        generate_ids[:, length:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    dcl_prompts.append(prompt)
    dcl_keys.append(key)
    dcl_outputs.append(output)

Starting 0
Starting 10
Starting 20
Starting 30
Starting 40
Starting 50
Starting 60
Starting 70
Starting 80
Starting 90
Starting 100
Starting 110
Starting 120
Starting 130
Starting 140
Starting 150
Starting 160
Starting 170
Starting 180
Starting 190
Starting 200
Starting 210
Starting 220
Starting 230
Starting 240
Starting 250
Starting 260
Starting 270
Starting 280
Starting 290
Starting 300
Starting 310
Starting 320
Starting 330
Starting 340
Starting 350
Starting 360
Starting 370
Starting 380
Starting 390
Starting 400
Starting 410
Starting 420
Starting 430
Starting 440
Starting 450
Starting 460
Starting 470
Starting 480
Starting 490


In [11]:
###### OTHERS QUESTIONS
# Generate completions for 500 prompts whose queried state is drawn from
# others, keeping the prompt, the expected capital and the model output.

oth_prompts, oth_keys, oth_outputs = [], [], []

for i in range(500):

    # Progress marker every 10th prompt.
    if not i % 10:
        print(f'Starting {i}')

    prompt, key = create_prompt_and_answer(type_="others")
    inputs = tokenizer(prompt, return_tensors="pt")
    # Number of prompt tokens (second dimension of the input_ids tensor).
    length = inputs['input_ids'].shape[1]
    # Total length is capped at the prompt plus 12 tokens.
    generate_ids = model.generate(inputs.input_ids, max_length=length + 12)
    # Decode only what comes after the prompt tokens.
    output = tokenizer.batch_decode(
        generate_ids[:, length:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    oth_prompts.append(prompt)
    oth_keys.append(key)
    oth_outputs.append(output)

Starting 0
Starting 10
Starting 20
Starting 30
Starting 40
Starting 50
Starting 60
Starting 70
Starting 80
Starting 90
Starting 100
Starting 110
Starting 120
Starting 130
Starting 140
Starting 150
Starting 160
Starting 170
Starting 180
Starting 190
Starting 200
Starting 210
Starting 220
Starting 230
Starting 240
Starting 250
Starting 260
Starting 270
Starting 280
Starting 290
Starting 300
Starting 310
Starting 320
Starting 330
Starting 340
Starting 350
Starting 360
Starting 370
Starting 380
Starting 390
Starting 400
Starting 410
Starting 420
Starting 430
Starting 440
Starting 450
Starting 460
Starting 470
Starting 480
Starting 490


In [12]:
# Collect the diff_cap_larg run into one table (one row per prompt) and save it.
dcl_df = pd.DataFrame(
    {
        'prompts': dcl_prompts,
        'keys': dcl_keys,
        'outputs': dcl_outputs,
    },
    columns=['prompts', 'keys', 'outputs'],
)

dcl_df.to_csv('dcl_df_state.csv')

In [13]:
# Collect the others run into one table (one row per prompt) and save it.
oth_df = pd.DataFrame(
    {
        'prompts': oth_prompts,
        'keys': oth_keys,
        'outputs': oth_outputs,
    },
    columns=['prompts', 'keys', 'outputs'],
)

oth_df.to_csv('oth_df_state.csv')

### Section 2: Motivation

In [15]:
# Few-shot demo: three "<English animal> <translation>" pairs (the second words
# look like Indonesian/Malay -- not verified here), then the query word "elephant".
prompt = "dog anjing, cat kucing, lion singa, elephant"

inputs = tokenizer(prompt, return_tensors="pt")
# Prompt length in tokens; generation may add at most 12 more.
length = inputs['input_ids'].shape[1]
generate_ids = model.generate(inputs.input_ids, max_length=length + 12)
# Keep only the tokens produced after the prompt.
output = tokenizer.batch_decode(
    generate_ids[:, length:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

output

'gajah, horse kuda, mouse kucing'

In [18]:
# Same few-shot demo as the "elephant" prompt, but querying the word "tiger".
prompt = "dog anjing, cat kucing, lion singa, tiger"

inputs = tokenizer(prompt, return_tensors="pt")
# Prompt length in tokens; generation may add at most 12 more.
length = inputs['input_ids'].shape[1]
generate_ids = model.generate(inputs.input_ids, max_length=length + 12)
# Keep only the tokens produced after the prompt.
output = tokenizer.batch_decode(
    generate_ids[:, length:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

output

'harimau, bear beruang, rabbit b'

In [19]:
# Same three example pairs, but the final word ("soon") is not an animal name
# like the example words before it.
prompt = "dog anjing, cat kucing, lion singa, soon"

inputs = tokenizer(prompt, return_tensors="pt")
# Prompt length in tokens; generation may add at most 12 more.
length = inputs['input_ids'].shape[1]
generate_ids = model.generate(inputs.input_ids, max_length=length + 12)
# Keep only the tokens produced after the prompt.
output = tokenizer.batch_decode(
    generate_ids[:, length:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

output

'-to-be-published, traditional Chinese\nWhen'

In [20]:
# Same three example pairs, with "main" as the final word.
prompt = "dog anjing, cat kucing, lion singa, main"

inputs = tokenizer(prompt, return_tensors="pt")
# Prompt length in tokens; generation may add at most 12 more.
length = inputs['input_ids'].shape[1]
generate_ids = model.generate(inputs.input_ids, max_length=length + 12)
# Keep only the tokens produced after the prompt.
output = tokenizer.batch_decode(
    generate_ids[:, length:],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

output

'an, and more\nEpisode 1 - Bers'