In [48]:
from pathlib import Path
import polars as rs

In [49]:
directory = "./responses-deepseek-ai-DeepSeek-R1-Distill-Llama-70B-free"

files = Path(directory).glob("*.txt")
responses = {str(file).replace(directory, "").replace(".txt", ""): {"text": file.read_text()} for file in files}

Label the responses with the returned answer. Delete anything we can't easily extract an answer from.

In [50]:
from transformers import AutoTokenizer

model = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free"
# "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model.replace("-free", ""))

In [51]:
delete_count = 0
too_long_count = 0
other_count = 0

for key, response in responses.items():
    answer = response['text'].replace("*", "").replace(".", "").strip()[-1]
    if answer in "ABCDEFGHIJ":
        response['label'] = answer
    else:


        # Extract the answer itself
        answer = response['text'].split("Answer:\n    <think>\n")[1]
        tokens = tokenizer(answer)['input_ids']

        # A lot of the time, the model just hit the output length limit.
        # If that's *NOT* the case, we print out the end of the response.
        if len(tokens) != 4097:
            print("-" * 30)
            print(key)
            print(len(tokens))
            print(response['text'][-50:])
            print("-" * 30)
            other_count += 1
        else:
            too_long_count += 1
            
        responses[key] = None
        delete_count += 1

filtered_responses = {k: v for k, v in responses.items() if v}

print(f"{delete_count} poorly labeled responses deleted.")
print(f"{too_long_count=}")
print(f"{other_count=}")

------------------------------
responses-deepseek-ai-DeepSeek-R1-Distill-Llama-70B-free/26183ed3-4bbe-4a59-9b12-87faf5de9e22
1148
refore, the correct answer is:

D: Greek Thinking.
------------------------------
------------------------------
responses-deepseek-ai-DeepSeek-R1-Distill-Llama-70B-free/e2fcdbda-f8f8-4f54-8d69-b5be35251c30
4096
it, perhaps I should look for the relation between
------------------------------
------------------------------
responses-deepseek-ai-DeepSeek-R1-Distill-Llama-70B-free/6ecbed10-add7-4d17-884c-d741fd0b92fe
1186
 A \) is \( 0 \).

The correct answer is **B: 0**.
------------------------------
------------------------------
responses-deepseek-ai-DeepSeek-R1-Distill-Llama-70B-free/2749a856-f567-4c0b-912c-fc4f6f1ad4c6
1358
x^5 + 8x - 7 \) has exactly one real root.

J: One
------------------------------
------------------------------
responses-deepseek-ai-DeepSeek-R1-Distill-Llama-70B-free/49524ebf-b551-4af2-ac39-e55b2f493a3f
3774
rox 0.592 \). The corr

Also add the label numerically.

In [52]:
for k, v in filtered_responses.items():
    v['num_label'] = ord(v['label']) - ord("A")

Reorder into a proper dataframe

In [53]:
structured_responses = {"uuid": [], "text": [], "label": [], "numeric label": []}

for uid, response in filtered_responses.items():
    structured_responses['uuid'] += [uid]
    structured_responses['text'] += [response['text']]
    structured_responses['label'] += [response['label']]
    structured_responses['numeric label'] += [response['num_label']]
    


df = rs.from_dict(structured_responses)
df

uuid,text,label,numeric label
str,str,str,i64
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""D""",3
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""B""",1
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""A""",0
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""C""",2
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""H""",7
…,…,…,…
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""G""",6
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""E""",4
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""B""",1
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""J""",9


In [55]:
df.write_ipc("responses_70b.arrow", compression='zstd')

In [56]:
df.sample(5)

uuid,text,label,numeric label
str,str,str,i64
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""E""",4
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""D""",3
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""F""",5
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""F""",5
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""C""",2


In [57]:
df.sample(int(len(df)) * 0.8)

uuid,text,label,numeric label
str,str,str,i64
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""J""",9
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""F""",5
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""G""",6
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""E""",4
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""H""",7
…,…,…,…
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""A""",0
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""C""",2
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""E""",4
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""G""",6


In [21]:
df[:int(len(df)*0.8)]

uuid,text,label,numeric label
str,str,str,i64
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""D""",3
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""B""",1
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""A""",0
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""C""",2
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""H""",7
…,…,…,…
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""J""",9
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""D""",3
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""B""",1
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""D""",3


In [22]:
df[0]

uuid,text,label,numeric label
str,str,str,i64
"""responses-deepseek-ai-DeepSeek…","""Answer the following multiple …","""D""",3


In [58]:
for i in range(50):
    sampled = df.sample(1)
    print(sampled['text'][0], f"label={sampled['label'][0]}")

Answer the following multiple choice question, selecting from the answer A through to J. After thinking, reply directly with your answer. 

The last character of your response must be the letter you have chosen.

Question:

Which of the following statements about presidential elections since 1972 is most accurate?

Options:

A: Voters increasingly vote based on a party's platform.
B: Voters increasingly vote based on a candidate's gender.
C: Voters have become less interested in politics.
D: Voters increasingly vote based on a candidate's personal life.
E: Elections have become less competitive.
F: Voters increasingly get their information from newspapers.
G: Voters have become more focused on individual candidates.

Answer:
    <think>
<think>
Okay, so I have this multiple-choice question about presidential elections since 1972, and I need to figure out the correct answer. Let me think through this step by step.

The question is asking which statement is most accurate regarding trends