In [2]:
from pathlib import Path
import polars as rs

In [27]:
files = Path("./responses").glob("*.txt")

responses = {str(file).replace("responses/", "").replace(".txt", ""): {"text": file.read_text()} for file in files}


Label the responses with the returned answer. Delete anything we can't easily extract an answer from.

In [28]:
delete_count = 0

for key, response in responses.items():
    answer = response['text'].replace("*", "").replace(".", "").strip()[-1]
    if answer in "ABCDEFGHIJ":
        response['label'] = answer
    else:
        print("-" * 30)
        print(response['text'].strip()[-20:])
        print("-" * 30)
        responses[key] = None
        delete_count += 1

filtered_responses = {k: v for k, v in responses.items() if v}

print(f"{delete_count} poorly labeled responses deleted.")

------------------------------

H: transverse waves
------------------------------
------------------------------
l Answer:
A: 1.00002
------------------------------
------------------------------
e.

Answer:
A: Valid
------------------------------
------------------------------
 is:

**E: 200 m/s**
------------------------------
------------------------------
think> ( ( ( ( ( ( (
------------------------------
------------------------------
: 490.0**.

H: 490.0
------------------------------
------------------------------
wer is J.

J: 30 m/s
------------------------------
------------------------------
F: parathyroid gland
------------------------------
------------------------------
and 1.13 slugs/sec**
------------------------------
------------------------------


**Answer: B: 50%**
------------------------------
------------------------------
ts.

Answer: H

H: 9
------------------------------
------------------------------
.

F: Durable goods.
------------------------------
----

Also add the label numerically.

In [31]:
for k, v in filtered_responses.items():
    v['num_label'] = ord(v['label']) - ord("A")

Reorder into a proper dataframe

In [38]:
structured_responses = {"uuid": [], "text": [], "label": [], "numeric label": []}

for uid, response in filtered_responses.items():
    structured_responses['uuid'] += [uid]
    structured_responses['text'] += [response['text']]
    structured_responses['label'] += [response['label']]
    structured_responses['numeric label'] += [response['num_label']]
    


df = rs.from_dict(structured_responses)
df

uuid,text,label,numeric label
str,str,str,i64
"""8cd2e5b4-f9fa-4977-9b62-356da9…","""Answer the following multiple …","""G""",6
"""3753ab9c-ae45-4d8b-8d61-a391b6…","""Answer the following multiple …","""G""",6
"""4f261530-6995-4dc0-8a87-a0e658…","""Answer the following multiple …","""C""",2
"""eb8ba016-48f2-4491-a0e3-3f751a…","""Answer the following multiple …","""H""",7
"""91fd60a8-ac44-428f-876b-eeeba0…","""Answer the following multiple …","""F""",5
…,…,…,…
"""9c31039f-a3e5-4100-8dce-a4e2a1…","""Answer the following multiple …","""A""",0
"""ec5b25bf-64fc-4e76-909a-b43920…","""Answer the following multiple …","""C""",2
"""95663208-b120-4973-9db2-dfad71…","""Answer the following multiple …","""B""",1
"""b797003d-cf86-4c25-b556-1aca07…","""Answer the following multiple …","""J""",9


In [42]:
df.write_ipc("responses.arrow", compression='zstd')

In [48]:
df.sample(5)

uuid,text,label,numeric label
str,str,str,i64
"""f4a40d73-9a7b-41f6-9fbf-f1c5c1…","""Answer the following multiple …","""D""",3
"""444ed118-5077-4616-8518-a0b57c…","""Answer the following multiple …","""C""",2
"""f1f38b68-04f9-4a37-bdec-efc621…","""Answer the following multiple …","""J""",9
"""0e9fc4de-37b3-47cd-9558-d1253d…","""Answer the following multiple …","""B""",1
"""2d32d8fa-34f4-4c26-9458-216bf7…","""Answer the following multiple …","""F""",5


In [49]:
df.sample(int(len(df)) * 0.8)

uuid,text,label,numeric label
str,str,str,i64
"""ddcad0c5-d75a-412b-80a1-dc5900…","""Answer the following multiple …","""C""",2
"""4d9eb9a4-8d0a-4298-a819-cd6d06…","""Answer the following multiple …","""D""",3
"""ca9ad8d8-7ff6-4e45-9af9-682dbc…","""Answer the following multiple …","""F""",5
"""934a6218-99cb-4419-81a5-ca8d76…","""Answer the following multiple …","""D""",3
"""583c5c4e-0b71-4c73-bf42-cd3dd2…","""Answer the following multiple …","""J""",9
…,…,…,…
"""a5990348-1fe6-4348-b4db-38abb8…","""Answer the following multiple …","""B""",1
"""f58469b7-d4c3-4658-9feb-c74a18…","""Answer the following multiple …","""B""",1
"""97349fa5-312b-4a1a-bdf9-4f36f0…","""Answer the following multiple …","""C""",2
"""1cdc3af1-e129-4574-8658-084802…","""Answer the following multiple …","""G""",6


In [51]:
df[:int(len(df)*0.8)]

uuid,text,label,numeric label
str,str,str,i64
"""8cd2e5b4-f9fa-4977-9b62-356da9…","""Answer the following multiple …","""G""",6
"""3753ab9c-ae45-4d8b-8d61-a391b6…","""Answer the following multiple …","""G""",6
"""4f261530-6995-4dc0-8a87-a0e658…","""Answer the following multiple …","""C""",2
"""eb8ba016-48f2-4491-a0e3-3f751a…","""Answer the following multiple …","""H""",7
"""91fd60a8-ac44-428f-876b-eeeba0…","""Answer the following multiple …","""F""",5
…,…,…,…
"""ecd8cd7c-fa51-4bfb-a144-f3b077…","""Answer the following multiple …","""C""",2
"""d3d32df1-0724-4552-a3be-03a2ee…","""Answer the following multiple …","""D""",3
"""7b32af92-1739-468b-a247-fd3bdf…","""Answer the following multiple …","""E""",4
"""e5228163-247d-4166-907f-efbac5…","""Answer the following multiple …","""C""",2


In [52]:
df[0]

uuid,text,label,numeric label
str,str,str,i64
"""8cd2e5b4-f9fa-4977-9b62-356da9…","""Answer the following multiple …","""G""",6


In [60]:
df.filter(rs.col("text").tail(1) in "ABCDEFGHIJ")

TypeError: 'in <string>' requires string as left operand, not Expr

uuid,text,label,numeric label
str,str,str,i64
"""8cd2e5b4-f9fa-4977-9b62-356da9…","""Answer the following multiple …","""G""",6
"""3753ab9c-ae45-4d8b-8d61-a391b6…","""Answer the following multiple …","""G""",6
"""4f261530-6995-4dc0-8a87-a0e658…","""Answer the following multiple …","""C""",2
"""eb8ba016-48f2-4491-a0e3-3f751a…","""Answer the following multiple …","""H""",7
"""91fd60a8-ac44-428f-876b-eeeba0…","""Answer the following multiple …","""F""",5
…,…,…,…
"""9c31039f-a3e5-4100-8dce-a4e2a1…","""Answer the following multiple …","""A""",0
"""ec5b25bf-64fc-4e76-909a-b43920…","""Answer the following multiple …","""C""",2
"""95663208-b120-4973-9db2-dfad71…","""Answer the following multiple …","""B""",1
"""b797003d-cf86-4c25-b556-1aca07…","""Answer the following multiple …","""J""",9


In [62]:
df

uuid,text,label,numeric label
str,str,str,i64
"""8cd2e5b4-f9fa-4977-9b62-356da9…","""Answer the following multiple …","""G""",6
"""3753ab9c-ae45-4d8b-8d61-a391b6…","""Answer the following multiple …","""G""",6
"""4f261530-6995-4dc0-8a87-a0e658…","""Answer the following multiple …","""C""",2
"""eb8ba016-48f2-4491-a0e3-3f751a…","""Answer the following multiple …","""H""",7
"""91fd60a8-ac44-428f-876b-eeeba0…","""Answer the following multiple …","""F""",5
…,…,…,…
"""9c31039f-a3e5-4100-8dce-a4e2a1…","""Answer the following multiple …","""A""",0
"""ec5b25bf-64fc-4e76-909a-b43920…","""Answer the following multiple …","""C""",2
"""95663208-b120-4973-9db2-dfad71…","""Answer the following multiple …","""B""",1
"""b797003d-cf86-4c25-b556-1aca07…","""Answer the following multiple …","""J""",9


In [64]:
import torch as t
ff = t.tensor([[0.1032, 0.0100, 0.0417, 0.0252, 0.0906, 0.0126, 0.4867, 0.0140, 0.0821,
         0.1340],
        [0.1581, 0.0237, 0.0542, 0.1459, 0.0387, 0.0236, 0.4146, 0.0337, 0.0619,
         0.0456]])

In [68]:
ff[range(2), [3, 2]]

tensor([0.0252, 0.0542])