In [2]:
import os
import duckdb
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
# ------------- io helpers -------------
def read_parquet_any(path: str) -> pd.DataFrame:
    """
    Read a parquet file with pyarrow, falling back to fastparquet.

    Parameters
    ----------
    path : str
        Location of the parquet file; passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        The table stored at ``path``.

    Raises
    ------
    ImportError
        If neither pyarrow nor fastparquet can be used by pandas.

    Notes
    -----
    Only a missing/unusable pyarrow installation (``ImportError``) triggers the
    fallback. Any other failure -- missing file, corrupt data, permissions --
    is raised as-is instead of being hidden behind a second attempt with a
    different engine.
    """
    try:
        return pd.read_parquet(path, engine="pyarrow")
    except ImportError:
        # pyarrow is not available to pandas: use the alternative engine.
        return pd.read_parquet(path, engine="fastparquet")


def write_parquet_any(df: pd.DataFrame, path: str) -> None:
    """
    Write ``df`` to parquet with pyarrow, falling back to fastparquet.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to persist. The index is not written (``index=False``).
    path : str
        Destination file; passed unchanged to ``DataFrame.to_parquet``.

    Raises
    ------
    ImportError
        If neither pyarrow nor fastparquet can be used by pandas.

    Notes
    -----
    Only a missing/unusable pyarrow installation (``ImportError``) triggers the
    fallback. Genuine write failures (permissions, full disk, unsupported
    column types) are raised as-is rather than silently retried with a
    different engine.
    """
    try:
        df.to_parquet(path, engine="pyarrow", index=False)
    except ImportError:
        # pyarrow is not available to pandas: use the alternative engine.
        df.to_parquet(path, engine="fastparquet", index=False)


In [4]:
# Show the kernel's working directory; relative paths used in this notebook resolve against it.
os.getcwd()

'/home/xpan02/CASNL/topic_recurrence'

In [5]:
# Dataset roots. The defaults are the shared project locations; export
# CANDOR_DATA_DIR / FRIENDS_DATA_DIR to run against another copy of the data
# without editing this cell.
CONVO_INPUT_DATA_PATH = os.environ.get('CANDOR_DATA_DIR', '/project/ycleong/datasets/CANDOR')
FRIENDS_INPUT_DATA_PATH = os.environ.get('FRIENDS_DATA_DIR', '/project/ycleong/datasets/Friends')

# input / output files
BACKBITER_PARQUET = os.path.join(CONVO_INPUT_DATA_PATH, 'chunk_topic-num.parquet')
SURVEY_PARQUET = os.path.join(CONVO_INPUT_DATA_PATH, 'survey.ALL.parquet')

FRIENDS_PARQUET = os.path.join(FRIENDS_INPUT_DATA_PATH, 'friends_chunk_topic-num.parquet')

# One row per text chunk with its assigned topic id (CANDOR conversations).
backbiter = read_parquet_any(BACKBITER_PARQUET)
# NOTE(review): presumably the CANDOR participant survey table -- schema not shown here.
survey = read_parquet_any(SURVEY_PARQUET)

# One row per text chunk with its assigned topic id (Friends transcripts).
friends = read_parquet_any(FRIENDS_PARQUET)

In [6]:
# Preview the CANDOR chunk table loaded from BACKBITER_PARQUET.
backbiter.head()


Unnamed: 0,chunk_id,conversation_id,chunk_text,topic
0,0,0020a0c5-1658-4747-99c1-2839e736b481,"Mhm. Mhm. Just, mm. And Uh huh, mm. Mhm. Mhm. ...",-1
1,1,0020a0c5-1658-4747-99c1-2839e736b481,this is actually my first one so uh yeah it's ...,-1
2,2,0020a0c5-1658-4747-99c1-2839e736b481,"thanks, Tiny firm sponge. I swear that's that'...",10
3,3,0020a0c5-1658-4747-99c1-2839e736b481,but I do really like sleep quite a bit and I h...,-1
4,4,0020a0c5-1658-4747-99c1-2839e736b481,that sounds really cute. So what are their nam...,10


In [7]:
# Total number of chunk rows in the CANDOR table.
len(backbiter)

84358

In [8]:
# Chunks per topic id, ordered by id. Topic -1 is the id that is excluded later
# when building `topics_no_noise`.
backbiter["topic"].value_counts().sort_index()

topic
-1      44809
 0       2211
 1       1932
 2       1830
 3       1701
        ...  
 364       10
 365       10
 366       10
 367       10
 368       10
Name: count, Length: 370, dtype: int64

In [9]:
# Preview the Friends chunk table loaded from FRIENDS_PARQUET.
friends.head()

Unnamed: 0,chunk_id_orig,n_words,end_turn_id,start_turn_id,episode_id,season_id,scene_id,chunk_id,chunk_text,topic,topic_words
0,0,117,10,1,s01_e01,s01,s01_e01_c01,0,There's nothing to tell! He's just some guy I ...,-1,
1,1,120,25,11,s01_e01,s01,s01_e01_c01,1,"Oh, yeah. Had that dream. Then I look down, an...",66,"dream, had, dreamt, he, bed, jack, night, saw,..."
2,2,109,39,26,s01_e01,s01,s01_e01_c01,2,Carol moved her stuff out today. Ohh. Let me g...,-1,
3,3,103,48,40,s01_e01,s01,s01_e01_c01,3,"Oh really, so that hysterical phone call I got...",63,"married, rachel, annulment, rog, angelica, ros..."
4,4,112,55,49,s01_e01,s01,s01_e01_c01,4,Oh God Monica hi! Thank God! I just went to yo...,-1,


In [10]:
# Total number of chunk rows in the Friends table.
len(friends)

7708

In [11]:
# Chunks per topic id in the Friends table, ordered by id.
friends["topic"].value_counts().sort_index()

topic
-1     4095
 0      317
 1      279
 2      128
 3      120
       ... 
 84      11
 85      11
 86      10
 87      10
 88      10
Name: count, Length: 90, dtype: int64

### Conversation

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import json
import numpy as np
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm

from scripts.parquet_helper import read_parquet_any

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
import sys, importlib.util
import transformers, accelerate, huggingface_hub

# Environment report: interpreter path, library versions, and where `accelerate`
# is resolved from. Emitted as one print call, one item per line.
print(
    f"python: {sys.executable}",
    f"transformers: {transformers.__version__}",
    f"accelerate: {accelerate.__version__}",
    f"huggingface_hub: {huggingface_hub.__version__}",
    f"accelerate spec: {importlib.util.find_spec('accelerate')}",
    sep="\n",
)


python: /software/python-anaconda-2023.09-el8-x86_64/envs/rapids-24.12/bin/python
transformers: 4.57.1
accelerate: 1.12.0
huggingface_hub: 0.35.3
accelerate spec: ModuleSpec(name='accelerate', loader=<_frozen_importlib_external.SourceFileLoader object at 0x7f88eb307800>, origin='/home/xpan02/.local/lib/python3.12/site-packages/accelerate/__init__.py', submodule_search_locations=['/home/xpan02/.local/lib/python3.12/site-packages/accelerate'])


In [14]:
# Load the instruction-tuned Llama model with 8-bit bitsandbytes quantization,
# then its tokenizer.
# NOTE(review): the saved output of this cell is an ImportError asking for a newer
# `bitsandbytes`. Until that package is installed/upgraded in the kernel's
# environment, `from_pretrained` raises and neither `model` nor `tok` is defined.
model_id = "meta-llama/Llama-3.3-70B-Instruct"
bnb = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.bfloat16,
    quantization_config=bnb,
)
tok = AutoTokenizer.from_pretrained(model_id)

ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
# Output file name (relative path). Presumably where the topic labels are
# written -- the writing code is not visible in this part of the notebook.
OUT_CSV = "topic-label_all.csv"

# Number of example chunks drawn per topic, and the seed that makes the draw repeatable.
# NOTE(review): this cell has no execution count and the saved output of the
# sampling cell reports 20 examples, not 10 -- restart and run all so the
# displayed results match these values.
N_PER_TOPIC = 10
SEED = 42

In [61]:
# Re-display of the CANDOR chunk table preview (same call as the earlier preview cell).
backbiter.head()

Unnamed: 0,chunk_id,conversation_id,chunk_text,topic
0,0,0020a0c5-1658-4747-99c1-2839e736b481,"Mhm. Mhm. Just, mm. And Uh huh, mm. Mhm. Mhm. ...",-1
1,1,0020a0c5-1658-4747-99c1-2839e736b481,this is actually my first one so uh yeah it's ...,-1
2,2,0020a0c5-1658-4747-99c1-2839e736b481,"thanks, Tiny firm sponge. I swear that's that'...",10
3,3,0020a0c5-1658-4747-99c1-2839e736b481,but I do really like sleep quite a bit and I h...,-1
4,4,0020a0c5-1658-4747-99c1-2839e736b481,that sounds really cute. So what are their nam...,10


In [63]:
# Distinct, non-null topic ids from the CANDOR table, in ascending order.
topics = backbiter["topic"].dropna().unique().tolist()
topics.sort()
# Peek at both ends of the id range.
(topics[:10], topics[-10:])

([-1, 0, 1, 2, 3, 4, 5, 6, 7, 8],
 [359, 360, 361, 362, 363, 364, 365, 366, 367, 368])

In [64]:
# Same id list without topic -1 (the id this notebook treats as noise).
topics_no_noise = [topic_id for topic_id in topics if topic_id != -1]
# Sizes with and without the -1 entry.
(len(topics), len(topics_no_noise))

(370, 369)

In [65]:
def sample_topic_texts(group: pd.DataFrame, n: int = 20, seed: int = 42) -> list[str]:
    """
    Draw up to ``n`` chunk texts from one topic's rows, without replacement.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to a single topic; must contain a ``chunk_text`` column.
    n : int
        Maximum number of texts to return.
    seed : int
        ``random_state`` for the draw, so repeated calls return the same texts.

    Returns
    -------
    list[str]
        The sampled texts as strings. Fewer than ``n`` items are returned when
        the topic has fewer non-null texts; an empty list when it has none.
    """
    texts = group["chunk_text"].dropna().astype(str)
    if texts.empty:
        return []
    # Cap by the number of texts that survived dropna(), not by len(group):
    # sampling without replacement raises if n exceeds the population size.
    return texts.sample(n=min(n, len(texts)), random_state=seed).tolist()

In [67]:
# Try the sampler on the first non-noise topic.
t = topics_no_noise[0]
group = backbiter.loc[backbiter["topic"] == t]
examples = sample_topic_texts(group, seed=SEED, n=N_PER_TOPIC)

# Number of sampled chunks and the start of the first one.
(len(examples), examples[0][:200])

(20,
 "people just don't know as much, so it's better for them to take their time and figure it out rather than just like going, spending too much money on school and having to be Square Hey stomach? No. Yea")

In [59]:
# Labelling prompt template, filled in with str.format by build_prompt.
# `{topic_id}` and `{chunk_examples}` are the only placeholders; any other literal
# brace added to this text must be doubled (`{{` / `}}`) or .format will fail.
# .strip() drops the newline right after the opening quotes and before the closing ones.
PROMPT = """
You are an expert annotator analyzing a latent conversation topic.
All the text chunks below come from the same topic.

### Topic ID: {topic_id}

### Example text chunks
{chunk_examples}

### Task
Based on these examples, infer the underlying topic.
Produce only a one-row Markdown table with:

- topic_id: {topic_id}
- short_label: a concise 2–5 word name
- summary: one sentence describing what people are doing or discussing in this topic
- keywords: 3–8 key words or phrases (comma separated)

### Output format (very important)
| topic_id | short_label | summary | keywords |
|----------|-------------|---------|----------|
| {topic_id} | ... | ... | ... |

Do not add extra commentary.
""".strip()


In [68]:
def build_prompt(topic_id: int, chunk_examples: list[str]) -> str:
    """
    Fill the module-level ``PROMPT`` template for one topic.

    Each example becomes one line of the form ``- <position>. <text>`` with
    positions starting at 1; the lines are joined with newlines and substituted
    for ``{chunk_examples}``, while ``topic_id`` replaces every ``{topic_id}``.
    """
    numbered_lines = []
    for position, chunk in enumerate(chunk_examples, start=1):
        numbered_lines.append(f"- {position}. {chunk}")
    return PROMPT.format(
        topic_id=topic_id,
        chunk_examples="\n".join(numbered_lines),
    )

# Build the prompt for the topic sampled above and show only its first 1500
# characters to keep the cell output short.
prompt = build_prompt(t, examples)
print(prompt[:1500])

You are an expert annotator analyzing a latent conversation topic.
All the text chunks below come from the same topic.

### Topic ID: 0

### Example text chunks
- 1. people just don't know as much, so it's better for them to take their time and figure it out rather than just like going, spending too much money on school and having to be Square Hey stomach? No. Yeah. Yeah. The bad thing about having cats in the apartment is they like to destroy everything? What? Yeah. crowd Oh my God, she's so bad. Hi. She's already destroyed All all the blinds in my apartment. Will not off. There's still one that's All going strong but I know for sure I have to replace the risk when I'm going away. Okay. apartment blinds don't really hold up very well.
- 2. I don't know, I'm scared for him because only because like when he sleeps, he snores like a person, I'm like, oh God, I'm like, it's getting worse. I'm like, it's because we also spayed or neutered, I'm not sure what the term is, but we did that to 