In [None]:
!pip install datasets
!pip install together

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset, Dataset

from google.colab import drive
drive.mount('/content/drive', force_remount = True)
import os
os.chdir('/content/drive/MyDrive/nlp_final_proj/data')

import re
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from string import punctuation
import gensim
from gensim import corpora


from openai import OpenAI
# Here we use OpenAI's api
client1 = OpenAI(
    api_key=""
)

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def tokenize_text(text):
    text = re.sub(r"\s+", " ", text.strip())
    tokens = nltk.word_tokenize(text)
    punctuation_set = set(punctuation)


    stopwords_en = set(stopwords.words("english"))
    stopwords_en.update(["'s","''","``",'also','–','||','‘',"n't",'’',"'m",'go','...','could','like','get',"'ve",'would',"'re",'one'])
    cleaned_tokens = []
    for token in tokens:
        token_lower = token.lower()
        if token_lower not in punctuation_set and token_lower not in stopwords_en:
            cleaned_tokens.append(token_lower)
    return cleaned_tokens

def analyze_tokens(tokens, top_n=10, bottom_n=10):
    """
    Analyzes tokens by counting frequencies and printing various statistics.
    """
    counter = Counter(tokens)

    total_tokens = sum(counter.values())
    vocab_size = len(counter)

    print(f"Total tokens: {total_tokens}")
    print(f"Vocabulary size: {vocab_size}")
    print(f"Top {top_n} most frequent tokens:")
    for word, freq in counter.most_common(top_n):
        print(f"  {word} : {freq}")

    print(f"\n{bottom_n} rarest tokens :")
    for word, freq in counter.most_common()[-bottom_n:]:
        print(f"  {word} : {freq}")

    rare_tokens = [word for word, freq in counter.items() if freq == 1]
    rare_tokens_5 = [word for word, freq in counter.items() if freq < 5]
    print('\n')
    print(f"Number of singletons: {len(rare_tokens)}")
    print(f"Percentage of singletons: {(len(rare_tokens)/total_tokens):.2f}")
    print(f"Percentage of tokens mentioned less than 5 times: {(len(rare_tokens_5)/total_tokens):.2f}")
    return counter

def analyze_sequence_lengths(text_list):
    all_lengths = []
    text_seqs = []
    for text in text_list:
        sequences = sent_tokenize(text)
        text_seqs.append(len(sequences))

        for seq in sequences:
            tokens = word_tokenize(seq)
            seq_length = len(tokens)
            all_lengths.append(seq_length)

    min_length = min(all_lengths)
    max_length = max(all_lengths)
    avg_length = np.array(all_lengths).mean()
    median_length = np.median(np.array(all_lengths))
    avg_seq_text = np.array(text_seqs).mean()

    print('\n')
    print(f"Number of sequences: {len(all_lengths)}")
    print(f"Min sequence length:  {min_length}")
    print(f"Max sequence length:  {max_length}")
    print(f"Avg sequence length:  {avg_length:.2f}")
    print(f"Avg number of sequences per text:  {avg_seq_text:.2f}")
    print(f"Median sequence length: {median_length:.2f}")
    print('\n')

    return all_lengths

def build_lda_model(documents, num_topics=10, passes=10, num_words=20):
    tokenized_docs = [tokenize_text(doc) for doc in documents]
    dictionary = corpora.Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    lda_model = gensim.models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=42,
        passes=passes,
        update_every=1,
        chunksize=100
    )

    for idx, topic in lda_model.show_topics(formatted=False, num_words=num_words):
      prompt = (
          f"Summarize a topic characterized by these top words from a LDA model trained on wikipedia dataset: "
          f"{', '.join([term for term, _ in topic])}.\n"
          "Give me a few words (2-3) to summaize all words given above."
      )

      response = client.chat.completions.create(
          model="gpt-4o",
          messages=[
              {"role": "developer", "content": "You are a helpful assistant."},
              {"role": "user", "content": prompt}
          ]
      )

      summary = response.choices[0].message.content.strip()
      print(f"Topic {idx} (Top words: {' '.join([term for term, _ in topic])})")
      print("Summary:", summary)
      print()


    return lda_model, dictionary, corpus

def starts(s: str) -> bool:
    """
    Returns True if the string s starts with a digit (0-9) or a letter (A-Za-z),
    otherwise False.
    """
    pattern = r'^[0-9A-Za-z]'
    return bool(re.match(pattern, s))

In [None]:
ds = load_dataset("ju-resplande/askD")

data = pd.concat([pd.DataFrame(ds['train_en']),
                  pd.DataFrame(ds['validation_en']),
                  pd.DataFrame(ds['test_en'])]).sample(frac = 1,replace = False).reset_index(drop = True)

data['output'] = data['answers'].apply(lambda x: x['text'])
data['score'] = data['answers'].apply(lambda x: x['score'])
data = data[data['selftext'] != '[deleted]']
data['question_id'] = range(len(data))

expand_data = []
for idx, row in data.iterrows():
  for output, score in zip(row['output'], row['score']):
    expand_data.append({
          "question_id": row['question_id'],
          "title": row['title'],
          "question": row['selftext'],
          "output": output,
          "score": score
      })

expand_data = pd.DataFrame(expand_data)
expand_data['output_len'] = expand_data['output'].apply(lambda x: len(tokenize_text(x)))
expand_data = expand_data[expand_data['output_len'] > 25]
expand_data = expand_data[expand_data['output'].apply(lambda x: 'URL' not in x and 'http' not in x and 'dtype' not in x)]
expand_data = expand_data[expand_data['question'].apply(lambda x: 'URL' not in x and 'http' not in x and 'dtype' not in x)]
expand_data = expand_data[expand_data['output'].apply(lambda x: starts(x))]
expand_data = expand_data.sort_values('score', ascending = False).iloc[:20000].sample(frac = 1, replace = False).reset_index(drop = True)
train_data = expand_data[expand_data['question_id'] <= 21738].reset_index(drop = True)
better_train = train_data.sort_values('score', ascending = False).iloc[:5000].sample(frac = 1, replace = False).reset_index(drop = True)
test_data = expand_data[expand_data['question_id'] > 21738].reset_index(drop = True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

askD.py:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

The repository for ju-resplande/askD contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ju-resplande/askD.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/45.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/49.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/180M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/165M [00:00<?, ?B/s]

Generating train_pt split: 0 examples [00:00, ? examples/s]

Generating train_en split: 0 examples [00:00, ? examples/s]

Generating validation_pt split: 0 examples [00:00, ? examples/s]

Generating validation_en split: 0 examples [00:00, ? examples/s]

Generating test_pt split: 0 examples [00:00, ? examples/s]

Generating test_en split: 0 examples [00:00, ? examples/s]

Generating external_pt split: 0 examples [00:00, ? examples/s]

Generating external_en split: 0 examples [00:00, ? examples/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['question_id'] = range(len(data))


In [None]:
test_data[['title','question','output','score']].rename(columns = {"question":'selftext','output':'answers'})

Unnamed: 0,title,selftext,answers,score
0,Any alarm in what the computer recommends vers...,Sorry if this isn’t the correct place to ask t...,Last week one of my colleagues had a computer ...,7
1,A girl I know got hit in the back with a hamme...,"So there's this girl I know, she's 15, fairly ...","Thank you all for your thoughts and help, I ca...",26
2,"Vomiting bile every morning, can't eat until l...",I'm a 20 year old male. I've had a loss of mor...,Have you had any sort of imaging done? You cou...,5
3,Sister is Sick Weeks after C-Section. Does any...,Dear Reddit Community:\n\nI hope in some way t...,Firstly sorry she is going through this.\n\nTh...,42
4,"[SERIOUS] Dad had a Cardiac Arrest, chance of ...",Dad had a heart attack earlier this week (sund...,"Obviously every situation is different, but to...",2
...,...,...,...,...
4343,Stage 4 lung Adenocarcinoma- Male- 6’- 230ish ...,This is actually about my husband. He was pres...,Are you certain it's the oxycodone? If so is i...,4
4344,Small bump near my nipple?,Ok well this is awkward....\n\n(16/M)\n\n\nTod...,Yeah...not normal for a mom to do that...\n\nD...,2
4345,Is it safe to take lactaid pills when i eat da...,i have a feeling i am but ummm a very picky ea...,"Yes, although I tried this and still within ha...",4
4346,A Serious cry for help!,Hello\n\nI have been having leg pains for near...,It sounds like your physician is ordering a CT...,3


In [None]:
PROMPT1 = """You are a knowledgeable and empathetic medical expert responding to patient questions. Your goal is to generate concise, accurate, and helpful advice in a single paragraph, reflecting the tone and clarity of highly rated AskDoc answers from reddit. When crafting your response:
- Provide medically accurate, evidence-based insights.
- Offer practical steps or remedies and recommend a doctor’s visit if necessary.
- Use clear, non-technical language while retaining authority.
- Output must be exactly one paragraph, containing 50-250 words in total.

Patient’s AskDoc post content:
- Title: {title}.
- Additional Information: {question}.
"""

PROMPT2 = """You are a supportive community member on a health related forum like Reddit’s AskDoc. Your goal is to provide concise, accurate, and empathetic replies in a single paragraph, reflective of both experienced patient insights or professional medical perspectives. When crafting your response:
- Offer well-researched, evidence-based information whenever possible.
- Include practical suggestions or remedies, and recommend seeing a healthcare provider when necessary.
- Use clear, accessible language while being mindful not to overstep your level of expertise.
- Your reply must be exactly one paragraph, containing 50-250 words in total.

Patient’s AskDoc post content:
- Title: {title}.
- Additional Information: {question}.
"""

train_data['prompt1'] = train_data.apply(lambda x:PROMPT1.format(title = x['title'], question = x['question']), axis = 1)
train_data['prompt2'] = train_data.apply(lambda x:PROMPT2.format(title = x['title'], question = x['question']), axis = 1)
better_train['prompt1'] = better_train.apply(lambda x:PROMPT1.format(title = x['title'], question = x['question']), axis = 1)
better_train['prompt2'] = better_train.apply(lambda x:PROMPT2.format(title = x['title'], question = x['question']), axis = 1)
test_data['prompt1'] = test_data.apply(lambda x:PROMPT1.format(title = x['title'], question = x['question']), axis = 1)
test_data['prompt2'] = test_data.apply(lambda x:PROMPT2.format(title = x['title'], question = x['question']), axis = 1)

In [10]:
better_train

Unnamed: 0,question_id,title,question,output,score,output_len,prompt1,prompt2
0,2345,"I got tasered three days ago, and I feel no be...","Last Friday, a group of kids (yes, they all we...",You were tasered and told the police and they ...,9,56,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
1,21202,"50+DAYS OF SEIZURES, UNKNOWN DIAGNOSIS, SEDATE...",25YRS OLD. MALE. AUSTIN TEXAS. SEDATED FOR 50+...,Unfortunately I don’t think you’re going to fi...,13,56,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
2,15262,Hit my head *very* hard and now I'm worried.,"I'm a Caucasian, 20 year old male, 200 lbs and...",MD here. These are all concerning things. Plea...,13,75,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
3,17710,I’ve never been able to insert anything in my ...,Mandatory info:\n\n• Age: 20\n• Sex: Female\n•...,"This is going to sound weird, but are you cert...",24,26,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
4,18174,"6 y/o son has unexplained fevers, failure to g...",• Age 6 years \n • Sex: male\n • Height: 42.5 ...,Not a doctor but my son had these symptoms as ...,7,69,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
...,...,...,...,...,...,...,...,...
4995,11643,Why do I smell honey after I sneeze?,I sneeze a lot. It's a thing that runs in the ...,Well...I'm not a doctor. But I do have Type 1 ...,7,90,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
4996,7097,"No sensation of needing to urinate, despite pe...","Alright, this is a little embarrassing but I'm...",It's possible it's related to the celiac disea...,9,26,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
4997,5835,What is the point of getting the pre-exposure ...,Age: 23\n\nSex: Female\n\nWeight: 133\n\nHeigh...,I believe the difference is that if you get pr...,13,27,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...
4998,8244,Alcoholism help,I’m a 25 year old male 165lb 5’9 Colorado USA....,Does this good paying job have health insuranc...,6,27,You are a knowledgeable and empathetic medical...,You are a supportive community member on a hea...


In [None]:
jsonl_data = []
for _, row in train_data.iterrows():
    entry = {
        "prompt": row["prompt2"],
        "completion": row["output"]
    }
    jsonl_data.append(entry)

output_file = "fine_tuning_train_data.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

In [None]:
jsonl_data = []
for _, row in better_train.iterrows():
    entry = {
        "prompt": row["prompt2"],
        "completion": row["output"]
    }
    jsonl_data.append(entry)

output_file = "fine_tuning_better_train_data.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")

In [None]:
jsonl_data = []
for _, row in test_data.iterrows():
    entry = {
        "prompt": row["prompt2"],
        "completion": row["output"]
    }
    jsonl_data.append(entry)

output_file = "fine_tuning_test_data.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    for entry in jsonl_data:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")