# Data Generation using LLM

In [11]:
from langchain.prompts import FewShotPromptTemplate,PromptTemplate
from langchain.chat_models import ChatOpenAI
import openai
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import numpy as np
from openai import OpenAI

In [42]:
def get_completion(prompt: str, llm_model="gpt-3.5-turbo"):
    openai.api_key = "sk-proj-iFvvNvmeLnKPefr7Kfv6T3BlbkFJesMmrNffwDul08FUJ6C3"
    client = OpenAI(api_key=openai.api_key)
    messages = [{"role":"user","content":prompt}]
    response =client.chat.completions.create(model=llm_model,
                                           messages = messages,
                                           )
    return response.choices[0].message.content.strip()

In [118]:
def generate_text_data(data: pd.DataFrame, 
                       category:str,
                       use_samples: int,
                       number_of_tweets: int,
                      llm_model="gpt-4o-mini"):
    data = data[data["tweet"]!=""]
    cat_data = data[data["Disorder"]==category]
    cat_data.reset_index(drop=True,inplace=True)
    samples = np.random.choice(range(len(cat_data)),use_samples,replace=False)
    cat_data = cat_data.filter(items=samples, axis=0)
    tweets = cat_data["tweet"].values.tolist()
    tweets = "\n".join(tweets)
    prompt = f"""Given the following tweets:\n\n{tweets}\n\nGenerate `{number_of_tweets}` more unique tweets using above tweets. Generated `{number_of_tweets}` tweets should be different from above tweets.`Do not add any extra text except tweets`"""
    prompt = prompt.replace("{", "{{").replace("}", "}}")
    prompt_template = PromptTemplate(input_variables=["tweet"], template=prompt)
    response = get_completion(prompt,llm_model=llm_model)
    return response

In [44]:
data_path = Path.cwd().parent/"Data"
final_data = pd.DataFrame()

In [45]:
for child in data_path.iterdir():
    for child_ch in child.iterdir():
        if child_ch.is_dir():
            files = list(child_ch.glob("**/*.csv"))
            for fls in tqdm(range(len(files))):
                df = pd.read_csv(files[fls])
        else:
            df = pd.read_csv(child_ch)
        final_data = pd.concat([df,final_data],axis = 0)

100%|████████████████████████████████████████████████████████████████████| 622/622 [00:06<00:00, 89.76it/s]
100%|████████████████████████████████████████████████████████████████████| 124/124 [00:01<00:00, 95.57it/s]
100%|████████████████████████████████████████████████████████████████████| 170/170 [00:01<00:00, 94.42it/s]
100%|████████████████████████████████████████████████████████████████████| 136/136 [00:01<00:00, 98.70it/s]
100%|██████████████████████████████████████████████████████████████████| 1703/1703 [00:25<00:00, 65.77it/s]
100%|███████████████████████████████████████████████████████████████████| 249/249 [00:02<00:00, 100.39it/s]
100%|██████████████████████████████████████████████████████████████████████| 26/26 [00:00<00:00, 84.54it/s]
100%|██████████████████████████████████████████████████████████████████████| 65/65 [00:00<00:00, 78.10it/s]
100%|████████████████████████████████████████████████████████████████████| 127/127 [00:01<00:00, 89.19it/s]
100%|███████████████████████

In [46]:
final_data = final_data[["class","tweet"]]
final_data.rename(columns={"class":"Disorder"}, inplace=True)
final_data.head()

Unnamed: 0,Disorder,tweet
0,SCHIZOPHRENIA,"""Sally 🤍🤍 we wont forget Angel. HTTPURL"""
1,SCHIZOPHRENIA,"""@USER Personally my life is split in two, eve..."
2,SCHIZOPHRENIA,"""@USER I can envisage your sufferance look sur..."
3,SCHIZOPHRENIA,"""@USER @USER If you just take more responsibil..."
4,SCHIZOPHRENIA,"""@USER Im so sorry, so angry and so want to sl..."


# Binary-class samples

In [9]:
binary_samples = (final_data["Disorder"].value_counts().sum()-final_data["Disorder"].value_counts()["CONTROL"]) - \
                                                        final_data["Disorder"].value_counts()["CONTROL"]
use_samples = 200
generate_samples = 100
for samples in tqdm(range(binary_samples//generate_samples)):
    generated_data = generate_text_data(final_data,"CONTROL",use_samples,generate_samples)
    generated_df = pd.DataFrame(data={"tweet":generated_data.split("\n")})
    generated_df["Disorder"]="CONTROL"
    final_data = pd.concat([final_data,generated_df], axis=0)
    final_data = final_data.sample(len(final_data))

In [15]:
final_data.Disorder.value_counts()

Disorder
ADHD               3034
SCHIZOPHRENIA      2970
OCD                2905
ANXIETY            2729
CONTROL            2631
PTSD               2466
DEPRESSION         2161
AUTISM             1425
EATING DISORDER     403
BIPOLAR             244
Name: count, dtype: int64

# Multi-class samples

In [114]:
final_data = x[x["Disorder"].isin(["AUTISM"])]
final_data.Disorder.value_counts()

Disorder
AUTISM    1509
Name: count, dtype: int64

In [119]:
num_samples = 5000
multi_class_samples = final_data["Disorder"].value_counts().to_dict()
out_of_band_samples = dict()
for cls,samples in multi_class_samples.items():
    if cls!="CONTROL":
        print(cls)
        out_of_band_samples.update({cls:num_samples-multi_class_samples[cls]})
        for samples in tqdm(range(out_of_band_samples[cls]//generate_samples)):
            generated_data = generate_text_data(final_data,cls,use_samples,generate_samples)
            generated_df = pd.DataFrame(data={"tweet":generated_data.split("\n")})
            generated_df["Disorder"]=cls
            final_data = pd.concat([final_data,generated_df], axis=0)
            final_data = final_data.sample(len(final_data)) 

AUTISM


100%|██████████████████████████████████████████████████████████████████████| 28/28 [09:46<00:00, 20.94s/it]


In [120]:
out_of_band_samples

{'AUTISM': 2840}

In [121]:
x = pd.read_csv("multi_class_data_balanced.csv")
final_data.reset_index(drop=True,inplace=True)
y = pd.concat([x,final_data], axis=0)
# y = y[y["Disorder"]!="AUTISM"]
y = y.sample(len(y))

In [102]:
final_data

Unnamed: 0,Disorder,tweet
20,EATING DISORDER,"21. ""What if I started each day with a heart f..."
27,EATING DISORDER,"28. ""What if I allowed myself to be a work in ..."
89,BIPOLAR,"90. ""What’s something you enjoy doing alone? 🌌"""
172,EATING DISORDER,"""relapsing all January"""
11,BIPOLAR,"12. ""What’s something that always makes you sm..."
...,...,...
72,EATING DISORDER,"73. ""How do I engage in self-reflection to fos..."
45,EATING DISORDER,"46. ""What if I used the lessons from my past t..."
2,EATING DISORDER,"3. ""What if every challenge is a chance to red..."
30,EATING DISORDER,"""What if my happy moments are just distraction..."


In [124]:
# y.to_csv("multi_class_data_balanced.csv",index=False)

In [98]:
# final_data.to_csv("control_data.csv",index=False)

In [123]:
y.Disorder.value_counts()

Disorder
EATING DISORDER    5285
BIPOLAR            5189
SCHIZOPHRENIA      4970
PTSD               4945
AUTISM             4883
OCD                4860
ADHD               4823
DEPRESSION         4823
ANXIETY            4717
CONTROL            2532
Name: count, dtype: int64

In [113]:
x.Disorder.value_counts()

Disorder
SCHIZOPHRENIA      4970
PTSD               4945
OCD                4860
DEPRESSION         4823
ADHD               4823
ANXIETY            4717
CONTROL            2532
AUTISM             1509
EATING DISORDER     403
BIPOLAR             244
Name: count, dtype: int64

In [110]:
y

Unnamed: 0,Disorder,tweet
0,DEPRESSION,"""Its official! Im going to get to watch the BN..."
1,CONTROL,"""A bigger racismo club than chelski HTTPURL HT..."
2,ANXIETY,"14. ""Planning a picnic this weekend! What snac..."
3,SCHIZOPHRENIA,"""@USER This is exactly what I do, the polar op..."
4,PTSD,"63. ""It’s a journey to learn that our value is..."
...,...,...
9822,EATING DISORDER,"73. ""How do I engage in self-reflection to fos..."
9823,EATING DISORDER,"46. ""What if I used the lessons from my past t..."
9824,EATING DISORDER,"3. ""What if every challenge is a chance to red..."
9825,EATING DISORDER,"""What if my happy moments are just distraction..."


In [112]:
x

Unnamed: 0,Disorder,tweet
0,DEPRESSION,"""Its official! Im going to get to watch the BN..."
1,CONTROL,"""A bigger racismo club than chelski HTTPURL HT..."
2,ANXIETY,"14. ""Planning a picnic this weekend! What snac..."
3,SCHIZOPHRENIA,"""@USER This is exactly what I do, the polar op..."
4,PTSD,"63. ""It’s a journey to learn that our value is..."
...,...,...
33821,ANXIETY,"""@USER I WANT TO DO EVERYTHING ALL AT ONCE"""
33822,OCD,"36. ""Pushing boundaries in my creative work ke..."
33823,ANXIETY,"""@USER @USER There was no way I was about to l..."
33824,PTSD,"""I get soo much work, all kinds, from people t..."
