# Data Wrangling for the Chatbot Dataset

In [1]:
import pandas as pd

In [2]:
# Read both raw chatbot CSV files into DataFrames.
data1, data2 = (
    pd.read_csv(csv_name) for csv_name in ("chatbot2.csv", "chatbot3.csv")
)

In [3]:
# Preview the first rows of data1 as loaded from chatbot2.csv.
data1.head()

Unnamed: 0.1,Unnamed: 0,text
0,0,<s>[INST] What is diabetes mellitus? [/INST] D...
1,1,<s>[INST] What are the classic symptoms of dia...
2,2,<s>[INST] What are the major types of diabetes...
3,3,<s>[INST] What are the long-term complications...
4,4,<s>[INST] How is diabetes treated? [/INST] Typ...


In [4]:
# Preview the first rows of data2 as loaded from chatbot3.csv.
data2.head()

Unnamed: 0.1,Unnamed: 0,question,answer
0,0,"What are some examples of calorie-free, caffei...","Examples of calorie-free, caffeine-free liquid..."
1,1,Why is it important for individuals with diabe...,Staying hydrated is crucial for regulating blo...
2,2,What type of gelatin is recommended for indivi...,Individuals with diabetes should choose gelati...
3,3,In what situations should an individual with d...,Individuals with diabetes should contact their...
4,4,Why is it important for individuals with diabe...,Wearing medical identification alerts others t...


In [5]:
# Keep only the "text" column. The explicit .copy() makes the result an
# independent frame, so the later `data1["final"] = ...` assignment does not
# trigger pandas' SettingWithCopyWarning.
data1 = data1[["text"]].copy()

In [6]:
# Confirm that only the "text" column remains in data1.
data1.head()

Unnamed: 0,text
0,<s>[INST] What is diabetes mellitus? [/INST] D...
1,<s>[INST] What are the classic symptoms of dia...
2,<s>[INST] What are the major types of diabetes...
3,<s>[INST] What are the long-term complications...
4,<s>[INST] How is diabetes treated? [/INST] Typ...


In [7]:
# Keep only the "question" and "answer" columns. The explicit .copy() makes the
# result an independent frame, so the later `data2["final"] = ...` assignment
# does not trigger pandas' SettingWithCopyWarning.
data2 = data2[["question", "answer"]].copy()

In [8]:
# Confirm that only the "question" and "answer" columns remain in data2.
data2.head()

Unnamed: 0,question,answer
0,"What are some examples of calorie-free, caffei...","Examples of calorie-free, caffeine-free liquid..."
1,Why is it important for individuals with diabe...,Staying hydrated is crucial for regulating blo...
2,What type of gelatin is recommended for indivi...,Individuals with diabetes should choose gelati...
3,In what situations should an individual with d...,Individuals with diabetes should contact their...
4,Why is it important for individuals with diabe...,Wearing medical identification alerts others t...


In [9]:
# Join each question with its answer using ":" as the separator.
data2["final"] = data2["question"].str.cat(data2["answer"], sep=":")

In [10]:
# Inspect the first "question:answer" string built from data2.
data2["final"].iloc[0]

'What are some examples of calorie-free, caffeine-free liquids that individuals with diabetes can consume when sick?:Examples of calorie-free, caffeine-free liquids include water, unsweetened tea, and clear broth.'

In [11]:
# Inspect the first raw sample in data1 before any tag removal.
data1["text"].iloc[0]

'<s>[INST] What is diabetes mellitus? [/INST] Diabetes mellitus, often known simply as diabetes, is a group of common endocrine diseases characterized by sustained high blood sugar levels. </s>'

In [12]:
import re

def format_question_answer(text):
    """
    Strip the instruction-template tags and format text as 'question:answer'.

    Parameters
    ----------
    text : str
        A sample shaped like '<s>[INST] question [/INST] answer </s>'.

    Returns
    -------
    str
        'question:answer', with surrounding whitespace removed from both
        parts. A part that cannot be located is returned as an empty string,
        so text without any tags yields ':'.
    """
    # re.DOTALL lets '.' match newlines, so a question or answer that spans
    # several lines is captured instead of silently becoming an empty string.
    question = re.search(r"\[INST\](.*?)\[/INST\]", text, re.DOTALL)
    question = question.group(1).strip() if question else ""

    # Accept the end of the string as a terminator too, so an answer whose
    # closing </s> tag is missing is kept rather than dropped.
    answer = re.search(r"\[/INST\](.*?)(?:</s>|\Z)", text, re.DOTALL)
    answer = answer.group(1).strip() if answer else ""

    # Return formatted string
    return f"{question}:{answer}"

In [13]:
# Derive the 'question:answer' string for every entry of the "text" column.
data1["final"] = data1["text"].map(format_question_answer)
data1.head()

Unnamed: 0,text,final
0,<s>[INST] What is diabetes mellitus? [/INST] D...,"What is diabetes mellitus?:Diabetes mellitus, ..."
1,<s>[INST] What are the classic symptoms of dia...,What are the classic symptoms of diabetes?:Cla...
2,<s>[INST] What are the major types of diabetes...,What are the major types of diabetes?:The majo...
3,<s>[INST] What are the long-term complications...,What are the long-term complications of diabet...
4,<s>[INST] How is diabetes treated? [/INST] Typ...,How is diabetes treated?:Type 1 diabetes is co...


In [14]:
# Inspect the first formatted string produced by format_question_answer.
data1["final"].iloc[0]

'What is diabetes mellitus?:Diabetes mellitus, often known simply as diabetes, is a group of common endocrine diseases characterized by sustained high blood sugar levels.'

In [16]:
# Stack the formatted strings of both datasets under a fresh 0..n-1 index,
# then wrap the resulting Series in a one-column frame named "combined".
combined_col = pd.concat((data1["final"], data2["final"]), ignore_index=True)
new_df = combined_col.to_frame(name="combined")
new_df.head()

Unnamed: 0,combined
0,"What is diabetes mellitus?:Diabetes mellitus, ..."
1,What are the classic symptoms of diabetes?:Cla...
2,What are the major types of diabetes?:The majo...
3,What are the long-term complications of diabet...
4,How is diabetes treated?:Type 1 diabetes is co...


In [17]:
# (rows, columns) of data1, which now holds "text" and "final".
data1.shape

(143, 2)

In [18]:
# (rows, columns) of data2, which now holds "question", "answer" and "final".
data2.shape

(1075, 3)

In [19]:
# (rows, columns) of the combined frame built from both "final" columns.
new_df.shape

(1218, 1)

In [20]:
# Expected row count of the combined frame, computed from the two inputs
# instead of hand-typed numbers so the check stays valid when the data changes.
len(data1) + len(data2)

1218

In [22]:
# Write the single "combined" column to disk, one entry per row, with neither
# the index nor a header line.
# NOTE(review): to_csv applies CSV quoting by default, so entries containing
# commas or double quotes are wrapped in quotes in this .txt file — confirm
# that whatever reads chatbot_dataset.txt expects that.
new_df.to_csv("chatbot_dataset.txt", index=False, header=False)