# FineTune - Debate Bots (Preprocessing Notebook) 

### Dependencies

In [13]:
import os
import re
import pandas as pd
import numpy as np
import json

### Filepaths

In [2]:
RAW_DF_FILEPATH = os.path.join("Data","debate_dataset_40000.csv")
TRAIN_DF_FILEPATH = os.path.join("Finegrained_Data","train.jsonl")
TEST_DF_FILEPATH = os.path.join("Finegrained_Data","test.jsonl")
VAL_DF_FILEPATH = os.path.join("Finegrained_Data","val.jsonl")

### Preparing the Data

In [3]:
df = pd.read_csv(RAW_DF_FILEPATH)
df.set_index(df["topic_id"])
df.drop(columns=["topic_id"],inplace = True)
df.head()

Unnamed: 0,topic,side,argument
0,AI regulation #1,pro,Helps reduce negative impacts related to AI re...
1,AI regulation #1,pro,Addresses urgent issues in AI regulation.
2,AI regulation #1,pro,Necessary for long-term sustainability of AI r...
3,AI regulation #1,pro,Necessary for long-term sustainability of AI r...
4,AI regulation #1,pro,Increases public trust in AI regulation.


In [11]:
df.describe()

Unnamed: 0,topic,side,argument
count,400000,400000,400000
unique,20,2,400
top,AI regulation,pro,Necessary for long-term sustainability of Cryp...
freq,20000,200000,1082


In [4]:
df["topic"] = df["topic"].map(lambda x: re.sub("[#0-9]","",x))
df.head()

Unnamed: 0,topic,side,argument
0,AI regulation,pro,Helps reduce negative impacts related to AI re...
1,AI regulation,pro,Addresses urgent issues in AI regulation.
2,AI regulation,pro,Necessary for long-term sustainability of AI r...
3,AI regulation,pro,Necessary for long-term sustainability of AI r...
4,AI regulation,pro,Increases public trust in AI regulation.


In [5]:
records = []
for _,row in df.iterrows():
    topic = row["topic"]
    side = row["side"]
    args = row["argument"]

    prompt = f"You are a professional debater,Generate a {side} arguement on the topic: {topic}"
    records.append({"text":prompt,"output":args})
records[:5]

[{'text': 'You are a professional debater,Generate a pro arguement on the topic: AI regulation ',
  'output': 'Helps reduce negative impacts related to AI regulation.'},
 {'text': 'You are a professional debater,Generate a pro arguement on the topic: AI regulation ',
  'output': 'Addresses urgent issues in AI regulation.'},
 {'text': 'You are a professional debater,Generate a pro arguement on the topic: AI regulation ',
  'output': 'Necessary for long-term sustainability of AI regulation.'},
 {'text': 'You are a professional debater,Generate a pro arguement on the topic: AI regulation ',
  'output': 'Necessary for long-term sustainability of AI regulation.'},
 {'text': 'You are a professional debater,Generate a pro arguement on the topic: AI regulation ',
  'output': 'Increases public trust in AI regulation.'}]

In [8]:
len(records)

400000

In [12]:
train = records[:280000]
test = records[280000:360000]
val = records[360000:]

print(f"train: {len(train)}")
print(f"test: {len(test)}")
print(f"val: {len(val)}")

train: 280000
test: 80000
val: 40000


### Persisting the new Data

In [14]:
with open(TRAIN_DF_FILEPATH,"w") as filepath:
    for entry in train:
        filepath.write(json.dumps(entry) + "\n")

with open(TEST_DF_FILEPATH,"w") as filepath:
    for entry in test:
        filepath.write(json.dumps(entry) + "\n")


with open(VAL_DF_FILEPATH,"w") as filepath:
    for entry in val:
        filepath.write(json.dumps(entry) + "\n")