### __TREC__

Removes the label-fine column in the TREC train data set as it is not needed in the project.  
Splits the data into train and test sets and exports them

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import random

In [2]:
TREC_dir = '../Datasets/TREC_dataset/'
train = f'{TREC_dir}/train.csv'

In [3]:
df = pd.read_csv(train)
df

Unnamed: 0,label-coarse,label-fine,text
0,0,0,How did serfdom develop in and then leave Russ...
1,1,1,What films featured the character Popeye Doyle ?
2,0,0,How can I find a list of celebrities ' real na...
3,1,2,What fowl grabs the spotlight after the Chines...
4,2,3,What is the full form of .com ?
...,...,...,...
5447,1,14,What 's the shape of a camel 's spine ?
5448,1,46,What type of currency is used in China ?
5449,4,41,What is the temperature today ?
5450,4,41,What is the temperature for cooking ?


In [4]:
df = df.drop(columns=['label-fine'])
df

Unnamed: 0,label-coarse,text
0,0,How did serfdom develop in and then leave Russ...
1,1,What films featured the character Popeye Doyle ?
2,0,How can I find a list of celebrities ' real na...
3,1,What fowl grabs the spotlight after the Chines...
4,2,What is the full form of .com ?
...,...,...
5447,1,What 's the shape of a camel 's spine ?
5448,1,What type of currency is used in China ?
5449,4,What is the temperature today ?
5450,4,What is the temperature for cooking ?


In [5]:
# Get list of unique label-coarse
labels = df['label-coarse'].unique() # returns an array([])
labels = list(labels) 
labels

[0, 1, 2, 3, 4, 5]

In [6]:
# select 2 random choices
choices = random.sample(labels, 2)
choices

[4, 2]

In [7]:
new_df = df.copy()
new_df

for index, row in new_df.iterrows():
    label_coarse = new_df.loc[index, 'label-coarse']
    if label_coarse in choices:
        new_df.loc[index, 'label-coarse'] = 'OTHERS'

new_df


  new_df.loc[index, 'label-coarse'] = 'OTHERS'


Unnamed: 0,label-coarse,text
0,0,How did serfdom develop in and then leave Russ...
1,1,What films featured the character Popeye Doyle ?
2,0,How can I find a list of celebrities ' real na...
3,1,What fowl grabs the spotlight after the Chines...
4,OTHERS,What is the full form of .com ?
...,...,...
5447,1,What 's the shape of a camel 's spine ?
5448,1,What type of currency is used in China ?
5449,OTHERS,What is the temperature today ?
5450,OTHERS,What is the temperature for cooking ?


In [8]:
# Split the DataFrame into training and test sets
train_df, test_df = train_test_split(new_df, test_size=0.2, shuffle=True)
train_df_rest, train_df_dev = train_test_split(train_df, test_size=500, shuffle=True)

print("Training set (dev):")
print(train_df_dev)

print("Training set:")
print(train_df_rest)

print("\nTest set:")
print(test_df)

Training set (dev):
     label-coarse                                               text
3911       OTHERS     Hitler came to power in Germany in what year ?
4453            3                   What company produces Spumante ?
3631            0                         Why is hockey so violent ?
427             1                     What kind of animal is Babar ?
444             0                  What does Inuit and Eskimo mean ?
...           ...                                                ...
1556            3                        Who wrote Sons and Lovers ?
1168       OTHERS  What phone number can I call to have a tree pl...
1297            0                        What are amicable numbers ?
2558            5  What country was Brian Boru an 11th-century ki...
385             5  What 's the closest G2 Spectrum Yellow Dwarf t...

[3861 rows x 2 columns]
Training set:
     label-coarse                                               text
1552            5  What African country is g

In [9]:
path = '../Datasets/Processed/TREC/'

# Export DataFrame to a CSV file
train_df_rest.to_csv(f'{path}train.csv', index=False)
train_df_dev.to_csv(f'{path}train.dev.csv', index=False)
test_df.to_csv(f'{path}test.csv', index=False)