In [8]:
from pathlib import Path
import csv
import glob
import random

# Directory that holds the per-category quote files (*.txt) read below;
# the generated train/val/test CSV files are written into this directory too.
path_to_data = Path('Data/CleanedData/')

def write_to_csv(data, filename):
    """Write a quote -> categories mapping to a two-column CSV file.

    Args:
        data: Mapping whose keys are quote strings and whose values are
            iterables of category-name strings.
        filename: Destination path (str or path-like). An existing file is
            overwritten.

    The file has a header row ``text,categories``. Each quote's categories
    are joined with ``;`` into a single cell, so a category name that itself
    contains ``;`` cannot be told apart when the file is read back.
    """
    # Explicit UTF-8 so quotes with non-ASCII characters are written the same
    # way on every platform instead of depending on the locale's default.
    # newline='' is what the csv module requires to control line endings.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['text', 'categories'])
        writer.writeheader()
        writer.writerows(
            {'text': quote, 'categories': ';'.join(categories)}
            for quote, categories in data.items()
        )

def split_data(data, train=0.7, val=0.15, test=0.15, seed=None):
    """Randomly partition a mapping into train, validation and test mappings.

    Args:
        data: Mapping to split (e.g. quote -> list of categories).
        train: Fraction of the items assigned to the training split.
        val: Fraction of the items assigned to the validation split.
        test: Accepted for call-site symmetry only. The test split always
            receives every item not taken by train/val, so rounding
            leftovers land in test and no item is dropped.
        seed: Optional seed for the shuffle. With a seed, the same input
            always yields the same split and the global ``random`` state is
            left untouched. With ``None`` (default) the module-level
            ``random`` generator is used, as before.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of dicts that are
        pairwise disjoint and together contain every item of ``data``.

    Raises:
        ValueError: If ``train`` or ``val`` is negative, or ``train + val``
            exceeds 1.
    """
    # Small tolerance so float sums such as 0.9 + 0.1 are not rejected.
    if train < 0 or val < 0 or train + val > 1 + 1e-9:
        raise ValueError(
            'train and val must be non-negative fractions with train + val <= 1'
        )

    total = len(data)
    train_count = int(total * train)
    val_end = train_count + int(total * val)

    items = list(data.items())
    if seed is None:
        random.shuffle(items)
    else:
        # A private generator keeps seeded calls from disturbing global state.
        random.Random(seed).shuffle(items)

    train_data = dict(items[:train_count])
    val_data = dict(items[train_count:val_end])
    test_data = dict(items[val_end:])
    return train_data, val_data, test_data

# Maps each unique quote text to the list of categories it appears under.
quote_dict = {}

# sorted() makes the file order -- and with it the insertion order of
# quote_dict and the order of each category list -- independent of the OS.
for filename in sorted(glob.glob(str(path_to_data / '*.txt'))):
    # The file name without its extension is used as the category label.
    category = Path(filename).stem
    # NOTE(review): assumes the cleaned files are UTF-8 encoded -- confirm.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop curly quotation marks, then surrounding whitespace.
            line = line.replace("“", "").replace("”", "").strip()
            if not line:
                # Blank lines would otherwise become an empty-string "quote".
                continue
            categories = quote_dict.setdefault(line, [])
            # A quote repeated inside one file must not list the category twice.
            if category not in categories:
                categories.append(category)

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Seed the shuffle so that re-running the notebook regenerates the same
# train/val/test files (given quote_dict is built in the same order).
split_seed = 42
random.seed(split_seed)

train_data, val_data, test_data = split_data(quote_dict, train=train_ratio, val=val_ratio, test=test_ratio)

# Every quote must land in exactly one split.
assert len(train_data) + len(val_data) + len(test_data) == len(quote_dict)

write_to_csv(train_data, path_to_data / 'train_quotes.csv')
write_to_csv(val_data, path_to_data / 'val_quotes.csv')
write_to_csv(test_data, path_to_data / 'test_quotes.csv')


In [9]:
import pandas as pd

# Number of unique quotes overall and in each of the three splits.
summary = dict(
    total=len(quote_dict),
    train=len(train_data),
    val=len(val_data),
    test=len(test_data),
)

# One-row table; the index is hidden so only the counts are printed.
summary_table = pd.DataFrame([summary])
print(summary_table.to_string(index=False))

 total  train  val  test
 63927  51141 6392  6394
