In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [3]:
from pathlib import Path
import csv
import glob
import random

path_to_data = Path('drive/MyDrive/quotes')

def write_to_csv(data, filename):
    with open(filename, 'w', newline='') as csvfile:
        fieldnames = ['text', 'categories']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for quote, categories in data.items():
            writer.writerow({'text': quote, 'categories': ';'.join(categories)})

def split_data(data, train=0.7, val=0.15, test=0.15):
    total = len(data)
    train_count = int(total * train)
    val_count = int(total * val)
    test_count = total - train_count - val_count

    items = list(data.items())
    random.shuffle(items)
    train_data = dict(items[:train_count])
    val_data = dict(items[train_count:train_count + val_count])
    test_data = dict(items[train_count + val_count:])
    return train_data, val_data, test_data

quote_dict = {}

for filename in glob.glob(str(path_to_data / '*.txt')):
    category = Path(filename).stem
    with open(filename, 'r') as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip().replace("“", "").replace("”", "")
        if line in quote_dict:
            quote_dict[line].append(category)
        else:
            quote_dict[line] = [category]

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

train_data, val_data, test_data = split_data(quote_dict, train=train_ratio, val=val_ratio, test=test_ratio)

write_to_csv(train_data, str(path_to_data / 'train_quotes.csv'))
write_to_csv(val_data, str(path_to_data / 'val_quotes.csv'))
write_to_csv(test_data, str(path_to_data / 'test_quotes.csv'))


In [4]:
summary = {
    'total' : len(quote_dict),
    'train' : len(train_data),
    'val' : len(val_data),
    'test' : len(test_data)
}

import pandas as pd

print(pd.DataFrame(summary, index=[0]).to_string(index=False))

 total  train  val  test
 63927  51141 6392  6394


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
.vector_cache/glove.twitter.27B.zip: 1.52GB [04:45, 5.32MB/s]                            
100%|█████████▉| 1193513/1193514 [00:55<00:00, 21579.64it/s]


spiritual: Some people are angels sent to you by God at your weakest moments.
love: True love is rare, and it's the only thing that gives life real meaning.
death: The kiss. Oh, the kiss. What a perfect, unnerving, luscious kiss. He made me feel unhinged . . . like he could take me apart and put me back together again and again.
love: Be a bit of a challenge; not because you're playing games but because you realize you're worth the extra effort.
happiness;inspirational;knowledge: My only regrets are the moments when i doubted myself and took the safe route. Life is too short to waste time being unhappy.
death: Death is part of who we are. It guides us. It shapes us. It drives us to madness. Can you still be human if you have no mortal end
inspirational: Now you've done it all,certified people with your life tell me your results what is your benefits?
death: Too much of water hast thou poor Ophelia, and therefore I forbid my tears. But yet it is our trick, let shame say what it will. wh

AttributeError: ignored

In [None]:
dataset[0][0].size()

In [7]:
from collections import Counter

def count_label_combinations(dataset):
    label_combinations = Counter()
    idx_to_category = dataset.idx_to_category

    for category_tensor in dataset.categories:
        label_indices = tuple(category_tensor.nonzero(as_tuple=True)[0].cpu().numpy())
        if not label_indices:
            continue
        label_combinations[label_indices] += 1

    label_combinations_readable = {
        tuple(idx_to_category[idx] for idx in combination): count
        for combination, count in label_combinations.items()
    }

    return label_combinations_readable



label_combinations = count_label_combinations(dataset)

# Sort by count
label_combinations = {k: v for k, v in sorted(label_combinations.items(), key=lambda item: item[1], reverse=True)}
for combination, count in label_combinations.items():
    print(f"{combination}: {count}")


('inspirational',): 6139
('spiritual',): 6127
('love',): 5527
('knowledge',): 4009
('literary',): 3692
('wisdom',): 3174
('humor',): 1888
('time',): 1772
('death',): 1770
('science',): 1717
('truth',): 1557
('hope',): 1505
('happiness',): 1453
('philosophy',): 1389
('success',): 1303
('knowledge', 'inspirational'): 1058
('success', 'inspirational'): 591
('inspirational', 'spiritual'): 295
('hope', 'spiritual'): 223
('spiritual', 'science'): 177
('wisdom', 'spiritual'): 151
('wisdom', 'inspirational'): 149
('knowledge', 'love'): 137
('spiritual', 'philosophy'): 137
('inspirational', 'spiritual', 'happiness'): 136
('literary', 'inspirational'): 136
('time', 'inspirational'): 133
('love', 'inspirational'): 130
('literary', 'love'): 128
('love', 'humor'): 114
('knowledge', 'death'): 111
('hope', 'inspirational'): 106
('inspirational', 'happiness'): 102
('truth', 'spiritual'): 100
('truth', 'wisdom'): 97
('knowledge', 'spiritual'): 91
('wisdom', 'philosophy'): 90
('knowledge', 'wisdom'): 89