In [1]:
# mounting drive
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

# turning of warnings
import warnings
warnings.filterwarnings("ignore")
import pprint
import os

# changing working directory to repositories
os.chdir('/content/drive/MyDrive/repositories/Generative-AI')

from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

Mounted at /content/drive


In [2]:
# !pip install transformers torch datasets

In [3]:
!pip install py7zr



# Loading Datasets From Hugging Face

In [4]:
from datasets import load_dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")
dataset

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 203
    })
})

In [5]:
dataset['train'][9]

{'act': 'Travel Guide',
 'prompt': 'I want you to act as a travel guide. I will write you my location and you will suggest a place to visit near my location. In some cases, I will also give you the type of places I will visit. You will also suggest me places of similar type that are close to my first location. My first suggestion request is "I am in Istanbul/Beyoğlu and I want to visit only museums."'}

In [7]:
from datasets import load_dataset

# Data Preprocessing Methods

In [8]:
# Reload GPT Prompt dataset (Cache keeps from loading it twice)
dataset = load_dataset("fka/awesome-chatgpt-prompts")
dataset

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 203
    })
})

In [9]:
# print an example
dataset['train'][0]

{'act': 'An Ethereum Developer',
 'prompt': 'Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.'}

In [10]:
# shuffle & sample
dataset = dataset['train'].shuffle(seed=37).select(range(100))
dataset

Dataset({
    features: ['act', 'prompt'],
    num_rows: 100
})

In [11]:
# Create Test Dataset
dataset = dataset.train_test_split(train_size=0.8, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 80
    })
    test: Dataset({
        features: ['act', 'prompt'],
        num_rows: 20
    })
})

# Creating Your Own Dataset

In [12]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz

--2025-09-28 11:46:04--  https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘reuters21578.tar.gz’

reuters21578.tar.gz     [   <=>              ]   7.77M  14.9MB/s    in 0.5s    

2025-09-28 11:46:05 (14.9 MB/s) - ‘reuters21578.tar.gz’ saved [8150596]



In [13]:
!tar -xzvf reuters21578.tar.gz

README.txt
all-exchanges-strings.lc.txt
all-orgs-strings.lc.txt
all-people-strings.lc.txt
all-places-strings.lc.txt
all-topics-strings.lc.txt
cat-descriptions_120396.txt
feldman-cia-worldfactbook-data.txt
lewis.dtd
reut2-000.sgm
reut2-001.sgm
reut2-002.sgm
reut2-003.sgm
reut2-004.sgm
reut2-005.sgm
reut2-006.sgm
reut2-007.sgm
reut2-008.sgm
reut2-009.sgm
reut2-010.sgm
reut2-011.sgm
reut2-012.sgm
reut2-013.sgm
reut2-014.sgm
reut2-015.sgm
reut2-016.sgm
reut2-017.sgm
reut2-018.sgm
reut2-019.sgm
reut2-020.sgm
reut2-021.sgm


In [16]:
# the sgm files are what contains the articles
from bs4 import BeautifulSoup

# Open the file and parse its content with BeautifulSoup
reuters_articles = []
for i in range(22):
    if i < 10:
        i = f"0{i}"
    # load file data
    filename = f"/content/drive/MyDrive/repositories/Generative-AI/reut2-0{i}.sgm"
    with open(filename, 'r'  encoding='latin-1') as file:
        soup = BeautifulSoup(file, "html.parser")

    # Extract articles' titles and bodies
    articles = []
    for reuters in soup.find_all('reuters'):
        title = reuters.title.string if reuters.title else ""
        body = reuters.body.string if reuters.body else ""
        articles.append({
              'title': title,
              'body': body
          })
    reuters_articles.extend(articles)


In [21]:
type(reuters_articles),type(reuters_articles[0])

(list, dict)

In [23]:
import pprint
pprint.pprint(reuters_articles[0])

{'body': 'Showers continued throughout the week in\n'
         'the Bahia cocoa zone, alleviating the drought since early\n'
         'January and improving prospects for the coming temporao,\n'
         'although normal humidity levels have not been restored,\n'
         'Comissaria Smith said in its weekly review.\n'
         '    The dry period means the temporao will be late this year.\n'
         '    Arrivals for the week ended February 22 were 155,221 bags\n'
         'of 60 kilos making a cumulative total for the season of 5.93\n'
         'mln against 5.81 at the same stage last year. Again it seems\n'
         'that cocoa delivered earlier on consignment was included in the\n'
         'arrivals figures.\n'
         '    Comissaria Smith said there is still some doubt as to how\n'
         'much old crop cocoa is still available as harvesting has\n'
         'practically come to an end. With total Bahia crop estimates\n'
         'around 6.4 mln bags and sales standing at alm

In [24]:
# Print out the first few articles for inspection
for i, article in enumerate(reuters_articles[:5]):
  print(article)
  print("-"*100)

{'title': 'BAHIA COCOA REVIEW', 'body': 'Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are do

In [25]:
len(reuters_articles)

21578

In [26]:
import json

TRAIN_PCT, VALID_PCT = 0.8, 0.1

# Split the data
train_articles = reuters_articles[:int(len(reuters_articles)*TRAIN_PCT)]
valid_articles = reuters_articles[int(len(reuters_articles)*TRAIN_PCT): int(len(reuters_articles)*(TRAIN_PCT + VALID_PCT))]
test_articles = reuters_articles[int(len(reuters_articles)*(TRAIN_PCT + VALID_PCT)):]

# Function to save articles as JSONL
def save_as_jsonl(data, filename):
    with open(filename, "w") as f:
        for article in data:
            f.write(json.dumps(article) + "\n")

# Save them into temporary JSONL files
save_as_jsonl(train_articles, "train.jsonl")
save_as_jsonl(valid_articles, "valid.jsonl")
save_as_jsonl(test_articles, "test.jsonl")

In [27]:
# Load them as datasets
data_files = {"train": "train.jsonl", "validation": "valid.jsonl", "test": "test.jsonl"}
dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [29]:
dataset['train'][0]

{'title': 'BAHIA COCOA REVIEW',
 'body': 'Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are d

# Upload Dataset to Hub

In [30]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'body'],
        num_rows: 17262
    })
    validation: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
    test: Dataset({
        features: ['title', 'body'],
        num_rows: 2158
    })
})

In [32]:
dataset.push_to_hub("reuters_articles")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :  95%|#########5| 7.74MB / 8.11MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              : 100%|##########| 1.11MB / 1.11MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              : 100%|##########|  826kB /  826kB            

CommitInfo(commit_url='https://huggingface.co/datasets/thomaskutty13/reuters_articles/commit/9fbcb81f7d808be5375250a9639f1704ea6f27b7', commit_message='Upload dataset', commit_description='', oid='9fbcb81f7d808be5375250a9639f1704ea6f27b7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thomaskutty13/reuters_articles', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thomaskutty13/reuters_articles'), pr_revision=None, pr_num=None)