1. Load all the necessary functions

In [35]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from sklearn.model_selection import train_test_split
import requests
import torch.nn as nn
import torch.optim as optim

2. Import the CNN/Daily Mail dataset from https://huggingface.co/datasets/cnn_dailymail

In [None]:
# follow the instruction from https://huggingface.co/docs/datasets-server/quick_start
# Ziyu Geng

API_URL = "https://datasets-server.huggingface.co/splits?dataset=cnn_dailymail"  # contains train, val, and test


def query(url=API_URL, timeout=30):
    """Fetch the list of splits available for the cnn_dailymail dataset.

    Args:
        url: Endpoint to request. The default is bound to the splits URL when
            this function is defined, so reassigning ``API_URL`` afterwards
            does not redirect this function to another endpoint.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of later on a missing key.
    response.raise_for_status()
    return response.json()


data = query()
print(data)

# train
API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=train&offset=0&limit=100"  # train


def train_query(url=API_URL, timeout=30):
    """Fetch the first 100 rows of the train split.

    Args:
        url: Endpoint to request. The default is bound to the train URL when
            this function is defined, so reassigning ``API_URL`` afterwards
            does not make this function return another split.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of later on a missing 'rows' key.
    response.raise_for_status()
    return response.json()


train_data = train_query()
print(train_data)

# val
API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=validation&offset=0&limit=100"  # validation


def val_query(url=API_URL, timeout=30):
    """Fetch the first 100 rows of the validation split.

    Args:
        url: Endpoint to request. The default is bound to the validation URL
            when this function is defined, so reassigning ``API_URL``
            afterwards does not make this function return another split.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of later on a missing 'rows' key.
    response.raise_for_status()
    return response.json()


val_data = val_query()
print(val_data)

# test
API_URL = "https://datasets-server.huggingface.co/rows?dataset=cnn_dailymail&config=1.0.0&split=test&offset=0&limit=100"  # test


def test_query(url=API_URL, timeout=30):
    """Fetch the first 100 rows of the test split.

    Args:
        url: Endpoint to request. The default is bound to the test URL when
            this function is defined, so reassigning ``API_URL`` afterwards
            does not make this function return another split.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with the HTTP status instead of later on a missing 'rows' key.
    response.raise_for_status()
    return response.json()


test_data = test_query()
print(test_data)

In [None]:
####### this is the english version if we find an english version of the dataset
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download the NLTK resources used by the tokenizer and stopword lookups below.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

#1 tokenize text to individual words or subwords, remove stopwords, punctuation and special characters, and make them all lowercase

_STOP_WORD_SETS = {}  # language name -> frozenset of stopwords, filled lazily


def _stop_words_for(language):
    """Return the NLTK stopword set for ``language``, loading it only once."""
    if language not in _STOP_WORD_SETS:
        _STOP_WORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_SETS[language]


def preprocess_text(text, language='english'):
    """Turn raw text into a list of cleaned, lowercase word tokens.

    Each token produced by ``word_tokenize`` has its ASCII punctuation
    (``string.punctuation``) stripped and is lowercased. Tokens that are
    stopwords for ``language`` or are shorter than two characters after
    cleaning are dropped.

    Args:
        text: The raw text to tokenize.
        language: Language name passed to both ``word_tokenize`` and
            ``stopwords.words``. Defaults to ``'english'``.

    Returns:
        A new list of cleaned tokens in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    # Reuse the cached stopword set instead of rebuilding it on every call.
    stop_words = _stop_words_for(language)

    cleaned = []
    for token in word_tokenize(text, language=language):
        word = token.translate(punctuation_table).lower()
        # Stripping punctuation can leave empty or one-character leftovers.
        if len(word) > 1 and word not in stop_words:
            cleaned.append(word)
    return cleaned

#2 pad/truncate text to make all samples have the same length

def pad_or_truncate_text(text, max_length):
    """Preprocess ``text`` and force the token list to ``max_length`` items.

    Token lists longer than ``max_length`` are cut with a slice; shorter
    ones are filled at the end with ``'<PAD>'`` tokens.

    Args:
        text: Raw text handed to ``preprocess_text``.
        max_length: Target number of tokens.

    Returns:
        The list of tokens after truncation or padding.
    """
    tokens = preprocess_text(text)
    shortfall = max_length - len(tokens)

    # Too long: keep only the leading tokens.
    if shortfall < 0:
        return tokens[:max_length]

    # Too short (or exact): append as many pad tokens as are missing.
    return tokens + ['<PAD>'] * shortfall

# Fixed number of tokens every article is padded or truncated to.
max_length = 800  # <-----should be roughly how many words/tokens are in each article

# Preprocess every fetched training article to max_length tokens and print the result.
for example in train_data['rows']:
    # Each entry of 'rows' holds the article text under ['row']['article'].
    article_text = example['row']['article']
    preprocessed_text = pad_or_truncate_text(article_text, max_length)
    print(preprocessed_text)





In [None]:
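##### This is the Dutch version. #############
# NOTE: this cell redefines preprocess_text and pad_or_truncate_text from the
# English cell above, so whichever of the two cells ran last is the one in effect.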

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download the NLTK resources used by the tokenizer and stopword lookups below.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

#1 tokenize text to individual words or subwords, remove stopwords, punctuation and special characters, and make them all lowercase

_STOP_WORD_SETS = {}  # language name -> frozenset of stopwords, filled lazily


def _stop_words_for(language):
    """Return the NLTK stopword set for ``language``, loading it only once."""
    if language not in _STOP_WORD_SETS:
        _STOP_WORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_SETS[language]


def preprocess_text(text, language='dutch'):
    """Turn raw text into a list of cleaned, lowercase word tokens.

    Each token produced by ``word_tokenize`` has its ASCII punctuation
    (``string.punctuation``) stripped and is lowercased. Tokens that are
    stopwords for ``language`` or are shorter than two characters after
    cleaning are dropped.

    Args:
        text: The raw text to tokenize.
        language: Language name passed to both ``word_tokenize`` and
            ``stopwords.words``. Defaults to ``'dutch'``.

    Returns:
        A new list of cleaned tokens in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    # Reuse the cached stopword set instead of rebuilding it on every call.
    stop_words = _stop_words_for(language)

    cleaned = []
    for token in word_tokenize(text, language=language):
        word = token.translate(punctuation_table).lower()
        # Stripping punctuation can leave empty or one-character leftovers.
        if len(word) > 1 and word not in stop_words:
            cleaned.append(word)
    return cleaned

#2 pad/truncate text to make all samples have the same length

def pad_or_truncate_text(text, max_length):
    """Preprocess ``text`` and force the token list to ``max_length`` items.

    Token lists longer than ``max_length`` are cut with a slice; shorter
    ones are filled at the end with ``'<PAD>'`` tokens.

    Args:
        text: Raw text handed to ``preprocess_text``.
        max_length: Target number of tokens.

    Returns:
        The list of tokens after truncation or padding.
    """
    tokens = preprocess_text(text)
    token_count = len(tokens)

    if token_count > max_length:
        # Too long: keep only the leading tokens.
        tokens = tokens[:max_length]
    elif token_count < max_length:
        # Too short: append one pad token per missing position.
        tokens += ['<PAD>'] * (max_length - token_count)

    return tokens

# Fixed number of tokens every article is padded or truncated to.
max_length = 800  # <-----should be roughly how many words/tokens are in each article

# Preprocess every fetched training article to max_length tokens and print the result.
for example in train_data['rows']:
    # Each entry of 'rows' holds the article text under ['row']['article'].
    article_text = example['row']['article']
    preprocessed_text = pad_or_truncate_text(article_text, max_length)
    print(preprocessed_text)





In [None]:
# Represent text as word embeddings

# - Word embedding examples are Word2Vec, GloVe, and FastText; each represents a word as a dense vector
# Ziyu Geng, following https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html.

# Sorry guys, I did not find PyTorch material for Word2Vec, so I did not use it; if you find any errors, feel free to correct them.

embedding_dim = 10 # same number from https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
torch.manual_seed(1)

# list of train texts: every article is tokenized and padded/truncated exactly once
train_preprocessed_text = [
    pad_or_truncate_text(example['row']['article'], max_length)
    for example in train_data['rows']
]

# build vocabulary from the already-preprocessed texts instead of
# running the whole preprocessing pipeline a second time
voc = set().union(*train_preprocessed_text)

# make word to number
# Iteration order of a set of strings can change between interpreter runs
# (string hashing is randomized by default), so sort the vocabulary to give
# every word the same index on every run.
word_to_num = {word: index for index, word in enumerate(sorted(voc))}

class wordembedding(nn.Module):
    """Thin wrapper around a single ``nn.Embedding`` lookup table."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table.

        Args:
            vocab_size: Number of rows in the table (one per word index).
            embedding_dim: Length of the vector stored for each word.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, inputs):
        """Return the embedding vectors for the word indices in ``inputs``."""
        looked_up = self.embeddings(inputs)
        return looked_up

# Create the model and the optimizer
word_model = wordembedding(len(voc), embedding_dim)
optimizer = optim.SGD(word_model.parameters(), lr=0.001)

# loop train_preprocessed_text and generate embeddings
for sentence in train_preprocessed_text:
  # Map every token to its vocabulary index. The comprehension keeps its
  # temporaries local, so no top-level variable named `id` is created and the
  # builtin id() stays usable in later cells.
  sentence_idxs = torch.tensor(
      [word_to_num[word] for word in sentence], dtype=torch.long
  )

  # No backward() is called in this loop yet; the reset is kept so that
  # gradients start clean once a loss and training step are added.
  word_model.zero_grad()
  embeddings = word_model(sentence_idxs)

  print(embeddings)


In [None]:
# define NN architecture

# Some suggested options are RNN, LSTM, or Transformer-based models like BERT or GPT; I'm pretty sure we can use BERT because it's a pre-trained model?



In [None]:
# train model