In [30]:
import os

import gensim
import gensim.downloader
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms

# Show which PyTorch release is installed in the running kernel.
print("PyTorch version:", torch.__version__)

PyTorch version: 2.0.1


In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device:", device)

device: cpu


## Step 1: Load datasets

In [3]:
# Load dataset.
def load_dataset(dataset_path: str) -> pd.DataFrame:
    """Read one dataset split from a CSV file.

    Args:
        dataset_path: Location of the CSV file to read.

    Returns:
        The file's contents without the 'id' column and without any row
        that holds at least one missing value. Remaining rows keep the
        index labels they had when read.

    Raises:
        KeyError: If the file has no 'id' column.
    """
    raw = pd.read_csv(dataset_path)
    without_id = raw.drop(columns=['id'])
    return without_id.dropna()

In [4]:
# Training split: read the CSV via load_dataset, then report the row count.
train_csv_path = r'../data/cnn_dailymail/train.csv'
df_train = load_dataset(train_csv_path)
print("Number of records in training set:", df_train.shape[0])

Number of records in training set: 287113


In [5]:
# Validation split: read the CSV via load_dataset, then report the row count.
val_csv_path = r'../data/cnn_dailymail/validation.csv'
df_val = load_dataset(val_csv_path)
print("Number of records in validation set:", df_val.shape[0])

Number of records in validation set: 13368


In [6]:
# Test split: read the CSV via load_dataset, then report the row count.
test_csv_path = r'../data/cnn_dailymail/test.csv'
df_test = load_dataset(test_csv_path)
print("Number of records in test set:", df_test.shape[0])

Number of records in test set: 11490


## Step 2: Data pre-processing

In [7]:
def clean_highlights(highlights: pd.Series) -> pd.Series:
    """Normalise the whitespace of a 'highlights' text column.

    Two literal (non-regex) substitutions are applied in order:

    1. every newline character becomes a single space;
    2. every " ." (a space followed by a period) becomes ".".

    The earlier version of this cell passed a regex-escaped period together
    with ``regex=False``, so it searched for the literal characters space,
    backslash, period and never matched " .". The space before periods was
    therefore left in place. Searching for the literal " ." fixes that.

    Args:
        highlights: String column to clean.

    Returns:
        A new Series with the substitutions applied; the input is not modified.
    """
    return (
        highlights
        .str.replace('\n', ' ', regex=False)  # newline -> space
        .str.replace(' .', '.', regex=False)  # drop the space before a period
    )


# Apply the same cleaning to all three splits.
df_train['highlights'] = clean_highlights(df_train['highlights'])
df_val['highlights'] = clean_highlights(df_val['highlights'])
df_test['highlights'] = clean_highlights(df_test['highlights'])

In [36]:
# Pretrained GloVe word vectors and their vocabulary.
wv = gensim.downloader.load('glove-wiki-gigaword-200') # word vectors
vocab = np.array(wv.index_to_key)
embedding = np.array(wv.vectors)

# Rows for the two special tokens. np.zeros defaults to float64, which would
# make np.vstack upcast a float32 matrix, so reuse the matrix's own dtype.
pad_emb = np.zeros((1, embedding.shape[1]), dtype=embedding.dtype)  # embedding for '<pad>'.
unk_emb = np.mean(embedding, axis=0, keepdims=True)                 # embedding for '<unk>'.

# Row i of `embedding` must be the vector of vocab[i]. The special tokens are
# therefore prepended to `vocab` in exactly the order their rows are stacked:
# '<pad>' at index 0, '<unk>' at index 1. (Two successive np.insert(..., 0, ...)
# calls would leave '<unk>' at index 0 and '<pad>' at index 1, swapping the two
# vectors.) np.concatenate also widens the string dtype if needed, so the
# special tokens cannot be truncated.
vocab = np.concatenate((np.array(['<pad>', '<unk>']), vocab))
embedding = np.vstack((pad_emb, unk_emb, embedding))

# Guard the vocab/embedding alignment that downstream lookups rely on.
assert vocab.shape[0] == embedding.shape[0], "vocab and embedding are misaligned"
assert vocab[0] == '<pad>' and vocab[1] == '<unk>'



In [37]:
# Persist the vocabulary and the embedding matrix as .npy files.
# Create the output directory first: open(..., 'wb') raises FileNotFoundError
# when the parent directory does not exist.
os.makedirs('./word_vectors', exist_ok=True)

with open('./word_vectors/vocab.npy','wb') as f:
    np.save(f, vocab)

with open('./word_vectors/embedding.npy','wb') as f:
    np.save(f, embedding)

In [33]:
# Build the embedding lookup from the NumPy matrix, cast to float32 first.
pretrained_weights = torch.from_numpy(embedding).float()
embedding_layer = nn.Embedding.from_pretrained(pretrained_weights)