In [1]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer, AdamWeightDecay, pipeline, create_optimizer
from transformers import DefaultDataCollator
import tensorflow as tf
from datasets import Dataset, DatasetDict, load_dataset
import plotly.express as px
import plotly.io as pio
import pandas as pd
import math
import os
# Make Plotly's 'notebook_connected' renderer the default for figures in this notebook.
pio.renderers.default = 'notebook_connected'
# Set the tokenizers parallelism switch to "false" up front; the dataset `map`
# calls further down already spread the work over several processes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Load the DistilGPT-2 tokenizer and reuse its end-of-sequence token as the
# padding token, so fixed-length padded batches can be produced later.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the TensorFlow causal-LM model from the same checkpoint and give it the
# same token id as its pad id, keeping model and tokenizer in agreement.
model = TFAutoModelForCausalLM.from_pretrained(
    "distilgpt2",
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
# Load the "train" split of the CShorten/ML-ArXiv-Papers dataset, then echo the
# dataset object so the cell output shows its summary.
data = load_dataset("CShorten/ML-ArXiv-Papers", split="train")
data

In [None]:
# Shuffle with a fixed seed and hold out 20% of the examples for validation.
# NOTE(review): this rebinds `data` from the loaded dataset to the split result,
# so re-run the loading cell before re-running this one.
data = data.train_test_split(test_size=0.2, shuffle=True, seed=200)

# The held-out "test" part of the split serves as the validation set.
train, val = data["train"], data["test"]

In [None]:
# The tokenization function
def tokenization(data, max_length=300):
    """Tokenize the ``abstract`` field to a fixed sequence length.

    Every abstract is truncated and padded to exactly ``max_length`` tokens
    using the notebook-level ``tokenizer``.

    Args:
        data: A mapping with an ``"abstract"`` entry (a string, or a list of
            strings when called by ``Dataset.map(..., batched=True)``).
        max_length: Target sequence length in tokens. Defaults to 300, the
            value previously hard-coded here; override it through
            ``Dataset.map(..., fn_kwargs={"max_length": ...})``.

    Returns:
        Whatever the tokenizer returns for the given text(s).
    """
    return tokenizer(
        data["abstract"],
        padding="max_length",  # pad every example up to max_length
        truncation=True,       # cut longer abstracts down to max_length
        max_length=max_length,
    )

# Tokenize both splits batch-wise across 10 worker processes. The listed source
# columns are removed, leaving only what the tokenization function returns.
train_token = train.map(
    tokenization,
    batched=True,
    num_proc=10,
    remove_columns=["title", "abstract", "Unnamed: 0", "Unnamed: 0.1"],
)
val_token = val.map(
    tokenization,
    batched=True,
    num_proc=10,
    remove_columns=["title", "abstract", "Unnamed: 0", "Unnamed: 0.1"],
)

In [None]:
# Create labels from input_ids, excluding padded positions from the loss
def create_labels(text):
    """Attach a ``labels`` entry for causal language modelling.

    Labels start as a copy of ``input_ids``. When an ``attention_mask`` is
    present, every position whose mask value is 0 (padding) is replaced by
    -100, the label value the Hugging Face language-modelling loss ignores.
    This matters here because the padding token is the EOS token: without
    masking, the loss would also be computed on the padded tail of every
    example.

    Args:
        text: A mapping with ``"input_ids"`` and, optionally,
            ``"attention_mask"``. Each may be one sequence or a batch
            (sequence of sequences), as passed by ``Dataset.map``.

    Returns:
        The same mapping with a new ``"labels"`` entry shaped like
        ``"input_ids"``. ``"input_ids"`` itself is left unmodified.
    """
    ignore_index = -100

    def _labels_for(ids, mask):
        # Without a mask there is nothing to tell padding apart from text,
        # so fall back to a plain copy.
        if mask is None:
            return list(ids)
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    input_ids = text["input_ids"]
    attention = text["attention_mask"] if "attention_mask" in text else None

    # A batch is a sequence whose elements are themselves sequences.
    is_batched = len(input_ids) > 0 and hasattr(input_ids[0], "__len__")

    if not is_batched:
        text["labels"] = _labels_for(input_ids, attention)
    elif attention is None:
        # Copy each row so labels never alias the input_ids rows.
        text["labels"] = [list(ids) for ids in input_ids]
    else:
        text["labels"] = [
            _labels_for(ids, mask) for ids, mask in zip(input_ids, attention)
        ]
    return text

# Run create_labels over both tokenized splits (batched, 10 worker processes)
# to obtain the datasets that carry a "labels" column.
lm_train = train_token.map(
    create_labels,
    batched=True,
    num_proc=10,
)
lm_val = val_token.map(
    create_labels,
    batched=True,
    num_proc=10,
)