# Causal language modeling

Causal language models are frequently used for text generation. use these models for creative applications like choosing your own text adventure or an intelligent coding assistant like Copilot or CodeParrot.

In [1]:
pip install transformers datasets evaluate accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
from kaggle_secrets import UserSecretsClient
huggingface_token = UserSecretsClient().get_secret("huggingface_token")

In [3]:
from huggingface_hub import login

login(token=huggingface_token)


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


**Load ELI5 dataset**

In [4]:
from datasets import load_dataset

eli5 = load_dataset("vishnun0027/eli5_dataset")
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls'],
        num_rows: 105004
    })
})

In [5]:
import pandas as pd
df = pd.DataFrame(eli5['train'])
df.head(5)


Unnamed: 0,q_id,title,selftext,category,subreddit,answers,title_urls,selftext_urls
0,5lchat,Why there was a 'leap second' added to the end...,,Other,explainlikeimfive,"{'a_id': ['dbuoyxl', 'dbur7gi', 'dbuotht'], 't...",[url],[url]
1,5lcjq6,How do you claim undiscovered land?,"If your on a boat, sailing through lets say th...",Other,explainlikeimfive,"{'a_id': ['dbuplm8', 'dbuocvb', 'dbux9vf'], 't...",[url],[url]
2,5lcl43,Why do we fail to do realistic human CGI (like...,"Title pretty much, thanks for answers in advance!",Technology,explainlikeimfive,"{'a_id': ['dbuns7l', 'dbunw2c', 'dbup34d', 'db...",[url],[url]
3,5lcr1h,Why is it that we calm down when we take a dee...,,Biology,explainlikeimfive,"{'a_id': ['dbuusst'], 'text': ['Anxiety/stress...",[url],[url]
4,5lcsyf,Why does 1080p on a 4k TV look better than 108...,,Technology,explainlikeimfive,"{'a_id': ['dbuq0qt', 'dbuqstj'], 'text': ['In ...",[url],[url]


In [6]:
category_counts = df['category'].value_counts()
category_counts

category
Biology          32769
Other            19312
Technology       14034
Physics          10196
Chemistry         6633
Economics         5901
Culture           5446
Engineering       5411
Repost            2375
Mathematics       1912
Earth Science      677
Psychology         338
Name: count, dtype: int64

In [7]:
import plotly.express as px

# Assuming you have already calculated category_counts using value_counts()

# Create a bar chart
fig = px.bar(x=category_counts.index, y=category_counts.values, labels={'x':'Category', 'y':'Count'})

# Update layout and set size
fig.update_layout(title='Count of Each Category', 
                  xaxis_title='Category', 
                  yaxis_title='Count',
                  width=600,  # Adjust width as needed
                  height=400  # Adjust height as needed
                 )

# Show the plot
fig.show()

**Load Technology dataset**

In [8]:
Technology_df = df[df['category'] == 'Technology']
Technology_df

Unnamed: 0,q_id,title,selftext,category,subreddit,answers,title_urls,selftext_urls
2,5lcl43,Why do we fail to do realistic human CGI (like...,"Title pretty much, thanks for answers in advance!",Technology,explainlikeimfive,"{'a_id': ['dbuns7l', 'dbunw2c', 'dbup34d', 'db...",[url],[url]
4,5lcsyf,Why does 1080p on a 4k TV look better than 108...,,Technology,explainlikeimfive,"{'a_id': ['dbuq0qt', 'dbuqstj'], 'text': ['In ...",[url],[url]
7,5ld14u,Why are internet speeds in America so slow,,Technology,explainlikeimfive,"{'a_id': ['dburxe8', 'dbursr8', 'dbuuxnd', 'db...",[url],[url]
12,5ldcnk,Why is ventilation needed for indoor or underg...,,Technology,explainlikeimfive,"{'a_id': ['dbuum0h'], 'text': ['If you were pl...",[url],[url]
21,5ledy7,How does Spotify and Apple Music pay artists?,,Technology,explainlikeimfive,"{'a_id': ['dbv646p'], 'text': ['The streaming ...",[url],[url]
...,...,...,...,...,...,...,...,...
91729,oacein,How was the first TV made?,"When I think about how TVs work now, it seems ...",Technology,explainlikeimfive,"{'a_id': ['h3gk0wf', 'h3gkp0m', 'h3glk0z', 'h3...",[url],[url]
91734,oagt8c,There are tons of phone messages (calls texts ...,,Technology,explainlikeimfive,"{'a_id': ['h3hs45l', 'h3hdoqf', 'h3hphac', 'h3...",[url],[url]
91736,oah68p,How can we stream HDTV through a service like ...,,Technology,explainlikeimfive,"{'a_id': ['h3hfv2y', 'h3hgbyx'], 'text': ['Whe...",[url],[url]
91742,oaktov,"In TV & movies, how do they sync up the animat...",This seems so hard. In reading stuff about The...,Technology,explainlikeimfive,"{'a_id': ['h3i3ukc'], 'text': ['Whoever is dir...",[url],[url]


In [9]:
from datasets import Dataset

Tech_dataset = Dataset.from_pandas(Technology_df)
Tech_dataset

Dataset({
    features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls', '__index_level_0__'],
    num_rows: 14034
})

In [10]:
Tech_dataset = Tech_dataset.train_test_split(test_size=0.2)
Tech_dataset

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls', '__index_level_0__'],
        num_rows: 11227
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers', 'title_urls', 'selftext_urls', '__index_level_0__'],
        num_rows: 2807
    })
})

In [11]:
Tech_dataset['train'][0]

{'q_id': 'fcejkh',
 'title': 'How do our phones know that we’re touching them and exactly where we touch them?',
 'selftext': '',
 'category': 'Technology',
 'subreddit': 'explainlikeimfive',
 'answers': {'a_id': ['fja93ki'],
  'score': [3],
  'text': ['Electrons repel other electrons. This really simple fact is brilliantly exploited by phones. By moving electrons into and out of pixels on the screen, our phones can detect the presence of nearby conductors. Something like metal makes it really easy to move electrons in and out of the screen. Something like rubber or air makes no difference. Flesh is somewhere between the two. The phone detects places where the electrons move in and out of the screen more easily *but not too easily* and knows that a finger or finger-like-thing must be touching the screen there.'],
  'text_urls': [[]]},
 'title_urls': ['url'],
 'selftext_urls': ['url'],
 '__index_level_0__': 67312}

**Preprocess**

*text field is actually nested inside answers need to extract the text subfield from its nested structure with the flatten method*

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [13]:
Tech_dataset = Tech_dataset.flatten()
Tech_dataset["train"][0]

{'q_id': 'fcejkh',
 'title': 'How do our phones know that we’re touching them and exactly where we touch them?',
 'selftext': '',
 'category': 'Technology',
 'subreddit': 'explainlikeimfive',
 'answers.a_id': ['fja93ki'],
 'answers.score': [3],
 'answers.text': ['Electrons repel other electrons. This really simple fact is brilliantly exploited by phones. By moving electrons into and out of pixels on the screen, our phones can detect the presence of nearby conductors. Something like metal makes it really easy to move electrons in and out of the screen. Something like rubber or air makes no difference. Flesh is somewhere between the two. The phone detects places where the electrons move in and out of the screen more easily *but not too easily* and knows that a finger or finger-like-thing must be touching the screen there.'],
 'answers.text_urls': [[]],
 'title_urls': ['url'],
 'selftext_urls': ['url'],
 '__index_level_0__': 67312}

In [14]:
def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [15]:
tokenized_Tech_dataset = Tech_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=Tech_dataset["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/11227 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3553 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1272 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1446 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2480 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/2807 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1099 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1124 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3611 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1754 > 1024). Running this sequence through the model will result in indexing errors


In [16]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [17]:
lm_dataset = tokenized_Tech_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/11227 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2807 [00:00<?, ? examples/s]

In [18]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

2024-04-21 08:29:19.301378: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 08:29:19.301433: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 08:29:19.302933: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**Train**

In [19]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [None]:
training_args = TrainingArguments(
    output_dir="vishnun0027/tech_clm-model_21042024",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    push_to_hub=True,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
trainer.push_to_hub()


Passing the following arguments to `Accelerator` is deprecated and will be removed in version 1.0 of Accelerate: dict_keys(['dispatch_batches', 'split_batches', 'even_batches', 'use_seedable_sampler']). Please pass an `accelerate.DataLoaderConfiguration` instead: 
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Epoch,Training Loss,Validation Loss



Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



In [None]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")