# Causal language modeling

Causal language models are frequently used for text generation. You can use these models for creative applications, such as a choose-your-own-adventure text game or an intelligent coding assistant like Copilot or CodeParrot.

In [None]:
pip install transformers datasets evaluate accelerate

In [None]:
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token from the secret stored under the label
# "huggingface_token", so the token itself is never written in the notebook.
huggingface_token = UserSecretsClient().get_secret("huggingface_token")

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub using the token read
# from the secrets store; the training cell pushes the model to the Hub.
login(token=huggingface_token)


**Load ELI5 dataset**

In [None]:
from datasets import load_dataset

# Load the dataset identified by "vishnun0027/eli5_dataset".
eli5 = load_dataset("vishnun0027/eli5_dataset")
# Display the loaded object to inspect what it contains.
eli5

In [None]:
import pandas as pd
# Build a DataFrame from the 'train' split for exploration and filtering.
df = pd.DataFrame(eli5['train'])
# Preview the first five rows.
df.head(5)


In [None]:
# Count how many rows carry each distinct value of the 'category' column.
category_counts = df['category'].value_counts()
category_counts

In [None]:
import plotly.express as px

# Bar chart of the per-category row counts held in category_counts: one bar
# per category, bar height = number of rows. The layout call is chained
# directly onto the chart because update_layout returns the figure itself.
fig = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x': 'Category', 'y': 'Count'},
).update_layout(
    title='Count of Each Category',
    xaxis_title='Category',
    yaxis_title='Count',
    width=600,   # adjust width as needed
    height=400,  # adjust height as needed
)

# Render the figure in the cell output.
fig.show()

**Load Technology dataset**

In [None]:
# Keep only the rows whose 'category' value is exactly 'Technology'.
Technology_df = df[df['category'] == 'Technology']
Technology_df

In [None]:
from datasets import Dataset

# Convert the filtered DataFrame back into a Hugging Face Dataset.
# Technology_df keeps the row labels of the rows selected from `df`, so its
# index is not a plain 0..n-1 range; preserve_index=False stops from_pandas
# from storing that index as an extra "__index_level_0__" column.
Tech_dataset = Dataset.from_pandas(Technology_df, preserve_index=False)
Tech_dataset

In [None]:
# Hold out 20% of the Technology examples as the "test" split. The fixed seed
# makes the shuffle reproducible, so the same rows land in train/test (and the
# reported metrics stay comparable) every time the notebook is re-run.
Tech_dataset = Tech_dataset.train_test_split(test_size=0.2, seed=42)
Tech_dataset

In [None]:
# Look at one raw training example before any preprocessing.
Tech_dataset['train'][0]

**Preprocess**

*The text field is actually nested inside `answers`, so we need to extract the `text` subfield from its nested structure with the `flatten` method.*

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer of the "distilbert/distilgpt2" checkpoint, the same
# checkpoint the model is loaded from in the training section.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

In [None]:
# Flatten the nested columns so the answer texts are reachable under the
# "answers.text" column name that preprocess_function reads.
Tech_dataset = Tech_dataset.flatten()
# Re-inspect the first training example to see the flattened columns.
Tech_dataset["train"][0]

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples from their "answers.text" column.

    Each entry of ``examples["answers.text"]`` is an iterable of strings. The
    strings of one entry are joined with single spaces into one text, and the
    list of joined texts is handed to the notebook-level ``tokenizer``, whose
    output is returned unchanged.
    """
    joined_answers = []
    for answer_texts in examples["answers.text"]:
        joined_answers.append(" ".join(answer_texts))
    return tokenizer(joined_answers)

In [None]:
# Tokenize the dataset in batches using 4 worker processes. Every original
# column (as listed on the train split) is removed, so only the columns
# returned by preprocess_function remain.
tokenized_Tech_dataset = Tech_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=Tech_dataset["train"].column_names,
)

In [None]:
from itertools import chain

# Number of tokens in each chunk produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name (it must include "input_ids") to a
            list of per-example lists, e.g. the token ids of each text.

    Returns:
        A dict with the same keys, where each value is the concatenation of
        that column split into consecutive chunks of ``block_size`` items,
        plus a "labels" key holding a copy of the "input_ids" chunks.
        When the batch holds at least ``block_size`` items, the trailing
        remainder that does not fill a whole chunk is dropped; a batch
        shorter than ``block_size`` is returned as a single shorter chunk.
    """
    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the growing list on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Every column is sliced using the length of the first column.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model
    # supported it. Customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels start out identical to the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized texts into block_size-token chunks with labels,
# processing in batches across 4 worker processes.
lm_dataset = tokenized_Tech_dataset.map(group_texts, batched=True, num_proc=4)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Use the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer defines no pad token
# of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off masked-language-modeling in the collator, which is the
# setting for causal language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

**Train**

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load the "distilbert/distilgpt2" checkpoint (same one as the tokenizer)
# as a causal language model to fine-tune.
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [None]:
# Hyperparameters for fine-tuning; the output directory name is also used
# when pushing to the Hub because push_to_hub=True.
training_args = TrainingArguments(
    output_dir="vishnun0027/tech_clm-model_21042024",
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument, which newer transformers releases no longer accept
    # (requires transformers >= 4.41). Evaluate once per epoch.
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    push_to_hub=True,
    report_to=[],  # no experiment-tracking integrations
)

# NOTE(review): recent transformers releases deprecate Trainer(tokenizer=...)
# in favor of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune, then upload the resulting model to the Hub.
trainer.train()
trainer.push_to_hub()

In [None]:
import math

# Evaluate on the trainer's evaluation dataset and report perplexity,
# computed here as the exponential of the evaluation loss.
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")