### Installations and imports

In [3]:
#!pip install transformers datasets
!pip install transformers datasets sentencepiece

from transformers import (pipeline, 
                          AutoModel, 
                          AutoTokenizer, 
                          AutoModelForSeq2SeqLM, 
                          AutoModelForCausalLM, 
                          DataCollatorForLanguageModeling, 
                          TrainingArguments, 
                          Trainer,
)

from datasets import load_dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Load model and dataset, extract questions

In [7]:
# Load pre-trained model from the HuggingFace Hub
# The tokenizer and the causal-LM weights are both fetched by the same
# checkpoint name, so changing model_2_name switches both together.

# Alternative checkpoint, currently disabled:
#model_2_name = "microsoft/DialoGPT-medium"
model_2_name = "distilgpt2"

tokenizer_2 = AutoTokenizer.from_pretrained(model_2_name)
model_2 = AutoModelForCausalLM.from_pretrained(model_2_name)

# Set pad token to eos token
# (presumably because this checkpoint's tokenizer ships without a pad token,
# which the padding data collator used for training needs -- confirm)
tokenizer_2.pad_token = tokenizer_2.eos_token

# LOAD DATASET
# Read the local file "sentences.csv" with the csv loader and expose it as
# the "train" split of a DatasetDict.
data_files = {"train": "sentences.csv"}
raw_datasets = load_dataset("csv", data_files=data_files)



  0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# REMOVE THE UNNECESSARY COLUMNS
# Drop the label and the per-sentence statistics columns listed below; the
# later cells only read the 'sentence' and 'author' columns.
# NOTE(review): this cell overwrites raw_datasets['train'], so running it a
# second time asks remove_columns for columns that are already gone --
# likely an error; reload the dataset before re-running (confirm).

useless_cols = ['label','word_count', 'mean_word_length', 'stop_words_ratio', 'stop_words_count', 'ADJ_count', 'ADV_count', 'ADP_count', 'AUX_count', 'DET_count', 'NUM_count', 'X_count', 'INTJ_count', 'CONJ_count', 'CCONJ_count', 'SCONJ_count', 'PROPN_count', 'NOUN_count', 'PRON_count', 'PART_count', 'VERB_count']
raw_datasets['train'] = raw_datasets['train'].remove_columns(useless_cols)

# Check dataset
#raw_datasets

# Check UNIQUE authors
# Last expression of the cell: displays the distinct values of 'author'.
raw_datasets['train'].unique('author')

['Kant', 'Hume', 'Plato', 'Aristotle', 'Nietzsche']

In [9]:
# EXTRACT QUESTIONS from a given author

def _is_nietzsche_question(example):
  """Row predicate: the sentence contains a "?" and the author is Nietzsche."""
  # The sentence is inspected first, then the author, in that order.
  if "?" not in example['sentence']:
    return False
  return example['author'] == 'Nietzsche'

# Keep only the rows accepted by the predicate.
questions = raw_datasets['train'].filter(_is_nietzsche_question)

# Report how many rows survived, then display the first five sentences.
print(f"This dataset contains {questions.num_rows} questions. For example: ")
questions['sentence'][:5]

  0%|          | 0/108 [00:00<?, ?ba/s]

This dataset contains 2412 questions. For example: 


['For, why is the triumph of Nihilism inevitable now?',
 'Nihilism is at our door: whence comes this most gruesome of all guests to us?--To begin with, it is a mistake to point to social evils, physiological degeneration, or even to corruption as a cause of Nihilism.',
 'Against purposelessness on the one hand, against moral valuations on the other: how far has all science and philosophy been cultivated heretofore under the influence of moral judgments?',
 'And have we not got the additional factor--the enmity of science, into the bargain?',
 'Or the prejudice against science?']

In [10]:
### Update dataset
# From here on the 'train' split holds only the filtered question rows.
# NOTE(review): this replaces the full split loaded earlier, so the earlier
# cells cannot be re-run against the original data without reloading it.
raw_datasets['train'] = questions

### Tokenization

In [11]:
# TOKENIZATION

# Create a function that tokenizes the sentences of the Dataset; it is applied batch-wise below
def tokenize_function(example):
  """
  Tokenize the "sentence" field of `example` with tokenizer_2.

  Truncation is enabled; the tokenizer's output is returned unchanged.
  """
  sentences = example["sentence"]
  encoded = tokenizer_2(sentences, truncation=True)
  return encoded

# Apply the tokenizer function to every split of the DatasetDict.
# The tokenized dataset gets the new columns ['input_ids', 'attention_mask']
# that will be used for training; the original columns are removed.

# NOTE(review): column_names is defined here but not referenced by the map
# call below, which removes raw_datasets["train"].column_names instead.
column_names = ['sentence', 'author']

tokenized_datasets = raw_datasets.map(
    tokenize_function, 
    batched=True,  # the function receives many rows per call instead of one
    num_proc=4,  # run the mapping in 4 worker processes
    remove_columns=raw_datasets["train"].column_names  # drop every pre-existing column
    )

        

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Display the tokenized DatasetDict to check its columns and row count.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2412
    })
})

### Training the model (fine-tuning)

In [13]:
### TRAIN-TEST SPLIT
# Keep 90% of the tokenized rows for training and hold out the rest for
# evaluation; the fixed seed makes the split repeatable.

split_datasets = tokenized_datasets["train"].train_test_split(train_size=0.9, seed=42)

In [14]:
### TRAINING
# Rebind the working name to the train/test split so the Trainer below can
# read its "train" and "test" entries.
# NOTE(review): after this line tokenized_datasets no longer refers to the
# un-split dataset, so re-running the split cell would split the 90% training
# portion again. Restart the kernel (or re-run tokenization) before doing so.
tokenized_datasets = split_datasets

# Select DataCollator:
# mlm=False turns off masked-language-model masking in the collator, which is
# what a causal LM such as the one loaded above is trained with.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_2, mlm=False)

training_args = TrainingArguments(
    # Final weights are written to the current directory; the generation cell
    # later reloads the model from "./".
    output_dir="./",
    # Boolean flag. The string 'True' only worked because any non-empty
    # string is truthy ('False' would have enabled it as well).
    overwrite_output_dir=True,
    num_train_epochs=8,
    # Evaluate on the held-out split after every epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    # No intermediate checkpoints; the model is saved explicitly after training.
    save_strategy='no',
)

trainer = Trainer(
    model=model_2,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

In [15]:
# Fine-tune the model, then save the resulting weights and config
# (the log below shows them being written to ./).
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 2170
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2176
  Number of trainable parameters = 81912576
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,4.55136
2,4.573200,4.502758
3,4.573200,4.499616
4,4.229100,4.497686
5,4.229100,4.507396
6,4.081500,4.50798
7,4.081500,4.516804
8,3.977900,4.519958


***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8
***** Running Evaluation *****
  Num examples = 242
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./
Configuration saved in ./config.json
Model weights saved in ./pytorch_model.bin


### Question Generator

In [16]:
# GENERATE QUESTION WITH FINE-TUNED MODEL

# Load fine-tuned model
# The weights are read back from disk rather than reusing the in-memory
# model_2 object that was trained above.
model = AutoModelForCausalLM.from_pretrained("./") # loads the weights from current directory './'
# The tokenizer was not changed by fine-tuning, so the same instance is reused.
tokenizer = tokenizer_2

# Helper: TRUNCATE strings to QUESTION mark
def questionTruncate(s: str):
  """
  Trim generated text down to a clean question or sentence.

  Input: string s
  Returns:
    - s cut right after its FIRST "?" when it contains one;
    - otherwise s cut right after its LAST "." when it contains one;
    - otherwise s unchanged.
  """
  # A question mark takes priority over any full stop.
  question_end = s.find("?")
  if question_end != -1:
    return s[: question_end + 1]

  # No question mark: keep everything up to and including the final full stop.
  sentence_end = s.rfind(".")
  if sentence_end != -1:
    return s[: sentence_end + 1]

  # Neither punctuation mark is present.
  return s

# Test
#for s in ['Multiple? more than one?', 'First? Then no question.', 'No punctuation', 'No question.']:
#  print(questionTruncate(s))


# QUESTION GENERATOR

def questionGenerator(text: str, max_new_tokens: int = 80, truncate: bool = False):
  """
  Sample a continuation of `text` from the fine-tuned model.

  Input:
    text: prompt string; the tokenizer's end-of-sequence token is appended
      to it before encoding.
    max_new_tokens: upper bound on the number of tokens generated after the
      prompt (default 80).
    truncate: when True, the decoded reply is passed through questionTruncate
      before being returned; the default False returns the full reply.
  Returns:
    The generated continuation decoded to a string, without the prompt and
    without special tokens. Sampling is enabled, so repeated calls with the
    same prompt can return different text.
  """
  # ENCODE input and add 'end-of-string' token
  input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
  # input_ids has a leading batch dimension of size 1, so len(input_ids) is
  # always 1; the prompt length is the size of the last dimension.
  prompt_len = input_ids.shape[-1]

  # GENERATE
  generated_ids = model.generate(
      input_ids,
      # max_length counts the prompt tokens too, hence the offset.
      max_length=prompt_len + max_new_tokens,
      do_sample=True,
      top_p=0.91,
      top_k=10,
      temperature=0.75,
      early_stopping=True,
      pad_token_id=tokenizer.eos_token_id
  )

  # DECODE only the tokens that follow the prompt
  output = tokenizer.decode(generated_ids[0, prompt_len:], skip_special_tokens=True)
  if truncate:
    return questionTruncate(output)
  return output

loading configuration file ./config.json
Model config GPT2Config {
  "_name_or_path": "./",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformer

In [17]:
# Try the generator on a few hand-written prompts.
demo_prompts = [
    'What does Nihilism mean?',
    'What is Pessimism',
    'What is the purpose?',
    'Prejudice against science?',
]
divider = 100 * '-'

for text in demo_prompts:
  reply = questionGenerator(text)
  print(f"Text: {text} \n\nBot: {reply} \n", divider)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What does Nihilism mean? 

Bot: A new, more modern and more modern world is about to dawn. What is the meaning of that term?The meaning of a philosopher is not just a matter of truth; it is a question of what meaning does it signify to you.In order to understand what is true and why, you must first understand why.What is the meaning of a philosopher?The 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is Pessimism 

Bot: What is the meaning of the word of the German god?A German god is a man who, after all, is a god, and thus is an ideal. But what does that mean?Because it is a question of what is the meaning of a god, and what is it that makes it so, that is to say, the most important thing to all the artists 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is the purpose? 

Bot: A new type of music has arrived: what is music?A new type of music is called music. But what is music?What is music?It is the meaning of the word music.It is a term that means to say to the world that music is a musical invention, a phenomenon that is not necessarily a natural or natural phenomenon, but a result of a combination 
 ----------------------------------------------------------------------------------------------------
Text: Prejudice against science? 

Bot: The following is a short story about the history of Christianity: why did God ever seek to rule over the world?What do I mean by that?... Why is it that the word Christian means the same thing to me as the meaning of a Christian god?...Why?Because of the fact that I am not a Christian, I am not a Christian. I am 
 ----------------------------------------------------------------------------------------------------


In [18]:
# Test
# Same prompts as the previous run; the generator samples, so the replies differ.
sample_prompts = (
    'What does Nihilism mean?',
    'What is Pessimism',
    'What is the purpose?',
    'Prejudice against science?',
)
rule = '-' * 100

for text in sample_prompts:
  answer = questionGenerator(text)
  print(f"Text: {text} \n\nBot: {answer} \n", rule)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What does Nihilism mean? 

Bot: This is the first question: What is the meaning of the word ‪--the word that is to be applied to the meaning of a person. This is the meaning of the word which is the word of the originator of the word: the meaning of a moral imperative.What is it that determines the meaning of a moral imperative?The meaning of the word 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is Pessimism 

Bot: I have always liked the word pessimism. What are they?The term pessimism is usually used for the goal of the ideal. What is it about this ideal?The meaning of the word pessimism is to be understood as an ideal for all things.What is the meaning of the word pessimism?It is to be understood as the meaning of the word pessimism 
 ----------------------------------------------------------------------------------------------------


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Text: What is the purpose? 

Bot: What is the meaning of the word, the meaning of the word, the meaning of the word, the meaning of the word?...What is the meaning of the word?...What is the meaning of the word?...What is the meaning of the word?...What is the meaning of the word?...What is the meaning of the word?...What is the 
 ----------------------------------------------------------------------------------------------------
Text: Prejudice against science? 

Bot: The German philosopher, who had been a pupil for many years, has made a great error in his interpretation of the meaning of the word, and has not quite understood it quite so much as I have already explained it.What is the meaning of the word?The term meaning, of course, comes from a Greek meaning, and is perhaps the most common form of the 
 ----------------------------------------------------------------------------------------------------
