# Trying to understand how to use transformers yay

In [1]:
%pip install transformers datasets torch accelerate





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [2]:
import pandas as pd

df = pd.read_csv('./archive/casual_data_windows.csv')

print(df.head())


   Unnamed: 0                                                  0  \
0           0            What kind of phone(s) do you guys have?   
1           1  I have a pixel. It's pretty great. Much better...   
2           2       Does it really charge all the way in 15 min?   
3           3            What kind of phone(s) do you guys have?   
4           4  Samsung Galaxy J1. It's my first cell phone an...   

                                                   1  \
0  I have a pixel. It's pretty great. Much better...   
1       Does it really charge all the way in 15 min?   
2  Pretty fast. I've never timed it, but it's und...   
3  Samsung Galaxy J1. It's my first cell phone an...   
4  What do you think of it? Anything you don't like?   

                                                   2  
0       Does it really charge all the way in 15 min?  
1  Pretty fast. I've never timed it, but it's und...  
2  cool. I've been thinking of getting one, my ph...  
3  What do you think of it? Anythi

In [3]:
def format_conversation(row):
    if pd.notna(row['2']):
        return f"{row['0']} [SEP] {row['1']} [SEP] {row['2']}"
    else:
        return f"{row['0']} [SEP] {row['1']}"

df['formatted_text'] = df.apply(format_conversation, axis=1)

print(df['formatted_text'].head())


0    What kind of phone(s) do you guys have? [SEP] ...
1    I have a pixel. It's pretty great. Much better...
2    Does it really charge all the way in 15 min? [...
3    What kind of phone(s) do you guys have? [SEP] ...
4    Samsung Galaxy J1. It's my first cell phone an...
Name: formatted_text, dtype: object


In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer.add_special_tokens({'sep_token': '[SEP]'})
tokenizer.pad_token = tokenizer.eos_token

model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    return tokenizer(examples['formatted_text'], return_special_tokens_mask=True, truncation=True, padding='max_length', max_length=512)

tokenized_datasets = df['formatted_text'].apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=512))


In [6]:
import torch
from torch.utils.data import Dataset

class ConversationDataset(Dataset):
    def __init__(self, tokenized_texts):
        self.input_ids = [torch.tensor(t['input_ids']) for t in tokenized_texts]
        self.attn_masks = [torch.tensor(t['attention_mask']) for t in tokenized_texts]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attn_masks[idx],
            'labels': self.input_ids[idx],  
        }


train_dataset = ConversationDataset(tokenized_datasets)


In [9]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments


training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none", 
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,  
)


trainer.train()




  0%|          | 0/42225 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:

trainer.evaluate()


In [None]:

model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dataset = {
    "hello": "Hello! How can I help you?",
    "how are you": "I'm just a bot, but thanks for asking! How about you?",
    "what's your name": "I'm an AI chatbot. You can call me Bot!",
    "what is the time": "I'm sorry, I can't tell time. But you can check your system clock!",
    "bye": "Goodbye! Have a nice day!"
}

vectorizer = TfidfVectorizer()

questions = list(dataset.keys())

X = vectorizer.fit_transform(questions)

def get_response(user_input):
    user_input_vector = vectorizer.transform([user_input])

    similarities = cosine_similarity(user_input_vector, X).flatten()

    best_match_index = np.argmax(similarities)

    if similarities[best_match_index] < 0.2:  
        return "I'm sorry, I didn't understand that. Could you rephrase?"
    
    # Return the best response from the dataset
    best_match_question = questions[best_match_index]
    return dataset[best_match_question]

def chatbot():
    print("Hello! I'm your AI chatbot. How can I help you today?")
    
    while True:
        user_input = input("You: ").lower()
        
        if user_input in ["bye", "exit", "quit"]:
            print("Chatbot: Goodbye! Have a great day!")
            break
        
        response = get_response(user_input)
        print(f"Chatbot: {response}")

if __name__ == "__main__":
    chatbot()


Hello! I'm your AI chatbot. How can I help you today?
Chatbot: I'm sorry, I didn't understand that. Could you rephrase?
Chatbot: I'm just a bot, but thanks for asking! How about you?
Chatbot: I'm sorry, I didn't understand that. Could you rephrase?
Chatbot: I'm sorry, I didn't understand that. Could you rephrase?


In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

data = pd.read_csv('./archive/casual_data_windows.csv')

data.head()
data['0'] = data['0'].fillna('')
data['1'] = data['1'].fillna('')

# Extract conversations
utterances_A = data['0'].values
utterances_B = data['1'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(utterances_A) + list(utterances_B))

utterances_A_seq = tokenizer.texts_to_sequences(utterances_A)
utterances_B_seq = tokenizer.texts_to_sequences(utterances_B)

max_length = max([len(seq) for seq in utterances_A_seq])
utterances_A_seq = pad_sequences(utterances_A_seq, maxlen=max_length, padding='post')
utterances_B_seq = pad_sequences(utterances_B_seq, maxlen=max_length, padding='post')

vocab_size = len(tokenizer.word_index) + 1

latent_dim = 256  
embedding_dim = 100 

encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedding)

encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(max_length,))
decoder_embedding = Embedding(vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

decoder_target_data = np.expand_dims(utterances_B_seq, -1)

model.fit([utterances_A_seq, utterances_B_seq], decoder_target_data, batch_size=64, epochs=1, validation_split=0.2)

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]

decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

def generate_response(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tokenizer.word_index['<start>']
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tokenizer.index_word[sampled_token_index]
        decoded_sentence += ' ' + sampled_word
        
        if sampled_word == '<end>' or len(decoded_sentence) > max_length:
            stop_condition = True
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        states_value = [h, c]
    
    return decoded_sentence

input_sentence = "Hello, how are you?"
input_sequence = tokenizer.texts_to_sequences([input_sentence])
input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding='post')
response = generate_response(input_sequence)
print("Chatbot response:", response)




[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1498s[0m 2s/step - loss: 1.8053 - val_loss: 0.7181
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step


KeyError: '<start>'

In [7]:
input_sentence = "Hello, how are you?"
input_sequence = tokenizer.texts_to_sequences([input_sentence])
input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding='post')
response = generate_response(input_sequence)
print("Chatbot response:", response)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


KeyError: '<start>'