GitHub Url: https://github.com/yashakhanna/NLPmodel

Video : https://drive.google.com/file/d/1vsoJNrIrVJY8AKQdU8CVQQoUzvwuW3jW/view?usp=sharing

In [23]:
import pandas as pd

# Define the project checklist data
checklist_data = {
    "Tasks and Comments": [
        "Preprocessing Steps",
        "Training - Model built, train and test",
        "Evaluation - ROUGE-L Score, BERT Score",
        "1st round of tuning",
        
        "Next steps Recommended"
    ],
    "Status": ["Done", "Done", "Done",  "Done", "Done"],
    "Individual Responsible": [
        "Rohit",
        "Rohit",
        "Rohit",
        "Akashdeep Choudhary",
        
        "Akashdeep Choudhary"
    ]
}

# Create a DataFrame
checklist_df = pd.DataFrame(checklist_data)

# Set the column name to the heading
checklist_df.columns.name = "Transfer learnt model built on a pretrained LLM such as GPT-2"

# Display the checklist DataFrame
print("Project Checklist")
print(checklist_df.to_string(index=False))


Project Checklist
                    Tasks and Comments Status Individual Responsible
                   Preprocessing Steps   Done                  Rohit
Training - Model built, train and test   Done                  Rohit
Evaluation - ROUGE-L Score, BERT Score   Done                  Rohit
                   1st round of tuning   Done    Akashdeep Choudhary
                Next steps Recommended   Done    Akashdeep Choudhary


In [2]:
!pip install faiss-cpu





In [3]:
import numpy as np
import pandas as pd
import re

# Lets Load the data
data = pd.read_csv("C:/Users/dhill/Downloads/Chat Data/Chat Data/chat_data.csv")
DF  = data.sample(frac=0.1, random_state=42) 

In [4]:
import warnings

def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()

with warnings.catch_warnings(action="ignore"):
    fxn()

## Main  code block

In [5]:
!pip install emoji nltk pandas faiss-cpu sentence-transformers transformers streamlit

import numpy as np
import pandas as pd
import re
import json
import emoji
import faiss
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline

# Downloading required NLTK data
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initializing lemmatizer and stop words...
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Loading the data and sample a fraction
data = pd.read_csv("C:/Users/dhill/Downloads/Chat Data/Chat Data/chat_data.csv")
df = data.sample(frac=0.1, random_state=42)
DF = df.copy()

# Correct formatting function...
def correct_formatting(json_str):
    if json_str == 'null':
        return None
    json_str = json_str.replace('"', "'")  
    json_str = re.sub(r"'(from|human|gpt|value)'", r'"\1"', json_str)  
    json_str = json_str.replace("\\'", "'")
    json_str = json_str.replace("}\n {", "},{")
    json_str = json_str.replace(": '", ': "')
    json_str = json_str.replace("'}", '"}')
    return json_str

# Parsing function with error handling....
def manual_parse(convo):
    if convo is None:
        return None
    try:
        parsed_convo = json.loads(convo)
        return parsed_convo
    except json.JSONDecodeError as e:
        return None

# Processing rows...
def process_row(row):
    formatted = correct_formatting(row['conversations'])
    parsed = manual_parse(formatted)
    return parsed

# Applying the function to each row and creating a new column for parsed conversations
DF['parsed_conversations'] = DF.apply(process_row, axis=1)

parsed_data = DF[DF['parsed_conversations'].notnull()].copy()
parsed_data = parsed_data.drop(columns=['conversations'])

# Extracting conversations into input-response pairs
def extract_conversations(conversations):
    input_texts = []
    response_texts = []
    for i in range(len(conversations) - 1):
        if conversations[i]['from'] == 'human' and conversations[i + 1]['from'] == 'gpt':
            input_texts.append(conversations[i]['value'])
            response_texts.append(conversations[i + 1]['value'])
    return input_texts, response_texts

parsed_data['input_texts'], parsed_data['response_texts'] = zip(*parsed_data['parsed_conversations'].apply(extract_conversations))

# Handling contractions
slangs_url = 'https://raw.githubusercontent.com/bodhwani/NLP-VIT-BOT/master/slangs.csv'
slangs = pd.read_csv(slangs_url)
slangs_dict = dict(zip(slangs['Abbr'], slangs['Fullform']))

def handle_negations(text):
    negation_words = set(["not", "no", "never", "none", "n't"])
    words = word_tokenize(text)
    new_words = []
    negate = False
    for word in words:
        if word in negation_words:
            negate = True
        elif negate:
            word = "NOT_" + word
            negate = False
        new_words.append(word)
    return ' '.join(new_words)

# Flattening the lists of lists...
parsed_data = parsed_data.explode(['input_texts', 'response_texts']).reset_index(drop=True)

# Further.... cleaning the text
def preprocess_text(text, apply_ner=False):
    if pd.isnull(text):
        return ""

    # converting to string....
    text = str(text)

    # lowercasing....
    text = text.lower()

    # replacing the emojis..
    text = emoji.demojize(text)

    # handling hashtags..
    text = re.sub(r'#(\w+)', r'\1', text)

    # removing mentions, URLs, and non-alphanumeric characters except for apostrophes..
    text = re.sub(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9\s']", ' ', text)

    # removing extra spaces...
    text = re.sub(r'\s+', ' ', text).strip()

    # handling negations..
    text = handle_negations(text)

    # tokenization..
    words = word_tokenize(text)

    # replacing slang and abbreviations...
    words = [slangs_dict[word] if word in slangs_dict else word for word in words]

    # removing stop words and lemmatization....
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    return ' '.join(words)

# Applying further cleaning.... 
parsed_data['input_texts'] = parsed_data['input_texts'].apply(preprocess_text)
parsed_data['response_texts'] = parsed_data['response_texts'].apply(preprocess_text)

# Saving the cleaned data....
parsed_data.to_csv("chatbot_conversations.csv", index=False)

# Initializing the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Creating embeddings for the input texts
input_embeddings = model.encode(parsed_data['input_texts'].tolist())

# Creating FAISS index
index = faiss.IndexFlatL2(input_embeddings.shape[1])
index.add(input_embeddings)

# Saving the index
faiss.write_index(index, "chatbot_faiss.index")

# Loading the FAISS index
index = faiss.read_index("chatbot_faiss.index")

# Loading the input-response pairs
parsed_data = pd.read_csv("chatbot_conversations.csv")

# Function to find the most similar input text and its response.....
def find_response(user_input):
    user_embedding = model.encode([user_input])
    _, indices = index.search(user_embedding, k=1)
    response = parsed_data['response_texts'][indices[0][0]]
    return response

# Initializing the HuggingFace LLM....
generator = pipeline('text-generation', model='distilgpt2')

# Function to generate a response using RAG....
def generate_response(user_input):
    retrieved_response = find_response(user_input)
    input_text = f"User: {user_input}\nChatbot: {retrieved_response}"
    generated_response = generator(input_text, max_length=100, num_return_sequences=1)[0]['generated_text']
    return generated_response




  from tqdm.autonotebook import tqdm, trange





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhill\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dhill\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhill\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:

# Example usage.....
user_input = "I'm feeling really sad lately."
response = generate_response(user_input)
print(response)



User: I'm feeling really sad lately.
Chatbot: 'm sorry hear 're feeling sad could tell 's going incredibly difficult time feeling let's find way make feel way way


# Lets do fine tuning

In [11]:
import pandas as pd

# Loading the cleaned data
parsed_data = pd.read_csv("chatbot_conversations.csv")

# Sample (10%) of the dataset...
parsed_data_sample = parsed_data.sample(frac=0.01, random_state=42)

# Preparing the dataset for training...
train_data = []
for input_text, response_text in zip(parsed_data_sample['input_texts'], parsed_data_sample['response_texts']):
    train_data.append(f"User: {input_text} Chatbot: {response_text}")

# Saving the prepared dataset...
with open("train_data.txt", "w") as f:
    for item in train_data:
        f.write("%s\n" % item)


In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Loading the dataset..
dataset = load_dataset('text', data_files={'train': 'train_data.txt'})

# Initializing the tokenizer...
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')

# Add a padding token if not present...
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenizing the dataset...
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Adding labels (same as input_ids) to the dataset
def add_labels(examples):
    examples['labels'] = examples['input_ids'].copy()
    return examples

tokenized_datasets = tokenized_datasets.map(add_labels, batched=True)

# Updating model embeddings with the new padding token...
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/803 [00:00<?, ? examples/s]

Map:   0%|          | 0/803 [00:00<?, ? examples/s]

Embedding(50258, 768)

In [15]:
# Defining the training arguments with reduced epochs and batch size
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  
    per_device_train_batch_size=1,  
    save_steps=1000,  
    save_total_limit=2,
    fp16=True,  )

# Initializing the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
)

# Fine-tuning the model
trainer.train()


  0%|          | 0/803 [00:00<?, ?it/s]

{'loss': 0.6323, 'grad_norm': 1.158944010734558, 'learning_rate': 1.8866749688667497e-05, 'epoch': 0.62}
{'train_runtime': 1127.3474, 'train_samples_per_second': 0.712, 'train_steps_per_second': 0.712, 'train_loss': 0.5851014399736341, 'epoch': 1.0}


TrainOutput(global_step=803, training_loss=0.5851014399736341, metrics={'train_runtime': 1127.3474, 'train_samples_per_second': 0.712, 'train_steps_per_second': 0.712, 'total_flos': 104910645362688.0, 'train_loss': 0.5851014399736341, 'epoch': 1.0})

In [16]:
# Saving the model and tokenizer
model.save_pretrained("./fine-tuned-mod")
tokenizer.save_pretrained("./fine-tuned-mod")


('./fine-tuned-mod\\tokenizer_config.json',
 './fine-tuned-mod\\special_tokens_map.json',
 './fine-tuned-mod\\vocab.json',
 './fine-tuned-mod\\merges.txt',
 './fine-tuned-mod\\added_tokens.json')

# Testing


In [18]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline

# Load the fine-tuned model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("./fine-tuned-mod")
model = GPT2LMHeadModel.from_pretrained("./fine-tuned-mod")
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Loading the FAISS index
index = faiss.read_index("chatbot_faiss.index")

# Loading the input-response pairs
parsed_data = pd.read_csv("chatbot_conversations.csv")

# Initializing the embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Function to find the most similar input text and its response
def find_response(user_input):
    user_embedding = embedder.encode([user_input])
    _, indices = index.search(user_embedding, k=1)
    response = parsed_data['response_texts'][indices[0][0]]
    return response

# Function to generate a response using RAG
def generate_response(user_input):
    retrieved_response = find_response(user_input)
    input_text = f"User: {user_input}\nChatbot: {retrieved_response}"
    generated_response = generator(input_text, max_length=100, num_return_sequences=1)[0]['generated_text']
    return generated_response

# Defining question....
question = "I'm feeling really sad lately."

# Generating the response
response = generate_response(question)
print(f"Chatbot: {response}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Chatbot: User: I'm feeling really sad lately.
Chatbot: 'm sorry hear 're feeling sad could tell 's going wrong might help take care need help Chatbot: sound like feeling isolated moment make feel individual individual individual individual individual able tell individual


# Scores

In [16]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, pipeline
from rouge_score import rouge_scorer
import bert_score

# Loading the fine-tuned model and tokenizer
evaluation_tokenizer = GPT2Tokenizer.from_pretrained("./fine-tuned-model4")
evaluation_model = GPT2LMHeadModel.from_pretrained("./fine-tuned-model4")
evaluation_generator = pipeline('text-generation', model=evaluation_model, tokenizer=evaluation_tokenizer)

# Loading the cleaned data
parsed_data_for_evaluation = pd.read_csv("chatbot_conversations.csv")

# Sample 10% of the dataset for evaluation....
evaluation_sample = parsed_data_for_evaluation.sample(frac=0.1, random_state=42)

eval_input_texts = evaluation_sample['input_texts'].tolist()
eval_reference_responses = evaluation_sample['response_texts'].tolist()

# Function to generate responses for evaluation....
def generate_responses_for_evaluation(model_pipeline, eval_input_texts, max_length=100):
    eval_responses = []
    for eval_text in eval_input_texts:
        generated_response = model_pipeline(eval_text, max_length=max_length, num_return_sequences=1)[0]['generated_text']
        eval_responses.append(generated_response)
    return eval_responses

# Functions to compute evaluation metrics.....
def calculate_rouge_l(predictions, references):
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_scores = [rouge_scorer_instance.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(references, predictions)]
    average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)
    return average_rouge_l

def calculate_bert_score(predictions, references):
    P, R, F1 = bert_score.score(predictions, references, lang='en', rescale_with_baseline=True)
    avg_precision = P.mean().item()
    avg_recall = R.mean().item()
    avg_f1 = F1.mean().item()
    return avg_precision, avg_recall, avg_f1

# Generating responses for the evaluation inputs....
eval_model_responses = generate_responses_for_evaluation(evaluation_generator, eval_input_texts)

# Computing ROUGE-L score
rouge_l_score = calculate_rouge_l(eval_model_responses, eval_reference_responses)
print(f"ROUGE-L Score: {rouge_l_score:.4f}")

# Computing BERTScore
bert_precision, bert_recall, bert_f1 = calculate_bert_score(eval_model_responses, eval_reference_responses)
print(f"BERTScore - Precision: {bert_precision:.4f}, Recall: {bert_recall:.4f}, F1: {bert_f1:.4f}")

# Example interaction....
print("Example interaction:")
sample_input = "I'm feeling really sad lately. What should I do?"
generated_response = evaluation_generator(sample_input, max_length=100, num_return_sequences=1)[0]['generated_text']
print("User:", sample_input)
print("Chatbot:", generated_response)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyboardInterrupt: 

In [None]:
# Next step recomendation....
my model is interpreting the given input but cant able to to provide the efficient and point to context answer even after fine tuning so need to be trained again with more data for better performance and may need to do preprocessing again
NOTE : I printed the Rouge-L score and bert score already but while submitting i just press rerun and now i have to rerun the whole model and it will take more than 3 hours for whole model.


In [21]:
!jupyter nbconvert --to html nlp4.4.1.ipynb


[NbConvertApp] Converting notebook nlp4.4.1.ipynb to html
[NbConvertApp] Writing 678393 bytes to nlp4.4.1.html
