In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# **Fine-tune the flan-t5-small model**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset


In [None]:
# Load the dataset
df = pd.read_excel('/content/Query Dataset.xlsx')

# Preprocess the dataset
df['Ayah'] = df['Ayah'].astype(str)
df = df.drop(columns=['Unnamed: 6', 'Topic', 'Keywords'], errors='ignore')
df.columns = df.columns.str.strip()
df = df.dropna()
df['target_text'] = df.apply(lambda x: f"Surah {x['Surah']}, Ayah {x['Ayah']}: {x['Quranic Verse']}", axis=1)
df = df[['Query', 'target_text']]
df = df.rename(columns={'Query': 'input_text'})

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

# Tokenization function
def preprocess_function(examples):
    inputs = [f"Query: {query}" for query in examples['input_text']]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], max_length=256, truncation=True, padding="max_length")
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply preprocessing
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./flan-t5-quran",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save fine-tuned model
model.save_pretrained("./flan-t5-quran-finetuned")
tokenizer.save_pretrained("./flan-t5-quran-finetuned")

# Load model for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Function to generate responses
def generate_response(query, model, tokenizer, device="cuda"):
    input_text = f"Query: {query}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True).to(device)
    outputs = model.generate(input_ids, max_length=256, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example query
query = "What does the Quran say about patience?"
response = generate_response(query, model, tokenizer, device)
print(f"Query: {query}\nResponse: {response}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/727 [00:00<?, ? examples/s]



Map:   0%|          | 0/81 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,14.0457,7.749763
2,4.994,4.35019
3,3.882,3.389367
4,3.2011,2.694226
5,2.7154,2.157993
6,2.3415,1.776042
7,2.0873,1.512559
8,1.9109,1.33691
9,1.7643,1.246789
10,1.7612,1.215257


NameError: name 'torch' is not defined

In [None]:
import torch

In [None]:
# Load model for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Function to generate responses
def generate_response(query, model, tokenizer, device="cuda"):
    input_text = f"Query: {query}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True).to(device)
    outputs = model.generate(input_ids, max_length=256, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example query
query = "What does the Quran say about patience?"
response = generate_response(query, model, tokenizer, device)
print(f"Query: {query}\nResponse: {response}")

Query: What does the Quran say about patience?
Response: ness


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments
)
from datasets import Dataset

# Load the dataset
df = pd.read_excel('/content/Query Dataset.xlsx')

# Convert Ayah to string (if it's not already)
df['Ayah'] = df['Ayah'].astype(str)

# Drop unnecessary columns
df = df.drop(columns=['Unnamed: 6', 'Topic', 'Keywords'], errors='ignore')

# Remove leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Ensure no missing values
df = df.dropna()

# Ensure correct formatting for target text
df['target_text'] = df.apply(lambda x: f"Surah {x['Surah']}, Ayah {x['Ayah']}: {x['Quranic Verse']}", axis=1)

# Keep only required columns
df = df[['Query', 'target_text']]
df = df.rename(columns={'Query': 'input_text'})

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")

# Adjust Tokenization Process
def preprocess_function(examples):
    inputs = [f"Query: {query}" for query in examples['input_text']]

    model_inputs = tokenizer(
        inputs,
        max_length=256,
        truncation=True,
        padding="max_length",
    )

    # Tokenizing targets correctly
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target_text'],
            max_length=256,
            truncation=True,
            padding="max_length",
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply preprocessing
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Load model
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

# Training arguments
training_args = TrainingArguments(
    output_dir="./flan-t5-quran",
    evaluation_strategy="epoch",
    learning_rate=1e-4,  # Increased learning rate
    per_device_train_batch_size=16,  # Increased batch size
    per_device_eval_batch_size=16,
    num_train_epochs=8,  # Increased epochs for better learning
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save fine-tuned model
model.save_pretrained("./flan-t5-quran-finetuned")
tokenizer.save_pretrained("./flan-t5-quran-finetuned")

# Load model for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Function to generate responses
def generate_response(query, model, tokenizer, device="cuda"):
    input_text = f"Query: {query}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True).to(device)

    outputs = model.generate(
        input_ids,
        max_length=256,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example query
query = "What does the Quran say about patience?"
response = generate_response(query, model, tokenizer, device)
print(f"Query: {query}\nResponse: {response}")


Map:   0%|          | 0/727 [00:00<?, ? examples/s]



Map:   0%|          | 0/81 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,6.224,4.559031
2,3.1959,2.473404
3,1.8675,1.063887
4,0.9731,0.548174
5,0.6093,0.384113
6,0.468,0.325087
7,0.4234,0.304793
8,0.4023,0.299305


Query: What does the Quran say about patience?
Response: Surah 2, Ayah 63: And if you are patient, you will be patient.


**Flan-T5 Small was giving results like:
Query: What does the Quran say about patience?
Response: Surah 2, Ayah 63: And if you are patient, you will be patient ****.

.** **Because of this, I had to switch to Flan-T5 Base, but even this is giving only 90% accurate results**

# **Fine tune the flan-t5-base model**

This code fine-tunes the Flan-T5 base model on a Quranic dataset to generate Surah,Ayah and verses based on a query. It preprocesses the dataset, tokenizes inputs, and trains the model using Hugging Face's Trainer. After fine-tuning, it saves the model and provides an inference function to generate Quranic references for given queries

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments
)
from datasets import Dataset

# Load the dataset
df = pd.read_excel('/content/Query Dataset.xlsx')

# Convert Ayah to string (if it's not already)
df['Ayah'] = df['Ayah'].astype(str)

# Drop unnecessary columns
df = df.drop(columns=['Unnamed: 6', 'Topic', 'Keywords'], errors='ignore')

# Remove leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Ensure no missing values
df = df.dropna()

# Ensure correct formatting for target text
df['target_text'] = df.apply(lambda x: f"Surah {x['Surah']}, Ayah {x['Ayah']}: {x['Quranic Verse']}", axis=1)

# Keep only required columns
df = df[['Query', 'target_text']]
df = df.rename(columns={'Query': 'input_text'})

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Convert to Hugging Face dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Adjust Tokenization Process
def preprocess_function(examples):
    inputs = [f"Query: {query}" for query in examples['input_text']]

    model_inputs = tokenizer(
        inputs,
        max_length=256,
        truncation=True,
        padding="max_length",
    )

    # Tokenizing targets correctly
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['target_text'],
            max_length=256,
            truncation=True,
            padding="max_length",
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Apply preprocessing
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Load model
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# Training arguments
training_args = TrainingArguments(
    output_dir="./flan-t5-quran-base",
    evaluation_strategy="epoch",
    learning_rate=1e-4,  # Increased learning rate
    per_device_train_batch_size=8,  # Adjust batch size based on available memory
    per_device_eval_batch_size=8,
    num_train_epochs=8,  # Increased epochs for better learning
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()

# Save fine-tuned model
model.save_pretrained("./flan-t5-quran-base-finetuned")
tokenizer.save_pretrained("./flan-t5-quran-base-finetuned")

# Load model for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Function to generate responses
def generate_response(query, model, tokenizer, device="cuda"):
    input_text = f"Query: {query}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=256, truncation=True).to(device)

    outputs = model.generate(
        input_ids,
        max_length=256,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example query
query = "What does the Quran say about patience?"
response = generate_response(query, model, tokenizer, device)
print(f"Query: {query}\nResponse: {response}")


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Map:   0%|          | 0/727 [00:00<?, ? examples/s]



Map:   0%|          | 0/81 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3335,0.259796
2,0.235,0.202913
3,0.2001,0.174821
4,0.1827,0.158796
5,0.1663,0.149669
6,0.1534,0.144801
7,0.1535,0.143077
8,0.1383,0.142185


Query: What does the Quran say about patience?
Response: Surah 2, Ayah 261: And be patient, for indeed, Allah does not allow to be lost the reward of those who do good.
