<a href="https://colab.research.google.com/github/yasararafath007/Fine-tuned-Language-Model-for-Intelligent-Text-Generation-with-Deduplication/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradientai --upgrade

Collecting gradientai
  Downloading gradientai-1.7.0-py3-none-any.whl (270 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 270.4/270.4 kB 5.8 MB/s eta 0:00:00
Collecting aenum>=3.1.11 (from gradientai)
  Downloading aenum-3.1.15-py3-none-any.whl (137 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 137.6/137.6 kB 16.5 MB/s eta 0:00:00
Collecting pydantic<2.0.0,>=1.10.5 (from gradientai)
  Downloading pydantic-1.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: aenum, pydantic, gradientai
  Attempting uninstall: pydantic
    Found existing installation: pydantic 2.6.3
    Uninstalling pydantic-2.6.3:
      Successfully uninstalled pydantic-2.6.3
Successfully installed aenum-3.1.15 gradientai-1.7.0 pydantic-1.10.14


In [None]:
import os
# Credentials are taken from the environment when already set, and otherwise
# prompted for at run time, so that no secret is stored in the notebook source.
# NOTE(review): the access token that used to be hard-coded here was committed
# in plain text and should be treated as compromised -- revoke and rotate it.
from getpass import getpass

if not os.environ.get('GRADIENT_ACCESS_TOKEN'):
    # getpass keeps the token out of the rendered cell output.
    os.environ['GRADIENT_ACCESS_TOKEN'] = getpass("Gradient access token: ")
if not os.environ.get('GRADIENT_WORKSPACE_ID'):
    os.environ['GRADIENT_WORKSPACE_ID'] = input("Gradient workspace ID: ")

In [None]:
from gradientai import Gradient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def perform_cosine_similarity_deduplication(responses: list[str], threshold: float = 0.95) -> list[str]:
    """Drop near-duplicate responses, keeping the first of each similar group.

    Responses are vectorized with TF-IDF and compared pairwise by cosine
    similarity. A response is kept only if its similarity to every response
    already kept is strictly below ``threshold``.

    Args:
        responses: Texts to deduplicate, in priority order (earlier wins).
        threshold: Similarity at or above which two responses count as
            duplicates of each other.

    Returns:
        A new list with the surviving responses in their original order.
        It is empty only when ``responses`` is empty.
    """
    responses = list(responses)

    # With fewer than two responses there is nothing to compare, and
    # TfidfVectorizer would raise on an empty corpus.
    if len(responses) < 2:
        return responses

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(responses)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Greedy pass: compare each response only against the ones already kept,
    # so one representative of every near-duplicate group always survives
    # (comparing against *all* others would discard the whole group).
    kept_indices = []
    for i in range(len(responses)):
        if all(similarity_matrix[i, j] < threshold for j in kept_indices):
            kept_indices.append(i)

    return [responses[i] for i in kept_indices]

def main():
    """Fine-tune a temporary adapter on iPhone Q&A samples and compare outputs.

    Creates a model adapter on the "nous-hermes2" base model, prints the
    completion for a sample prompt before and after fine-tuning, and, when the
    fine-tuned completion is long enough, prints a deduplicated final
    response. The adapter is deleted on the way out, including when any step
    in between raises.
    """
    with Gradient() as gradient:
        base_model = gradient.get_base_model(base_model_slug="nous-hermes2")

        new_model_adapter = base_model.create_model_adapter(
            name="test model 3"
        )
        # Everything after creation runs under try/finally so a failed
        # completion or fine-tune call does not leave the adapter behind.
        try:
            print(f"Created model adapter with id {new_model_adapter.id}")
            sample_query = "### Instruction: What are the key features of the latest iPhone? \n\n### Response:"
            print(f"Asking: {sample_query}")

            # before fine-tuning
            completion_before = new_model_adapter.complete(query=sample_query, max_generated_token_count=100).generated_output
            print(f"Generated (before fine-tune): {completion_before}")

            samples = [
                { "inputs": "### Instruction: What are the key features of the latest iPhone? \n\n### Response: The latest iPhone comes with a high-resolution camera, Face ID, and a powerful A15 Bionic chip" },
                { "inputs": "### Instruction: List the specifications of the newest iPhone model. \n\n### Response: The newest iPhone boasts an advanced camera system, facial recognition technology, and a speedy A15 Bionic chip" },
                { "inputs": "### Instruction: Can you provide details about the features of the most recent iPhone release? \n\n### Response: The most recent iPhone release includes a top-notch camera, Face ID for enhanced security, and the latest A15 Bionic chip for improved performance" },
            ]

            # fine-tuning the model: the same samples are submitted once per epoch
            num_epochs = 3
            for epoch in range(num_epochs):
                print(f"Fine-tuning the model, iteration {epoch + 1}")
                new_model_adapter.fine_tune(samples=samples)

            # after fine-tuning
            completion_after = new_model_adapter.complete(query=sample_query, max_generated_token_count=100).generated_output
            print(f"Generated (after fine-tune): {completion_after}")

            # The length check counts whitespace-separated words, not model tokens.
            # NOTE(review): generation is capped at 100 tokens above, so a
            # 100-word response is presumably very rare -- confirm the
            # intended threshold.
            if len(completion_after.split()) >= 100:
                # Deduplicate the before/after completions by cosine similarity.
                responses = [completion_before, completion_after]
                deduplicated_responses = perform_cosine_similarity_deduplication(responses, threshold=0.95)
                # The first surviving response is taken as the final one; fall
                # back to the fine-tuned completion if nothing survived.
                final_response = deduplicated_responses[0] if deduplicated_responses else completion_after
                print(f"Generated (after fine-tune and deduplication): {final_response}")
            else:
                print("Generated response does not meet the token count criteria.")
        finally:
            new_model_adapter.delete()

# Run the demo only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Created model adapter with id 6799c577-93f4-4959-a999-8452cb444116_model_adapter
Asking: ### Instruction: What are the key features of the latest iPhone? 

### Response:
Generated (before fine-tune):  The latest iPhone model, the iPhone 12, features a 6.1-inch Super Retina XDR display, A14 Bionic chip, 5G connectivity, a Ceramic Shield front cover for improved durability, and a dual-camera system with Night mode. It also has Face ID for secure authentication and comes in various colors.
Fine-tuning the model, iteration 1
Fine-tuning the model, iteration 2
Fine-tuning the model, iteration 3
Generated (after fine-tune):  The latest iPhone comes with a top-notch camera, facial recognition technology, and a speedy A15 Bionic chip.
Generated response does not meet the token count criteria.
