<a href="https://colab.research.google.com/github/tararajagopalan/WebsiteQueryingProject/blob/main/GeneratingTextSummaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read sample files and the
# Milvus database file from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Standard library
import pathlib

# Third-party
import httpx
from google import genai
from google.colab import userdata
from google.genai import types

# Read the Gemini API key from the Colab secret named 'GOOGLE_API_KEY'
# (add it under Colab "Secrets" first).
# DO NOT print api_key: doing so would leak it into the notebook output.
api_key = userdata.get('GOOGLE_API_KEY')

# The Gemini API runs on a remote server; the client uses api_key to talk to it.
#   1) authentication: the key tells the server who is calling
#   2) authorization: the key shows the caller is allowed to use these functions
gemini_client = genai.Client(api_key=api_key)

In [None]:
#Function for getVideoSummary for Video Files

def getVideoSummary(VideoFilePath,client):
  """Summarize a video file in three sentences using Gemini.

  The whole file is read into memory and sent inline with the request, so this
  is only meant for small videos (the original note says < 20 MB).

  Args:
    VideoFilePath: Path to the video file on disk.
    client: Gemini client; must expose ``models.generate_content``.

  Returns:
    The ``text`` attribute of the generate_content response.
  """
  import os  # local import so this cell does not depend on a later cell's imports

  # Pick the MIME type from the extension. The folder loop also sends .mov
  # files here, which were previously labelled 'video/mp4'.
  mime_by_extension = {
      '.mp4': 'video/mp4',
      '.mov': 'video/mov',
  }
  extension = os.path.splitext(str(VideoFilePath))[1].lower()
  # Unknown extensions keep the original default.
  mime_type = mime_by_extension.get(extension, 'video/mp4')

  # Context manager closes the file handle as soon as the bytes are read.
  with open(VideoFilePath, 'rb') as video_file:
    video_bytes = video_file.read()

  # One request carrying two parts: the raw video bytes and the instruction.
  response = client.models.generate_content(
    model='models/gemini-2.5-flash',
    contents=types.Content(
        parts=[
            types.Part(
                inline_data=types.Blob(data=video_bytes, mime_type=mime_type)
            ),
            types.Part(text='Please summarize the video in 3 sentences.')
        ]
    )
  )
  return response.text




In [None]:
#Function for getPDFSummary for PDF Files

def getPDFSummary(PDFFilePath,client):
  """Summarize a PDF document using Gemini.

  The PDF is uploaded through the client's File API and the returned file
  handle is passed to the model together with the prompt.

  Args:
    PDFFilePath: Path to the PDF file on disk.
    client: Gemini client; must expose ``files.upload`` and
      ``models.generate_content``.

  Returns:
    The ``text`` attribute of the generate_content response.
  """
  pdf_path = pathlib.Path(PDFFilePath)

  # Upload first; the request then references the uploaded file handle.
  uploaded_file = client.files.upload(file=pdf_path)

  prompt = "Summarize the Document"
  request_contents = [uploaded_file, prompt]

  response = client.models.generate_content(
      model="gemini-2.5-flash",
      contents=request_contents,
  )
  # Only the text portion of the response is returned.
  return response.text




In [None]:
def getImageSummary(ImageFilePath, client, max_words=30):
    """Generate a short caption for an image using Gemini.

    Args:
        ImageFilePath: Path to the image file on disk.
        client: Gemini client; must expose ``models.generate_content``.
        max_words: Word limit requested in the prompt. It is only an
            instruction to the model; the returned text is not truncated here.

    Returns:
        The response text with leading/trailing whitespace stripped.
    """
    import os  # local import so this cell does not depend on a later cell's imports

    # Pick the MIME type from the extension. Callers also send .webp files,
    # which were previously labelled 'image/jpeg'.
    mime_by_extension = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".png": "image/png",
    }
    extension = os.path.splitext(str(ImageFilePath))[1].lower()
    # Unknown extensions keep the original default.
    mime_type = mime_by_extension.get(extension, "image/jpeg")

    with open(ImageFilePath, 'rb') as f:
        image_bytes = f.read()

    prompt = f"Caption this image in under {max_words} words, concise and vivid."

    # Image bytes first, then the prompt that explicitly asks for brevity.
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[
            types.Part.from_bytes(
                data=image_bytes,
                mime_type=mime_type
            ),
            prompt
        ]
    )

    return response.text.strip()


In [None]:
def getTextSummary(txtFilePath, client):
    """Summarize a plain-text file using Gemini.

    Args:
        txtFilePath: Path to the text file on disk.
        client: Gemini client; must expose ``models.generate_content``.

    Returns:
        The ``text`` attribute of the generate_content response.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(txtFilePath, 'r', encoding='utf-8') as f:
        text = f.read()

    prompt = "Summarize the Document"

    # Document text first, then the instruction.
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[
            types.Part(text=text),
            prompt
        ]
    )

    return response.text


In [None]:
#Function of getfileSummary to get the summary for the designated file passed in
#calls other helper functions that were written above

def getFileSummary(FileType, FilePath,client):
  """Dispatch to the summary helper that matches a file extension.

  Args:
    FileType: File extension including the leading dot (e.g. '.pdf').
      Matching is case-insensitive.
    FilePath: Path to the file to summarize.
    client: Gemini client passed through to the helper.

  Returns:
    The helper's summary text, or the string "Invalid File Type" when the
    extension is not supported.
  """
  # Normalize case so '.JPG' and '.jpg' are treated the same.
  extension = str(FileType).lower()

  # '.mov' is included to match the extension handling in the folder loop.
  if extension in ('.mp4', '.mov'):
    return getVideoSummary(FilePath,client)
  if extension == '.pdf':
    return getPDFSummary(FilePath,client)
  if extension in ('.jpg', '.jpeg', '.webp'):
    return getImageSummary(FilePath,client)
  if extension == '.txt':
    return getTextSummary(FilePath,client)
  return "Invalid File Type"

In [None]:
#milvus installations are below:
!pip install -U pymilvus
!pip install --upgrade pymilvus
!pip install "pymilvus[model]"

Collecting pymilvus
  Downloading pymilvus-2.6.1-py3-none-any.whl.metadata (6.5 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (9.4 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Downloading pymilvus-2.6.1-py3-none-any.whl (254 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.3/254.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl (55.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.3/55.3 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ujson-5.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson, milvus-

In [None]:
from pymilvus import model

In [None]:
# Vector database setup: the Milvus client is pointed at the project_work.db
# file on the mounted Google Drive, so the data lives on Drive.
# NOTE(review): no cell visible in this notebook creates `collection_name`;
# the insert calls further down presumably rely on it already existing in
# project_work.db — confirm, or add a creation step.

# Declare the client here and connect it to the project_work database.
from pymilvus import MilvusClient

milvus_client= MilvusClient("/content/drive/MyDrive/project_work.db")

# Name of the collection used by the insert cells below.
collection_name = "project_collection"

In [None]:
import os
import time
def getInformationForEachFile(folderPath,gemini_client,milvus_client,delay_seconds=5):
  """Summarize and embed every supported file directly inside ``folderPath``.

  Sub-directories are reported but not descended into. Files whose extension
  is not supported are skipped, so no placeholder records are produced.

  Args:
    folderPath: Directory whose files should be processed.
    gemini_client: Gemini client passed to the summary helpers.
    milvus_client: Unused here; kept so existing callers keep working.
    delay_seconds: Pause after each summarized file (default 5). Pass 0 to
      disable the pause.

  Returns:
    A list of dicts with the keys "text" (summary), "embedding" (vector for
    the summary), "FilePath" (path after the first "Static/" component, or
    the full path when that component is absent) and "FileExtension"
    (lower-cased extension).
  """
  # Extension -> summary helper. Extensions are lower-cased before lookup.
  summarizers = {
      ".jpeg": getImageSummary,
      ".webp": getImageSummary,
      ".jpg": getImageSummary,
      ".pdf": getPDFSummary,
      ".mp4": getVideoSummary,
      ".mov": getVideoSummary,
      ".txt": getTextSummary,
  }

  # Built once on first use and reused for every file, instead of being
  # rebuilt inside the loop.
  embedding_fn = None

  records = []

  for item_name in os.listdir(folderPath):
    # Full path to the item (file or folder).
    item_path = os.path.join(folderPath, item_name)
    print("Processing: " + item_path)

    if os.path.isdir(item_path):
      print(f"Directory: {item_name}")
      continue
    if not os.path.isfile(item_path):
      continue

    ext = os.path.splitext(item_name)[1].lower()
    summarize = summarizers.get(ext)
    if summarize is None:
      # Do not embed/store a placeholder summary for unsupported files.
      print(f"Skipping unsupported file type: {item_name}")
      continue

    summary = summarize(item_path, gemini_client)

    if embedding_fn is None:
      embedding_fn = model.DefaultEmbeddingFunction()
    vectors = embedding_fn.encode_documents([summary])

    # Store the path relative to the "Static/" folder; keep the full path
    # when the marker is missing rather than raising IndexError.
    _, marker, tail = item_path.partition("Static/")
    stored_path = tail if marker else item_path

    records.append({"text":summary,"embedding":vectors[0], "FilePath":stored_path, "FileExtension":ext})

    # Pause only after files that were actually summarized.
    if delay_seconds:
      time.sleep(delay_seconds)
      print(f"{delay_seconds} seconds have passed.")

  return records

In [None]:
#function to insert summaries into the client collection
def insertDataIntoCollection(folderpath, gemini_client, milvus_client, collection_name):
  """Summarize the files in ``folderpath`` and insert them into a collection.

  Args:
    folderpath: Directory whose files should be summarized and stored.
    gemini_client: Gemini client used to generate the summaries.
    milvus_client: Milvus client; must expose ``insert``.
    collection_name: Name of the collection to insert into.

  Returns:
    None. The insert result is printed.
  """
  # One record per file: summary text, embedding, file path, file extension.
  data = getInformationForEachFile(folderpath,gemini_client,milvus_client)

  # Nothing to store: skip the insert call instead of sending an empty list.
  if not data:
    print(f"No records produced for {folderpath}; nothing inserted.")
    return

  # Print a short overview rather than the records themselves, which would
  # dump every full embedding vector into the cell output.
  print(f"Inserting {len(data)} record(s) into '{collection_name}':")
  for record in data:
    print(f"  {record.get('FilePath')} ({record.get('FileExtension')})")

  # Insert the records into the project collection.
  res = milvus_client.insert(collection_name=collection_name,data=data)
  print(res)


In [None]:
# Test insertDataIntoCollection on the image samples folder.
# Status noted by the author: works.

folderPath = '/content/drive/My Drive/Flask/Static/Images/Samples'
insertDataIntoCollection(folderPath,gemini_client,milvus_client,collection_name)

Processing: /content/drive/My Drive/Flask/Static/Images/Samples/waterfall.jpg


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/46.9M [00:00<?, ?B/s]

5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/Images/Samples/unicorn.webp
5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/Images/Samples/gymnast.jpeg
5 seconds have passed.
[{'text': 'Lush tropical forest frames a multi-tiered waterfall, its silky cascades flowing into a mesmerizing turquoise pool under dappled sunlight.', 'embedding': array([ 2.40950066e-02,  3.76025196e-02,  1.66131178e-02, -5.66196761e-02,
        2.70159096e-02, -1.49536412e-02,  4.50093862e-02, -5.39333461e-03,
       -1.62671305e-02, -2.03230173e-02, -3.07295709e-02, -1.65558138e-02,
        2.54192649e-02, -2.24423175e-02,  3.76957449e-02, -5.18630915e-02,
        4.90700145e-02,  4.18782152e-02, -2.49525841e-02, -1.19059918e-02,
       -3.42036217e-02,  3.63988817e-02,  1.87677331e-03, -2.85252024e-02,
        1.91415540e-03,  2.59623398e-02, -1.05928110e-02, -1.23879597e-01,
        2.84068005e-02, -1.00455689e-03, -3.83974991e-02, -6.82918360e-03,
        5.40

In [None]:
# Test insertDataIntoCollection on the PDF samples folder.
# Status noted by the author: works.

folderPathPDF = '/content/drive/My Drive/Flask/Static/PDF/Samples'
insertDataIntoCollection(folderPathPDF,gemini_client,milvus_client,collection_name)

Processing: /content/drive/My Drive/Flask/Static/PDF/Samples/EasyRecipes.pdf
5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/PDF/Samples/DogTrainingBasics.pdf
5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/PDF/Samples/ClayMoldMaking.pdf
5 seconds have passed.
[{'text': 'This document is a collection of "Quick and easy recipes" specifically curated to encourage students and friends to cook at home, emphasizing that it\'s a much cheaper and healthier alternative to eating out, especially in places like Geneva.\n\nThe document features:\n*   An introductory section that motivates readers to start cooking, highlighting the financial benefits and the social aspect of cooking for roommates and friends.\n*   An index listing 14 distinct recipes, ranging from international dishes like Chili con Carne, Pad Thai Chicken, and Thai Green Curry, to simpler meals such as Omelette, Basic Pasta, and various salads.\n*   Individual pages for each recipe,

In [None]:
# Test insertDataIntoCollection on the video samples folder.
# Status was recorded as PENDING, but the saved output of this cell shows the
# run reaching the printed records — TODO confirm and update the status.
folderPathVideos = '/content/drive/My Drive/Flask/Static/Videos/Samples'
insertDataIntoCollection(folderPathVideos,gemini_client,milvus_client,collection_name)

Processing: /content/drive/My Drive/Flask/Static/Videos/Samples/dance.mp4
5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/Videos/Samples/video.mp4
5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/Videos/Samples/House.mov
5 seconds have passed.
[{'text': 'The video features a young woman dancing outdoors, dressed in a black crop top with red stripes and black pants. She performs a series of fluid movements, turning and extending her arms. Her dance appears to be a blend of contemporary and hip-hop styles, set against the backdrop of a modern building and green trees.', 'embedding': array([ 0.07573756,  0.07831535, -0.03791755, -0.02734793,  0.04397979,
        0.00270617, -0.03425968, -0.02049076, -0.00242509,  0.02278836,
        0.06083866,  0.01924427, -0.03030838,  0.00335051,  0.01453841,
       -0.02840339,  0.02479852,  0.03000056,  0.0009456 ,  0.06231212,
       -0.04985516,  0.00202609, -0.03935996,  0.04348802,  0.01787262,
     

In [None]:
# Test insertDataIntoCollection on the text samples folder.
# Status noted by the author: works.
folderPathText = '/content/drive/My Drive/Flask/Static/Text/Samples'
insertDataIntoCollection(folderPathText,gemini_client,milvus_client,collection_name)

Processing: /content/drive/My Drive/Flask/Static/Text/Samples/balck_panther.txt
5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/Text/Samples/peter_rabbit.txt
5 seconds have passed.
Processing: /content/drive/My Drive/Flask/Static/Text/Samples/trouble_is_my_business.txt
5 seconds have passed.
[{'text': "After the events of *Captain America: Civil War*, King T'Challa returns to Wakanda to assume his role as the nation's new leader. However, he faces internal challenges for the throne and soon discovers two foes are conspiring to destroy Wakanda. As the Black Panther, he must team up with C.I.A. agent Everett K. Ross and the Dora Milaje (Wakandan special forces) to protect his country and prevent a world war.", 'embedding': array([-3.16361116e-02,  2.71868996e-02,  1.13317275e-02, -4.63726556e-02,
        3.12019521e-02,  3.60783849e-02,  4.08229731e-03, -5.72595737e-02,
       -4.65935217e-02, -5.62808367e-03, -1.27657396e-02,  2.94564904e-02,
        5.10595847e-