In [103]:
import os
from dotenv import load_dotenv

import textwrap
import numpy as np
import pandas as pd

import google.ai.generativelanguage as glm
import google.generativeai as genai

from IPython.display import Markdown

In [104]:
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
# print(GOOGLE_API_KEY)
genai.configure(api_key=GOOGLE_API_KEY)

In [105]:
title = "The next generation of AI for developers and Google Workspace"
sample_text = ("Title: The next generation of AI for developers and Google Workspace"
    "\n"
    "Full article:\n"
    "\n"
    "Gemini API & Google AI Studio: An approachable way to explore and prototype with generative AI applications")

model = 'models/embedding-001'
embedding = genai.embed_content(model=model,
                                content=sample_text,
                                task_type="retrieval_document",
                                title=title)

print(embedding)

{'embedding': [0.03411343, -0.05517662, -0.020209055, -0.0041249567, 0.058917783, 0.014129515, 0.0045353593, 0.0014303668, 0.05976634, 0.08292115, 0.007162964, 0.0069041685, -0.053083427, -0.010905125, 0.0321402, -0.037163995, 0.050372455, 0.019348344, -0.037328612, 0.026647927, 0.030781753, -0.011288501, -0.031485256, -0.060248993, -0.026219442, -0.009794235, 0.006630139, -0.01846516, -0.026324715, 0.020442624, -0.06317684, 0.014559574, -0.052296035, 0.016451128, -9.720217e-05, -0.051706687, -0.0054406044, -0.056967627, 0.011144145, -0.009201792, -0.0021951047, -0.1099701, -0.011712193, 0.021221714, 0.009171804, -0.029621972, 0.034534883, 0.039578073, 0.019021519, -0.06269169, 0.039473332, 0.052403256, 0.061814185, -0.034507945, -0.009557816, -0.0049551064, 0.017839009, -0.021176832, 0.015043588, 0.015390569, -0.006334281, 0.043696404, -0.028341983, 0.028433999, 0.01472686, -0.06585564, -0.044533554, 0.0055523133, 0.035775978, 0.031099156, 0.027357662, 0.028062241, 0.056972917, -0.054

In [106]:
from pdfminer.high_level import extract_text

text = extract_text(r"E:\project_24\Questify - ragchain document analysis\source_documents\microsoft_annual_report_2022.pdf")
# split after every 5 sentences
sentences = text.split(".")
sentences = [" ".join(sentences[i:i + 5]).replace('\n', ' ').strip() for i in range(0, len(sentences), 10)]
for i in sentences:
    print(i)


Dear shareholders, colleagues, customers, and partners:    We are living through a period of historic economic, societal, and geopolitical change  The world in 2022 looks nothing like  the  world  in  2019   As  I  write  this,  inflation  is  at  a  40-year  high,  supply  chains  are  stretched,  and  the  war  in  Ukraine  is  ongoing   At  the  same  time,  we  are  entering  a  technological  era  with  the  potential  to  power  awesome  advancements  across  every  sector  of  our  economy  and  society   As  the  world’s  largest  software  company,  this  places  us  at  a  historic  intersection of opportunity and responsibility to the world around us
•  Peace Parks Foundation, a nonprofit helping protect natural ecosystems in Southern Africa, is using Microsoft  Dynamics 365 and Power BI to secure essential funding, as well as our Azure AI and IoT solutions to help  rangers scale their park maintenance and wildlife crime prevention work     •  One of the world’s largest robo

In [107]:

documents = []
for idx, i in enumerate(sentences):
    documents.append({
        'title': f"Document {idx}",
        'content': i
    })
documents

[{'title': 'Document 0',
  'content': 'Dear shareholders, colleagues, customers, and partners:    We are living through a period of historic economic, societal, and geopolitical change  The world in 2022 looks nothing like  the  world  in  2019   As  I  write  this,  inflation  is  at  a  40-year  high,  supply  chains  are  stretched,  and  the  war  in  Ukraine  is  ongoing   At  the  same  time,  we  are  entering  a  technological  era  with  the  potential  to  power  awesome  advancements  across  every  sector  of  our  economy  and  society   As  the  world’s  largest  software  company,  this  places  us  at  a  historic  intersection of opportunity and responsibility to the world around us'},
 {'title': 'Document 1',
  'content': '•  Peace Parks Foundation, a nonprofit helping protect natural ecosystems in Southern Africa, is using Microsoft  Dynamics 365 and Power BI to secure essential funding, as well as our Azure AI and IoT solutions to help  rangers scale their park main

In [108]:
df = pd.DataFrame(documents)
df.columns = ['Title', 'Text']
df

Unnamed: 0,Title,Text
0,Document 0,"Dear shareholders, colleagues, customers, and ..."
1,Document 1,"• Peace Parks Foundation, a nonprofit helping..."
2,Document 2,There is no more powerful input than digital ...
3,Document 3,"To help address this, we’ve committed to skill..."
4,Document 4,Building on our work in eight US cities...
...,...,...
190,Document 190,Compensation Committee 3 Governance and No...
191,Document 191,m to 5:00 p m Pacific Time to answer ...
192,Document 192,com Our mailing address is: Investor Rela...
193,Document 193,You can e-mail the transfer agent at: web qu...


In [109]:

# Get the embeddings of each text and add to an embeddings column in the dataframe
def embed_fn(title, text):
    return genai.embed_content(
            model=model,
            content=text,
            task_type="retrieval_document",
            title=title
        )["embedding"]

df['Embeddings'] = df.apply(lambda row: embed_fn(row['Title'], row['Text']), axis=1)
df

Unnamed: 0,Title,Text,Embeddings
0,Document 0,"Dear shareholders, colleagues, customers, and ...","[0.011256746, -0.0029021315, -0.029392434, 0.0..."
1,Document 1,"• Peace Parks Foundation, a nonprofit helping...","[0.014037977, -0.020081276, -0.032478932, 0.01..."
2,Document 2,There is no more powerful input than digital ...,"[0.045135427, -0.043423183, -0.033143293, 0.02..."
3,Document 3,"To help address this, we’ve committed to skill...","[0.035125438, -0.001972038, -0.05628028, -0.03..."
4,Document 4,Building on our work in eight US cities...,"[0.049262702, -0.038013034, -0.044156432, 0.02..."
...,...,...,...
190,Document 190,Compensation Committee 3 Governance and No...,"[0.013496382, -0.012596303, -0.038715817, -0.0..."
191,Document 191,m to 5:00 p m Pacific Time to answer ...,"[0.0030898922, -0.023069674, -0.024056338, -0...."
192,Document 192,com Our mailing address is: Investor Rela...,"[-0.010960111, -0.015149452, -0.029125702, -0...."
193,Document 193,You can e-mail the transfer agent at: web qu...,"[0.027035734, -0.040730845, -0.054808687, -0.0..."


In [117]:
query = "Responsibity at microsoft"
model = 'models/embedding-001'

request = genai.embed_content(
            model=model,
            content=query,
            task_type="retrieval_query"
        )
request

{'embedding': [-0.0063783787,
  -0.049423806,
  0.01714772,
  0.005949993,
  0.08749973,
  0.017255899,
  0.0025536013,
  -0.015186562,
  0.014139231,
  0.00030590838,
  0.022753365,
  0.008589634,
  -0.03704904,
  -0.021229839,
  0.042823378,
  -0.06417158,
  0.03915926,
  -0.032416187,
  -0.005359891,
  -0.017482359,
  0.011968497,
  0.009087496,
  -0.007511241,
  -0.028694699,
  0.03440794,
  -0.019034235,
  0.0084989425,
  -0.13119876,
  -0.010243938,
  0.06733665,
  -0.026992453,
  0.047242824,
  -0.025630435,
  0.027072959,
  0.016269362,
  0.0075809555,
  -0.019721113,
  -0.008568332,
  -0.020895299,
  0.04750047,
  -0.015357976,
  -0.025843618,
  -0.051981688,
  -0.013122869,
  0.05727723,
  -0.0135542685,
  0.012134197,
  0.04128019,
  0.015832156,
  -0.023008138,
  0.029366786,
  0.026330762,
  0.028975848,
  -0.020295165,
  -0.039218705,
  -0.03090284,
  0.010300878,
  0.036091954,
  0.00577039,
  -0.018480243,
  0.015998913,
  0.021205297,
  -0.010627862,
  0.03150227,
  -0

In [118]:
def find_best_passage(query, dataframe):
    """
    Compute the distances between the query and each document in the dataframe
    using the dot product.
    """
    query_embedding = genai.embed_content(model=model,
                            content=query,
                            task_type="retrieval_query")
    dot_products = np.dot(np.stack(dataframe['Embeddings']), query_embedding["embedding"])
    idx = np.argmax(dot_products)
    return dataframe.iloc[idx]['Text'] # Return text from index with max value

In [119]:
passage = find_best_passage(query, df)
passage

'To  meet  the  expectations  of  our  stakeholders  and  to  and  maintain  their  trust,  we  are  committed  to  conducting  our  business  in  ways  that  are  principled,  transparent,  and  accountable  and  we  have  made  a  broad  range  of  environmental  and  social  commitments   From  our  CEO  and  Senior  Leadership  Team  and  throughout  our  organization,  people  at  Microsoft  are  working to conduct our business in principled ways that make  a  significant  positive  impact  on  important  global  issues   Microsoft’s Board of Directors provides insight, feedback, and  oversight  across  a  broad  range  of  environmental  and  social  matters   In  particular,  among  the  responsibilities  of  the  Board’s  Environmental,  Social,  and  Public  Policy  Committee  is  to  review  and  provide  guidance  to  the  Board  and  management  about  the  Company’s  policies  and  programs  that relate to corporate social responsibility     For  more  about  Microsoft’s  

In [120]:
def make_prompt(query, relevant_passage):
    escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
    prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and converstional tone. \
    If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'
    ANSWER:
    """).format(query=query, relevant_passage=escaped)

    return prompt

In [121]:
prompt = make_prompt(query, passage)
print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.     Be sure to respond in a complete sentence, being comprehensive, including all relevant background information.     However, you are talking to a non-technical audience, so be sure to break down complicated concepts and     strike a friendly and converstional tone.     If the passage is irrelevant to the answer, you may ignore it.
    QUESTION: 'Responsibity at microsoft'
    PASSAGE: 'To  meet  the  expectations  of  our  stakeholders  and  to  and  maintain  their  trust,  we  are  committed  to  conducting  our  business  in  ways  that  are  principled,  transparent,  and  accountable  and  we  have  made  a  broad  range  of  environmental  and  social  commitments   From  our  CEO  and  Senior  Leadership  Team  and  throughout  our  organization,  people  at  Microsoft  are  working to conduct our business in principled ways that make  a  significant  positive  i

In [122]:

# for m in genai.list_models():
#     if 'generateContent' in m.supported_generation_methods:
#         print(m.name)

In [123]:
model = genai.GenerativeModel('models/gemini-1.5-flash-latest')
answer = model.generate_content(prompt)
Markdown(answer.text)

Microsoft is committed to being responsible and doing business in a way that is ethical, transparent, and accountable. They take environmental and social issues seriously, and the company’s Board of Directors has a special committee focused on these issues. This committee reviews Microsoft’s policies and programs to ensure they are aligned with their corporate social responsibility goals. 
