In [1]:
from langchain.llms import AzureOpenAI
import openai
from dotenv import load_dotenv
import os
from IPython.display import display, HTML
import json
import sklearn

In [2]:
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

OPENAI_DAVINCI_DEPLOYMENT_NAME = os.getenv("OPENAI_DAVINCI_DEPLOYMENT_NAME")
OPENAI_DAVINCI_MODEL_NAME = os.getenv("OPENAI_DAVINCI_MODEL_NAME")

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

### **Model Initialization**

In [3]:
def init_llm(model=OPENAI_MODEL_NAME,
             deployment_name=OPENAI_DEPLOYMENT_NAME, 
             temperature=0,
             max_tokens=3000,
             stop="<|im_end|>", 
             ):
    
    llm = AzureOpenAI(deployment_name=deployment_name,  
                  model=model,
                  temperature=temperature,) 
    return llm

In [4]:
#Loaders
from langchain.schema import Document

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Model
from langchain.chat_models import ChatOpenAI

# Embedding Support
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Summarizer we'll use for Map Reduce
from langchain.chains.summarize import load_summarize_chain

# Data Science
import numpy as np
from sklearn.cluster import KMeans

In [5]:
llm=init_llm()
embeddings = OpenAIEmbeddings(model=OPENAI_ADA_EMBEDDING_MODEL_NAME)

In [6]:
from langchain.document_loaders import PyPDFLoader

large_pdf_path ="./data/NeurIPS-2020-language-models-are-few-shot-learners-Paper.pdf"
loader = PyPDFLoader(large_pdf_path)

text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", "\t"], chunk_size=10000, chunk_overlap=3000)
pages = loader.load_and_split(text_splitter=text_splitter)

document_vectors=[]
for page in pages:
    document_vectors.append(embeddings.embed_documents([page.page_content], chunk_size=1)[0] )     

In [7]:
print("Document vectors shape: ", np.array(document_vectors).shape)

Document vectors shape:  (25, 1536)


In [8]:
print(document_vectors)

[[-0.012792802641945019, -0.0020704323326247945, 0.03693870520328509, -0.009268954560108558, 0.008645082976763786, 0.026586544238653066, 0.005666265829985531, 0.021074531972444195, -0.031892881361202684, -0.054818463549547565, 0.03515621336860416, 0.03194772881123656, -0.02751892294955116, -0.004795587240746471, 0.021622991571621463, 0.016549745867167297, 0.021540722259215838, 0.01023561339293912, 0.010859485907606483, -0.0049052788811851464, -0.006571222678511874, 0.005765673806373503, -0.015425405737763603, -0.005131518093316733, 0.01244316085818888, 0.020841436829058375, 0.032934953109523345, -0.02070432192926406, 0.004151147863638997, -0.0003372165143034164, 0.011826144740437045, -0.010263036186633465, -0.0012991619526043555, -0.001064352942820301, -0.002108138790369843, -0.022185162101984606, 0.012676256001574699, -0.008418843298970904, 0.011236551416379558, -0.006680914784611846, 0.019936479980532034, 0.03806304626401138, 0.01053041052062916, 0.0005694545789709641, 0.005573713319

In [9]:
num_clusters = 8

# Perform K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(document_vectors)



In [10]:
kmeans.labels_

array([1, 1, 1, 4, 4, 4, 4, 1, 1, 1, 3, 3, 3, 1, 5, 5, 5, 7, 0, 2, 2, 2,
       2, 6, 6])

In [11]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Taking out the warnings
import warnings
from warnings import simplefilter

# Filter out FutureWarnings
simplefilter(action='ignore', category=FutureWarning)

# Perform t-SNE and reduce to 2 dimensions
tsne = TSNE(n_components=2, random_state=42)
reduced_data_tsne = tsne.fit_transform(np.array(document_vectors))

# Plot the reduced data
plt.scatter(reduced_data_tsne[:, 0], reduced_data_tsne[:, 1], c=kmeans.labels_)
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('Document Embeddings Clustered')
plt.show()

ValueError: perplexity must be less than n_samples

In [59]:
# Find the closest embeddings to the centroids

# Create an empty list for closest points
closest_indices = []

# Loop through the number of clusters 
for i in range(num_clusters):
    
    # Get the list of distances from that particular cluster center
    distances = np.linalg.norm(document_vectors - kmeans.cluster_centers_[i], axis=1)
    
    # Find the list position of the closest one (using argmin to find the smallest distance)
    closest_index = np.argmin(distances)
    
    # Append that position to your closest indices list
    closest_indices.append(closest_index)

In [75]:
selected_indices = sorted(closest_indices)
selected_indices

[0, 4, 7, 13, 17, 21, 24, 28]

In [83]:
print(len(pages))
#for selectedidx in selected_indices:
#    print (pages[selectedidx].page_content)

#pages

#selected_pages = [pages[pageidx] for pageidx in selected_indices]
#print (selected_pages)
#print (len(selected_pages))

#selected_pages_content = ""
#selected_pages_content += [page.page_content for page in selected_pages]

25


In [68]:
#create summarization chain
summary_chain = load_summarize_chain(llm=llm, chain_type='stuff',verbose=False )

In [69]:
#run summarization chain on selected pages
sum = summary_chain.run(selected_pages)

display(HTML(sum))

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 5952 tokens (5696 in your prompt; 256 for the completion). Please reduce your prompt; or completion length.