In [1]:
import pandas as pd
import os
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from nltk.tokenize import sent_tokenize
import faiss

from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain.embeddings import AlephAlphaAsymmetricSemanticEmbedding
from langchain.embeddings import GPT4AllEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [17]:
df = pd.read_csv("../data/genai_poc/processed/text_extracted_sectional_data_25_10_2023.csv")

In [18]:
df

Unnamed: 0,doc_id,page,section_id,sections
0,0,1,0,Abstract: The fracture behavior of the Cu/Sn-3...
1,0,1,5,The failure of a solder joint is a problem tha...
2,0,2,0,in their mechanical properties being different...
3,0,2,1,Existing studies have conducted extensive nume...
4,0,2,2,"Currently, many related studies have been carr..."
...,...,...,...,...
519,6,10,4,Figure 15. The Coffin Manson model of SAC305 s...
520,6,11,0,Figure 16. The Coffin Manson equation at diffe...
521,6,11,2,Figure 17. The prediction models for the coffi...
522,6,11,3,when the Arrhenius model is applied. By utiliz...


In [3]:
class Embedding:
    """Thin wrapper exposing a LangChain-style embedding model through two helpers.

    The wrapped object must provide ``embed_query`` and ``embed_documents``;
    both are invoked as-is, with no pre- or post-processing of inputs or outputs.
    """

    def __init__(self, embedding_model = None):
        """Keep a reference to the embedding model.

        Raises:
            ValueError: if ``embedding_model`` is ``None``.
        """
        if embedding_model is None:
            raise ValueError("Please pass a valid Langchain embedding model ")

        self.embedding_model = embedding_model

    def create_embedding_text(self, text: str = None):
        """Return whatever ``embed_query`` produces for a single piece of text.

        Raises:
            ValueError: if ``text`` is ``None``.
        """
        if text is None:
            raise ValueError("Please provide text for which you need to create embedding")

        return self.embedding_model.embed_query(text)

    def create_embedding_document(self, list_of_sentences: list = None):
        """Return whatever ``embed_documents`` produces for a list of sentences.

        Raises:
            ValueError: if ``list_of_sentences`` is ``None``.
        """
        if list_of_sentences is None:
            raise ValueError("Please provide a valid list of sentences for which you need to create embedding")

        return self.embedding_model.embed_documents(list_of_sentences)

In [7]:
from langchain.document_loaders import TextLoader, PDFPlumberLoader

In [9]:
loader = PDFPlumberLoader('../data/genai_poc/raw/coefficient extraction of sac305 using equation informed NNs.pdf')
documents = loader.load()

In [10]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [16]:
print(docs[1].page_content)

Materials2023,16,4922 2of16
Chen[16],Deshpandeetal.[17],Wangetal.[5],andYanetal.[18]forlifepredictionusing
Manson–Coffintypeequations.
MaandSuhlingreviewedtheconstitutiveequationandthecorrespondingcoefficients
oflead-freesolderjoint[19],andsignificantcoefficientdiscrepancieshavebeenreported.
Ontheotherhand,finiteelementengineersfrequentlyfacedifficultiesinselectinganappro-
priatematerialmodelanditsparameters,asthemeasurementconditionsmaydifferfrom
thoseinpracticalapplications. Kuczynskaetal.[20]performedmechanical/dynamictests
againstthesolderjointtoverifytheabilityofthesematerialmodelsandtheircoefficients
tomapthelifetimedifferencesdependingonthetemperaturerateunderfieldandtesting
conditions,aswellasonthemeanoperatingtemperature.
Consideringthemanyapplicationscenarios,whichmayrangefromlowtohightem-
peraturesandstrainrates,anemergingtrendencouragesuserstoobtaintheirownmaterial
coefficients[5]. Thisapproachemphasizestheimportanceoftailoringthecoefficientstothe
specificconditionsencounte

In [126]:
embedding_function = SentenceTransformerEmbeddings(model_name="distilbert-base-nli-stsb-mean-tokens")

In [24]:
# Build a Chroma vector store from the split `docs`, embedded with `embedding_function`.
# NOTE(review): the recorded output of the similarity_search cell below lists documents
# whose metadata source is state_of_the_union.txt, not the PDF loaded above, and this
# cell's execution count (24) is lower than that of the embedding_function cell (126).
# The store was presumably built from different kernel state -- re-run from a fresh
# kernel to confirm the results.
db = Chroma.from_documents(docs, embedding_function)

In [26]:
db.similarity_search("stress distribution around the defects")

[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../data/genai_poc/raw/state_of_the_union.txt'}),
 Document(page_content='A former top litigator in private practice. A former federal public defender. And from

## LangChain-Independent Method

In [105]:
df = pd.read_csv("../data/genai_poc/processed/text_extracted_sectional_data_25_10_2023.csv")

In [106]:
embeddings = HuggingFaceEmbeddings(model_name="distilbert-base-nli-stsb-mean-tokens")

In [107]:
embed = Embedding(embedding_model=embeddings)

In [108]:
# Embed every section in one batched call instead of invoking the model once per row.
# The column still holds one vector per section, aligned on df's index.
# NOTE(review): assumes the model's embed_documents returns the same vector as
# embed_query for a given text (up to float round-off) -- confirm for the chosen model.
df['embeddings'] = pd.Series(
    embed.create_embedding_document(df['sections'].tolist()),
    index=df.index,
)

In [109]:
dff = df.drop(['sections'],axis=1)

In [110]:
dff.to_parquet("../data/genai_poc/processed/embedding_text_sentence_transformer.parquet.gzip",compression='gzip')

#### Read Embeddings and Proceed Further

In [150]:
df = pd.read_parquet("../data/genai_poc/processed/embedding_text_sentence_transformer.parquet.gzip")

In [112]:
df.columns

Index(['doc_id', 'page', 'section_id', 'embeddings'], dtype='object')

In [113]:
# Build the string key "<doc_id + 1><page><section_id>" for every row.
# Columns are addressed by label: the previous positional x[0]/x[1]/x[2] lookups on a
# label-indexed row Series are deprecated in pandas 2.x. Assumes the three columns are
# integer-typed, in which case the resulting strings are unchanged.
# NOTE(review): the parts are concatenated without a separator, so distinct
# (doc_id, page, section_id) triples can produce the same key (e.g. page 1 / section 10
# vs page 11 / section 0) -- verify uniqueness before relying on it as an id.
df['id_process'] = (
    (df['doc_id'] + 1).astype(str)
    + df['page'].astype(str)
    + df['section_id'].astype(str)
)

In [114]:
df['id_process_len'] = df['id_process'].str.len()

In [115]:
# Right-pad each key with '1' up to 5 characters; keys already 5+ characters long are left as-is.
df['id_process'] = df['id_process'].str.ljust(5, '1')

In [116]:
df.id_process = df.id_process.astype('int')

In [117]:
import numpy as np
embeddings = df.embeddings.to_list()
embeddings = np.array(embeddings)

In [118]:
# Step 1: Stack the vectors into a single float32 matrix (one row per vector)
embeddings = np.array(list(embeddings), dtype="float32")
# Step 2: Instantiate the index
index = faiss.IndexFlatL2(embeddings.shape[1])
# Step 3: Pass the index to IndexIDMap
index = faiss.IndexIDMap(index)
# Step 4: Add vectors and their IDs
# FAISS ids are 64-bit integers; cast explicitly because astype('int') is platform dependent.
index.add_with_ids(embeddings, df.id_process.to_numpy(dtype="int64"))

In [137]:
def vector_search(query, num_results=20, index=None, model=None):
    """Embed ``query`` and look up its nearest neighbours in ``index``.

    Args:
        query: Text to search for.
        num_results: Number of neighbours requested from the index (passed as ``k``).
        index: Object exposing ``search(vectors, k=...)``; it is called with a
            ``(1, dim)`` float32 array holding the query vector.
        model: Object exposing ``create_embedding_text(text)`` that returns the
            query vector.

    Returns:
        The ``(D, I)`` pair returned by ``index.search``, unchanged.

    Raises:
        ValueError: if ``index`` or ``model`` is not supplied.
    """
    # Both arguments default to None only so they can be passed by keyword;
    # fail early with a clear message instead of an AttributeError on None.
    if index is None:
        raise ValueError("Please provide an index to search")
    if model is None:
        raise ValueError("Please provide an embedding model to encode the query")

    query_vector = np.asarray([model.create_embedding_text(query)], dtype="float32")
    D, I = index.search(query_vector, k=num_results)
    return D, I

In [157]:
text_df = pd.read_csv("../data/genai_poc/processed/text_extracted_sectional_data_25_10_2023.csv")

In [171]:
text_df_page = pd.read_csv("../data/genai_poc/processed/text_extracted_page_data_31_10_2023.csv")

In [158]:
# Rebuild the id_process key on the text frame: "<doc_id + 1><page><section_id>",
# right-padded with '1' to 5 characters and converted to an integer.
# Columns are addressed by label: the previous positional x[0]/x[1]/x[2] lookups on a
# label-indexed row Series are deprecated in pandas 2.x. Assumes the three columns are
# integer-typed, in which case the resulting keys are unchanged.
# NOTE(review): concatenation without a separator plus '1'-padding means distinct
# (doc_id, page, section_id) triples can share a key -- verify uniqueness.
text_df['id_process'] = (
    (text_df['doc_id'] + 1).astype(str)
    + text_df['page'].astype(str)
    + text_df['section_id'].astype(str)
)

# Length of the key before padding.
text_df['id_process_len'] = text_df['id_process'].str.len()

text_df['id_process'] = text_df['id_process'].str.ljust(5, '1')

text_df['id_process'] = text_df['id_process'].astype('int')

In [161]:
query = "defects underwent severe deformation"

In [177]:
D, I = vector_search(query,num_results=10,index=index,model=embed)

In [178]:
text_df.id_process = text_df.id_process.astype('int') ### added on 21.03.2022

# Resolve each returned id to its section text through one keyed lookup instead of a
# full boolean-mask scan per hit. Duplicate ids keep their first row (the row the
# previous `[...][0]` selected). Ids equal to -1, which FAISS uses to pad the result
# when it finds fewer than k neighbours, are skipped instead of raising IndexError.
sections_by_id = text_df.drop_duplicates('id_process').set_index('id_process')['sections']
result_df = [sections_by_id.loc[idx] for idx in I[0] if idx != -1]

In [179]:
# Map each returned id to its [doc_id, page] pair through one keyed lookup.
# Duplicate ids keep their first row (the row the previous `[...][0]` selected).
# Ids equal to -1, which FAISS uses to pad the result when it finds fewer than k
# neighbours, are skipped instead of raising IndexError.
pages_by_id = text_df.drop_duplicates('id_process').set_index('id_process')[['doc_id', 'page']]
result_df_ids = [pages_by_id.loc[idx].tolist() for idx in I[0] if idx != -1]

In [180]:
result_df_page = [text_df_page[(text_df_page['doc_id']==x[0])&(text_df_page['page']==x[1])]['raw_texts_NarrativeText'].values.tolist()[0].replace("\n\n"," ") for x in result_df_ids]

In [181]:
for pg in result_df_page:
    print(pg)
    print("\n\n==============================\n\n")

tively. According to the characteristics of the fracture morphology, it can be determined complex, and included the fracture surface of two IMC layers and the solder material it- surface was perpendicular to the loading axis. They corresponded to the Q1, Q2, and Q3 regions in Figure 8b,c, respectively. However, since the fracture surfaces of Q1–Q3 in Fig-ure 7 were not on the same plane, the S4 fracture surface was formed due to the shear deformation. Obviously, there were significant differences between the tensile fracture Figure 8a showed the 3D CT reconstruction morphology of the solder joints after the fractures. As could be seen, the final fracture morphology of the solder joint was relatively self. Figure 8b,c showed the fracture morphology of the lower and upper parts, respec- that a tensile fracture occurred in the Q1, Q2, and Q3 regions in Figure 7, and the fracture defect in the upper right corner of the Q2 region was torn. However, there was no obvious Figure 6. Engineering