# Discover duplicate content across documents 

In [294]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter


### Read the documents

We will read the document stored in the directory where we placed it. We will read the contents of the document along with file name and store it in a datafarme.

In [295]:
import os
import pandas as pd

## We need to read the documents and create a dataframe that contains the name of the document in one column and the text in another column
docs_dir = "data_docs"

# Get the list of files in the directory
file_list = os.listdir(docs_dir)

# Initialize empty lists to store the document names and texts
doc_names = []
doc_texts = []

# Read each file and extract the name and text
for file_name in file_list:
    file_path = os.path.join(docs_dir, file_name)
    with open(file_path, 'r') as file:
        doc_names.append(file_name)
        doc_texts.append(file.read())

# Create a dataframe with the document names and texts
df = pd.DataFrame({'Document_Name': doc_names, 'Text': doc_texts})


In [296]:
df2 = df.reset_index(drop=True)
df2.columns

Index(['Document_Name', 'Text'], dtype='object')

### Chunk the documents with metadata 

After we have obtained the main dataframe of the document names and content, we will chunk it in small sizes, we will preserve the document name and any relevant metadata along.

In [297]:
from textwrap3 import wrap
def chunk_section(section, chunk_size, chunk_overlap):
    #print(section["Document_Name"])
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n","\n"," ",""],
        chunk_size = chunk_size,
        chunk_overlap= chunk_overlap,
        length_function = len
    )
    chunks = text_splitter.create_documents(
        texts=[section["Text"]],
        metadatas=[{"name":section["Document_Name"]}]               
    )
    #print(chunks)
    return[{"text":chunk.page_content,
            "name":chunk.metadata["name"]} for chunk in chunks]

chunked_data = df2.apply(lambda x: chunk_section(x, 1000, 100), axis=1)

# Flatten the list of lists
chunked_data = [item for sublist in chunked_data for item in sublist]

# Create a dataframe with the chunked data
df_chunked = pd.DataFrame(chunked_data)



In [298]:
df_chunked.head()

df_chunked['chunk_no'] = df_chunked.groupby('name')['name'].transform(lambda x: range(1,len(x)+1))
df_chunked.head()

Unnamed: 0,text,name,chunk_no
0,'In 1495 the Italian Renaissance arrived in Fr...,document_3.txt,1
1,"Copernicus, in De revolutionibus orbium coeles...",document_3.txt,2
2,injury and chronic damage to the cardiovascula...,document_3.txt,3
3,turned within as that which was turned without...,document_3.txt,4
4,variants studied (including Omicron) killed br...,document_3.txt,5


### Vector embedding of the documents with metadata

We will use Chroma vector store and store contents of the files there to use it later on.

In [299]:
persist_dir = "doc/chroma/"

metadata = df_chunked[['name']].to_dict('records')


embedding = OpenAIEmbeddings()

vectorDB_new = Chroma.from_texts(
    texts = df_chunked['text'].tolist(),
    metadatas = metadata,
    embedding = embedding,
    persist_directory = persist_dir
)

### Documents duplication search


In [300]:
## Now that we have stored the vectors in the vector store, we can query the vector store to get the most similar documents to a given query document


quyery = """ The band still struggled to sign a record deal.
            They turned to Jeff Blue for additional help after
            facing numerous rejections from several major record labels.
            After failing to catch Warner Bros. Records on three previous reviews,
            Blue, who was now the vice president of Warner Bros. Records,
            helped the band sign a deal with the company as a developing artist in 1999.
            However, the label advised the band to change their name to avoid confusion
            with Hybrid.The band considered the names "Plear" and "Platinum Lotus Foundation"
            before deciding on "Linkin Park",a play on and homage to Santa Monica's Lincoln Park,
            now called Christine Emerson Reed Park.[23] They initially wanted to use the name "Lincoln Park",
            however they changed it to "Linkin" to acquire the internet domain "linkinpark.com". """


# Encode the query using the embeddings
query_embedding = embedding.embed_query(quyery)
#embedding.encode
# Search the vector store for the most similar chunks to the query
print(query_embedding[0:10])


most_similar_chunks = vectorDB_new.similarity_search_with_score(quyery,5)

# Print the chunks with the highest overlap
for chunk in most_similar_chunks:
    print(chunk)

    


[-0.008627031732097298, -0.0036220105000574275, 0.011151361023649371, -0.03407845195521086, -0.01834167311777821, 0.015924760706875613, -0.012030848957575474, -0.004706263899989682, -0.00932525076954443, -0.023135214513864102]
(Document(page_content='the band sign a deal with the company as a developing artist in 1999.\nHowever, the label advised the band to change their name to avoid\nconfusion with Hybrid. The band considered the names "Plear" and\n"Platinum Lotus Foundation" before deciding on "Linkin Park", a play\non and homage to Santa Monica\\\'s Lincoln Park, now called Christine\nEmerson Reed Park. They initially wanted to use the name "Lincoln\nPark", however they changed it to "Linkin" to acquire the internet\ndomain "linkinpark.com".\\n\', \'On January 19, 2010, Linkin Park\nreleased a new song titled "Not Alone" as part of a compilation from\nMusic for Relief called Download to Donate for Haiti in support of the\nHaiti Earthquake crisis. On February 10, 2010, Linkin Park r

In [301]:
## Use a document segment from one of the documents and search through the rest of the documents to identify where you can find the same 

query = df_chunked.loc[0, 'text']
print(query)

## Now we need to search for the query in the rest of the documents
# Search for the query in the rest of the documents
most_similar_chunks = vectorDB_new.similarity_search_with_score(query, 5)

# Print the chunks with the highest overlap
for chunk in most_similar_chunks:
    print(chunk)



'In 1495 the Italian Renaissance arrived in France, imported by King
Charles VIII after his invasion of Italy. A factor that promoted the
spread of secularism was the inability of the Church to offer
assistance against the Black Death. Francis I imported Italian art and
artists, including Leonardo da Vinci, and built ornate palaces at
great expense. Writers such as Fran ois Rabelais, Pierre de Ronsard,
Joachim du Bellay, and Michel de Montaigne, painters such as Jean
Clouet, and musicians such as Jean Mouton also borrowed from the
spirit of the Renaissance.\n', 'Some view this as a "scientific
revolution", heralding the beginning of the modern age, others as an
acceleration of a continuous process stretching from the ancient world
to the present day. Significant scientific advances were made during
this time by Galileo Galilei, Tycho Brahe, and Johannes Kepler.
Copernicus, in De revolutionibus orbium coelestium (On the Revolutions


(Document(page_content='\'In 1495 the Italian Renaissance arrived in France, imported by King\nCharles VIII after his invasion of Italy. A factor that promoted the\nspread of secularism was the inability of the Church to offer\nassistance against the Black Death. Francis I imported Italian art and\nartists, including Leonardo da Vinci, and built ornate palaces at\ngreat expense. Writers such as Fran ois Rabelais, Pierre de Ronsard,\nJoachim du Bellay, and Michel de Montaigne, painters such as Jean\nClouet, and musicians such as Jean Mouton also borrowed from the\nspirit of the Renaissance.\\n\', \'Some view this as a "scientific\nrevolution", heralding the beginning of the modern age, others as an\nacceleration of a continuous process stretching from the ancient world\nto the present day. Significant scientific advances were made during\nthis time by Galileo Galilei, Tycho Brahe, and Johannes Kepler.\nCopernicus, in De revolutionibus orbium coelestium (On the Revolutions', metadata={'n

In [302]:
### Finding common texts appraring common in all the documents

scores = []
metadata = []

for i in range(0, len(df_chunked)):
    query = df_chunked.loc[i, 'text']
    most_similar_chunks = vectorDB_new.similarity_search_with_score(query, 5)
    scores.append([chunk[1] for chunk in most_similar_chunks])
    metadata.append([chunk[0].metadata for chunk in most_similar_chunks])


In [303]:
scores_df = pd.DataFrame(scores)
scores_df.columns = ['Rank_1', 'Rank_2', 'Rank_3', 'Rank_4', 'Rank_5']

import regex as re
import pandas as pd

metalist = []

for i in range(0, len(metadata)):
    this_metadata = metadata[i]
    #print(this_metadata)
    pattern = r'document_\d+'
    texts = [re.search(pattern, m['name']).group() for m in this_metadata]
    metalist.append(texts)
    #print(metalist)

metadata_df = pd.DataFrame(metalist)

metadata_df.columns = [f'Document_{i}' for i in range(1, 6)]
metadata_df.head()

Unnamed: 0,Document_1,Document_2,Document_3,Document_4,Document_5
0,document_3,document_3,document_3,document_39,document_39
1,document_3,document_3,document_3,document_21,document_21
2,document_3,document_3,document_3,document_20,document_20
3,document_3,document_3,document_3,document_35,document_35
4,document_3,document_3,document_3,document_29,document_29


In [304]:
# Concatenate the scores and metadata dataframes
df_final = pd.concat([df_chunked,scores_df, metadata_df], axis=1)
df_final.head(5)

Unnamed: 0,text,name,chunk_no,Rank_1,Rank_2,Rank_3,Rank_4,Rank_5,Document_1,Document_2,Document_3,Document_4,Document_5
0,'In 1495 the Italian Renaissance arrived in Fr...,document_3.txt,1,0.0,0.0,2.5e-05,0.073827,0.073827,document_3,document_3,document_3,document_39,document_39
1,"Copernicus, in De revolutionibus orbium coeles...",document_3.txt,2,2.3e-05,2.3e-05,2.3e-05,0.204725,0.204963,document_3,document_3,document_3,document_21,document_21
2,injury and chronic damage to the cardiovascula...,document_3.txt,3,7.7e-05,7.7e-05,7.7e-05,0.098618,0.098618,document_3,document_3,document_3,document_20,document_20
3,turned within as that which was turned without...,document_3.txt,4,4e-06,4e-06,4e-06,0.32825,0.32825,document_3,document_3,document_3,document_35,document_35
4,variants studied (including Omicron) killed br...,document_3.txt,5,3e-06,3e-06,3e-06,0.222909,0.222909,document_3,document_3,document_3,document_29,document_29


In [305]:
df_final.text[6]

'\'Humans appear to be capable of spreading the virus to some other\nanimals, a type of disease transmission referred to as\nzooanthroponosis.\\n\', "By the 15th century, writers, artists, and\narchitects in Italy were well aware of the transformations that were\ntaking place and were using phrases such as modi antichi (in the\nantique manner) or alle romana et alla antica (in the manner of the\nRomans and the ancients) to describe their work. In the 1330s Petrarch\nreferred to pre-Christian times as antiqua (ancient) and to the\nChristian period as nova (new). From Petrarch\'s Italian perspective,\nthis new period (which included his own time) was an age of national\neclipse.\\nLeonardo Bruni was the first to use tripartite periodization\nin his History of the Florentine People (1442). Bruni\'s first two\nperiods were based on those of Petrarch, but he added a third period\nbecause he believed that Italy was no longer in a state of decline.'

In [306]:
from PIL import Image, ImageDraw, ImageFont
import difflib
import re

def text_compare_docs(inputFile,inputText):
    # Read the text file
    #inputFile = 'data_docs/document_13.txt'
    #inputText = df_Final.text[0]
    with open(inputFile, 'r') as file:
        document_lines = re.split('\n|\\n', file.read())

    # Create an image object
    image = Image.new('RGB', (1000, 500), color = (73, 109, 137))

    # Create a drawing object
    d = ImageDraw.Draw(image)

    # Define the font for the text
    fnt = ImageFont.truetype('/Library/Fonts/Arial.ttf', 5)

    # Define the font for the highlighted text
    highlight_fnt = ImageFont.truetype('/Library/Fonts/Arial.ttf', 5)

    highlight_text = inputText
    # The text to highlight
    highlight_lines = highlight_text.split('\n')
    print(len(highlight_lines))
    print(highlight_text)
    # Add the text to the image
    for i, line in enumerate(document_lines):
        # Check if the line is in the highlight text
        if any(difflib.SequenceMatcher(None, line, hl_line).ratio() > 0.5 for hl_line in highlight_lines):
            
            # Draw the highlight
            #print(line)
            d.text((1, i*2), line, font=highlight_fnt, fill=(255, 255, 0))
        else:
            # Draw the text
            d.text((1, i*2), line, font=fnt, fill=(0, 0, 0))

    # Show the image
    image.show()

In [307]:
inputFile = 'data_docs/document_17.txt'
inputText = df_final.text[20]

text_compare_docs(inputFile,inputText)

14
language game.\n', 'Linkin Park combines elements of metal,
industrial, punk, pop, electronic, hip hop. More specifically, the
band has been categorized as alternative rock, nu metal, rap rock, rap
metal, alternative metal, electronic rock, pop rock, hard rock, and
industrial rock.note 1 Despite being considered nu metal, the band
never considered themselves as such.\n', "Like structuralists, post-
structuralists start from the assumption that people's identities,
values, and economic conditions determine each other rather than
having intrinsic properties that can be understood in isolation.
Structuralists explore how the subjects of their study might be
described as a set of essential relationships, schematics, or
mathematical symbols. Post-structuralism, by contrast, is
characterized by new ways of thinking through structuralism, contrary
to the original form.\n"


### Comparing two documents at a time
