In [1]:
import os

from chromadb.config import Settings
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.vectorstores.chroma import Chroma

# Chroma client configuration: persistent client, anonymized telemetry off.
setting = Settings(is_persistent=True, anonymized_telemetry=False)

# Embedding function backed by the Ollama "phi3" model.
# NOTE(review): presumably this must match the model used when the store was
# populated — confirm against the ingestion code.
embeddings = OllamaEmbeddings(model="phi3")

# Vector store handle pointing at the '.DB' persist directory.
vector_store = Chroma(
    client_settings=setting,
    embedding_function=embeddings,
    persist_directory='.DB',
)

In [2]:
# Fetch the stored records from the vector store. The result is a dict: the
# output below starts with an 'ids' key, and a later cell reads 'metadatas'.
docs = vector_store.get()
# NOTE(review): echoing `docs` prints the entire result; a short summary
# (e.g. the number of ids) may be easier to read — confirm what is needed here.
docs

{'ids': ['00b54202-f2a2-4e18-9165-8f809ae0b928',
  '028934ec-cad2-42c2-bd29-6601b524c24e',
  '028bb222-a38d-4416-8626-8998dd2646d5',
  '035c8e10-f390-4ad6-ab76-fe4a1212bf1e',
  '03ab66b6-ca9f-4b75-94fd-217b9ddc40ec',
  '05606d0c-2ba9-4292-a19a-926ff849306d',
  '059b3443-6938-4ec3-bfad-6bb28661b963',
  '08916e50-b404-448e-ab29-5c8b359b124a',
  '0d192548-bcfc-438b-9f1c-50b837506c88',
  '0f52864f-328b-4d40-b192-c326636c394c',
  '0ff7f3c8-0b94-475e-a109-9f12808f2e39',
  '101ee384-c1d1-40e1-8e51-92e87c71d035',
  '1083e26f-8e14-47fb-96ca-d054ed7751fa',
  '11c23770-8dd1-4192-9e0a-e4f01de116d7',
  '13eab5dc-2bbd-4abe-9db1-b3b525e0e68a',
  '143bcce7-5e08-4e9d-a877-3224da45e170',
  '15411f1c-5c8c-4fef-923e-2653ed707550',
  '1571f7fb-a950-49ab-a5e0-f49a26294b2c',
  '17a6e0c2-5d8c-4e64-b4f4-6cc153f271f0',
  '181c25a4-59bf-4cfa-bf55-9fef1758d194',
  '1b2272f4-d8b3-4a12-b7f0-e17da25a6dce',
  '1cc42f31-80c9-4be8-9e96-df6975f5b50a',
  '1d04b0b7-ae4a-476e-ad5d-7bd600b8c45a',
  '1eba124a-931e-42c1-837e-

In [8]:
# Metadata entries taken from the store dump; the next cell reads a 'source'
# path from each entry.
metadatas = docs["metadatas"]

In [13]:
# Reduce each record's 'source' path to its bare file name (one entry per
# stored record, so a file name repeats once for every record it produced).
# os.path.basename replaces the manual split on '/': the result is the same
# for '/'-separated paths, and paths written with the platform's native
# separator are handled as well.
files = [os.path.basename(metadata['source']) for metadata in metadatas]
files

['DEK-Technologies-Vietnam-Code-of-Conduct-Policy-Vietnamese-version.pdf',
 'DEK Technologies Vietnam Travel Policy.pdf',
 'DEK Technologies Quality Policy.pdf',
 'DEK Technologies Vietnam Annual Trip Policy.pdf',
 'DEK Technologies Vietnam 1on1 Policy A.pdf',
 'DEK Technologies Vietnam Leave Policy.pdf',
 'DEK Technologies Vietnam Performance Evaluation Policy.pdf',
 'DEK Technologies Vietnam Hybrid Policy A.pdf',
 'DEK Technologies Vietnam Fresher Training Period Policy.pdf',
 'DEK Technologies Vietnam Remote Work Policy A.pdf',
 'DEK Technologies Vietnam Recruitment Referral Policy.pdf',
 'DEK Technologies Human Resources Policy.pdf',
 'DEK Technologies Vietnam Office Access Policy.pdf',
 'DEK Technologies Human Resources Policy.pdf',
 'DEK Technologies Vietnam Foundation Policy.pdf',
 'DEK Technologies Vietnam Domestic Travel Policy A.pdf',
 'DEK Technologies Employee ICT Policy and Procedure.pdf',
 'DEK Technologies Quality Policy.pdf',
 'DEK Technologies Vietnam Probation Period 

In [21]:
from collections import Counter
import pandas as pd

# Tally how many stored records come from each source file.
counter = Counter(files)

# Build the table straight from (file name, count) pairs with explicit column
# names. This keeps both columns present even when `files` is empty and avoids
# relying on the implicit 'index' / 0 labels that from_dict + reset_index
# produce. Rows follow the order in which each file first appears in `files`.
df = pd.DataFrame(list(counter.items()), columns=['file name', 'count'])
df

Unnamed: 0,file name,count
0,DEK-Technologies-Vietnam-Code-of-Conduct-Polic...,2
1,DEK Technologies Vietnam Travel Policy.pdf,2
2,DEK Technologies Quality Policy.pdf,3
3,DEK Technologies Vietnam Annual Trip Policy.pdf,3
4,DEK Technologies Vietnam 1on1 Policy A.pdf,2
5,DEK Technologies Vietnam Leave Policy.pdf,2
6,DEK Technologies Vietnam Performance Evaluatio...,2
7,DEK Technologies Vietnam Hybrid Policy A.pdf,3
8,DEK Technologies Vietnam Fresher Training Peri...,3
9,DEK Technologies Vietnam Remote Work Policy A.pdf,2


In [25]:
# Row of the file with the fewest records. idxmin yields a single row label,
# so if several files tie for the minimum only the first of them is shown.
df.loc[df['count'].idxmin()]

file name    DEK Technologies Vietnam Tuition Reimbursement...
count                                                        1
Name: 41, dtype: object

In [26]:
# Row of the file with the most records. idxmax yields a single row label,
# so if several files tie for the maximum only the first of them is shown.
df.loc[df['count'].idxmax()]

file name    DEK Technologies Employee ICT Policy and Proce...
count                                                        5
Name: 15, dtype: object

In [28]:
# Median number of stored records per source file.
df['count'].median()

2.0