In [10]:
# install sentence transformers
%pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1
Note: you may need to restart the kernel to use updated packages.


Create Collection with name  `job_descriptionssentence` in Milvus and Generate embeddings Using Sentence Transformer for each job description in the dataset and insert into Milvus.

In [9]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Connect to Milvus with increased gRPC message size limits
connections.connect(
    alias='default',
    host='127.0.0.1',
    port='19530',
    options={
        'grpc.max_send_message_length': 512 * 1024 * 1024,  # 512MB
        'grpc.max_receive_message_length': 512 * 1024 * 1024  # 512MB
    }
)

# Read job descriptions from CSV
df = pd.read_csv("jobpostings.csv")

# Filter out rows with NA job descriptions
df = df.dropna(subset=['Job Description'])

# Load your Sentence Transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for job descriptions
embeddings = model.encode(df['Job Description'].tolist())

# Verify the shape of each embedding
for idx, emb in enumerate(embeddings):
    if emb.shape[0] != 384:
        print(f"Embedding dimension mismatch at index {idx}: {emb.shape}")
        raise ValueError(f"Embedding dimension mismatch at index {idx}: {emb.shape}")

# Define schema for the collection
fields = [
    FieldSchema(name='id', dtype=DataType.VARCHAR, is_primary=True, max_length=65535),
    FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, dim=384)  # Ensure this matches your model's output dimension
]
schema = CollectionSchema(fields=fields, description='job descriptions')

# Create a Milvus collection
collection_name = 'job_descriptionssentence'
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)
collection = Collection(name=collection_name, schema=schema)

# Prepare data for insertion
ids = df['Job Id'].astype(str).tolist()  # Convert IDs to strings for Milvus insert

# Ensure lengths match
assert len(ids) == len(embeddings), "Lengths of IDs and embeddings must match."

# Batch insert with smaller batches
batch_size = 1000
for i in range(0, len(ids), batch_size):
    batch_ids = ids[i:i+batch_size]
    batch_embeddings = embeddings[i:i+batch_size]

    entities = [
        {'id': str(batch_ids[j]), 'embedding': batch_embeddings[j].tolist()}
        for j in range(len(batch_ids))
    ]

    # Verify the dimension of each embedding before insertion
    for entity in entities:
        assert len(entity['embedding']) == 384, f"Embedding dimension is not 384: {len(entity['embedding'])}"

    # Now insert 'entities' into Milvus collection
    collection.insert(entities)

# Create IVF_FLAT index for collection
index_params = {
    'metric_type': 'L2',
    'index_type': 'IVF_FLAT',
    'params': {'nlist': 2048}
}
collection.create_index(field_name='embedding', index_params=index_params)

# Load the collection to memory
collection.load()

print(f"Collection '{collection_name}' created and populated with job descriptions.")




Batches:   0%|          | 0/5871 [00:00<?, ?it/s]

Collection 'job_descriptionssentence' created and populated with job descriptions.


Reterieve Embeddings of Following Job_ids

In [31]:
from pymilvus import Collection, connections

# Connect to Milvus server with increased gRPC message size limits
connections.connect(
    alias='default',
    host='127.0.0.1',
    port='19530',
    options={
        'grpc.max_send_message_length': 512 * 1024 * 1024,  # 512MB
        'grpc.max_receive_message_length': 512 * 1024 * 1024  # 512MB
    }
)

# Define job IDs to retrieve embeddings for
job_ids_to_check = [
    '956bafedb9d1602f102c47e451022211',
    '5083f71d6399f4821f567f8ee51c150f',
    'ef1466862c81c370fb02e19d2e080535',
    'd3fa279d01e7f098062f0b5faf35aeb6',
    '1055ace0298d9eebd4798dc4451eccdc',
    'fd85b5d91f15aded8bf4510d2c1abd7b'
]

collection_name = 'job_descriptionssentence'
collection = Collection(collection_name)
collection.load()

# Function to retrieve embeddings from Milvus by job IDs
def retrieve_embeddings_by_ids(job_ids):
    try:
        # Constructing the query expression using 'in' operator
        expr = f"id in {job_ids}"
        
        # Specify the output fields you want to retrieve
        output_fields = ["embedding"]
        
        # Perform the query
        result = collection.query(expr=expr, output_fields=output_fields)
        print(result)

        embeddings = []
        for item in result:
            embedding = item.get("embedding", None)
            if embedding:
                embeddings.append(embedding)
            else:
                embeddings.append(None)

        # Print or further process the embeddings found
        for i, emb in enumerate(embeddings):
            if emb is not None:
                print(f"Job ID: {job_ids[i]}, Embedding: {emb}")
            else:
                print(f"No embedding found for Job ID: {job_ids[i]}")

        return embeddings

    except Exception as e:
        print(f"Error querying Milvus: {e}")
        return [None] * len(job_ids)

# Retrieve embeddings for job IDs
embeddings_to_check = retrieve_embeddings_by_ids(job_ids_to_check)

# # Print embeddings for verification
# for i, emb in enumerate(embeddings_to_check):
#     if emb is not None:
#         print(f"Job ID: {job_ids_to_check[i]}, Embedding: {emb}")
#     else:
#         print(f"No embedding found for Job ID: {job_ids_to_check[i]}")


[{'embedding': [-0.04296622, -0.03901067, -0.019625483, -0.034219757, -0.07576302, 0.063592985, -0.042826325, 0.0020867856, -0.04699288, -0.06211826, -0.066837884, 0.06822865, 0.043280736, 0.02844485, -0.07734968, 0.012566829, 0.051643815, -0.02151561, -0.046951693, 0.018095253, -0.03594338, 0.08661296, 0.0026234058, -0.0143878525, -0.02891413, -0.008673351, -0.020062353, -0.01874034, -0.021538354, -0.021862945, 0.031523746, -0.056021053, 0.013680414, -0.0060706846, 0.032408338, 0.078625016, 0.02898307, 0.0090119345, -0.04562752, 0.049408454, -0.042442545, -0.013509548, 0.03407647, 0.02614169, -0.034661394, -0.00088106113, -0.013317743, -0.03238167, 0.056639172, 0.03335893, -0.050559286, -0.052586503, 0.0005376794, 0.1120225, -0.05114574, 0.0077588353, 0.0034339647, -0.07163622, -0.1011816, -0.025530793, -0.022884583, 0.07270682, 0.035377726, 0.00867385, 0.058347225, 0.009065741, 0.0014172469, -0.037082303, 0.06736538, -0.10687812, -0.0025424163, -0.051150046, -1.6115839e-05, 0.0918207

How did you decide on the threshold for determining duplicates in Milvus? Which metrics are you using?
I am using IP METRIC for evaluation and for for thresholding determination I took validation dataset of 4 job ids to verify 

In [29]:
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection
import pandas as pd
import numpy as np

# Connect to Milvus with increased gRPC message size limits
connections.connect(
    alias='default',
    host='127.0.0.1',
    port='19530',
    options={
        'grpc.max_send_message_length': 512 * 1024 * 1024,  # 512MB
        'grpc.max_receive_message_length': 512 * 1024 * 1024  # 512MB
    }
)

# Function to generate Sentence Transformer embeddings from text
def generate_sentence_embeddings(text):
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode([text])  # Generate embeddings for the entire text
    return embeddings[0]  # Take the embedding for the whole text

# Function to search for duplicates in Milvus
def search_duplicates_for_job_ids(job_ids, threshold):
    try:
        # Specify the collection name to search
        collection_name = 'job_descriptionssentence'
        collection = Collection(name=collection_name)  # Connect to the collection
        collection.load()

        # Read job descriptions from CSV to lookup the descriptions by ID
        df = pd.read_csv("jobpostings.csv")
        id_to_description = dict(zip(df['Job Id'].astype(str), df['Job Description']))

        # Initialize a list to store results
        results_list = []

        # Process each job ID to find duplicates
        for job_id in job_ids:
            query_text = id_to_description.get(job_id, "")
            if query_text:
                # Generate Sentence Transformer embedding for query
                query_embedding = generate_sentence_embeddings(query_text)

                # Search for similar embeddings in Milvus collection
                search_params = {
                    'metric_type': 'IP',
                    'params': {'nprobe': 10}  # Adjust nprobe as needed for better accuracy
                }

                # Perform the search
                results = collection.search(
                    data=[query_embedding.tolist()],  # Query embedding as list
                    anns_field='embedding',           # The name of the field that contains the embeddings
                    param=search_params,              # Search parameters
                    limit=5                      # Number of results to return
                )

                # Store the results
                for hit in results[0]:  # results[0] because we only have one query embedding
                    similarity_score =  hit.distance  # Convert distance to similarity score
                    if similarity_score >= threshold:
                        if hit.id != job_id:
                            results_list.append({
                                'Job ID': job_id,
                                'Similar Job ID': hit.id,
                                'Similarity Score': similarity_score,
                                'Duplicate Job Description': id_to_description.get(hit.id, "Description not found"),
                                'Original Job Description': id_to_description.get(job_id, "Description not found")
                            })

        return results_list

    except Exception as e:
        print(f"Error searching duplicates in Milvus: {e}")
        return []

# List of job IDs to evaluate for duplicates
job_ids_to_evaluate = [
    '956bafedb9d1602f102c47e451022211',
    '5083f71d6399f4821f567f8ee51c150f',
    'ef1466862c81c370fb02e19d2e080535',
    'd3fa279d01e7f098062f0b5faf35aeb6',
    '1055ace0298d9eebd4798dc4451eccdc',
    'fd85b5d91f15aded8bf4510d2c1abd7b'
]

# Specify the similarity threshold to evaluate
similarity_threshold = 0.9 # Adjust this threshold as needed

# Search for duplicates for the specified job IDs and threshold
duplicate_results = search_duplicates_for_job_ids(job_ids_to_evaluate, similarity_threshold)

# Print the results
if duplicate_results:
    print(f"Duplicates found for the specified job IDs with threshold >= {similarity_threshold}:")
    for result in duplicate_results:
        print(f"Job ID: {result['Job ID']}")
        print(f"Similar Job ID: {result['Similar Job ID']}")
        print(f"Similarity Score: {result['Similarity Score']:.4f}")
        print(f"Duplicate Job Description: {result['Duplicate Job Description']}")
        print(f"Original Job Description: {result['Original Job Description']}")
        print('---------------------------------------------')
else:
    print("No duplicates found for the specified job IDs.")




Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Duplicates found for the specified job IDs with threshold >= 0.9:
Job ID: 956bafedb9d1602f102c47e451022211
Similar Job ID: c9d478834f4d94c8f1ec5939377a9a8e
Similarity Score: 1.0000
Duplicate Job Description: Abbott is a global healthcare leader that helps people live more fully at all stages of life. Our portfolio of life-changing technologies spans the spectrum of healthcare, with leading businesses and products in diagnostics, medical devices, nutritionals, and branded generic medicines. Our 109,000 colleagues serve people in more than 160 countries.    <br/> <br/>  <br/> <br/>    <b><b>JOB DESCRIPTION:</b></b>    <br/> <br/><b>About Abbott</b><br/> <br/>For years, Abbotts medical device businesses have offered technologies that are faster, more effective, and less invasive. Whether its glucose monitoring systems, innovative therapies for treating heart disease, or products that help people with chronic pain or movement disorders, our medical device technologies are designed to help 

Evaluate the results.

In [11]:
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection
import pandas as pd

# Connect to Milvus with increased gRPC message size limits
connections.connect(
    alias='default',
    host='127.0.0.1',
    port='19530',
    options={
        'grpc.max_send_message_length': 512 * 1024 * 1024,  # 512MB
        'grpc.max_receive_message_length': 512 * 1024 * 1024  # 512MB
    }
)

# Example query (question)
query_text = """
Work for One of the World's Best Hospitals! THREE YEARS IN A ROW Newsweek has named Charlton Memorial, St. Luke's, and Tobey among the Best Hospitals in the World! Join the team and find out why! We are searching for a Registered Nurse to perform a variety of duties involving coordinating total nursing care for patients, participating in patient and family teaching and providing leadership by working cooperatively with ancillary nursing and other patient team personnel in maintaining standards for professional nursing practice within an assigned patient care unit. Has knowledge and skills necessary to provide care, which is appropriate to the age group of patients regularly served. May assume Charge responsibility for assigned personnel and activities as required or directed. Exciting opportunity on SK2, a 37 bed medical telemetry unit. We care for young adult patients through the continuum who experience the usual medical diagnoses. We are known for our team work and for the sense of community among the staff. The SK2 team is a highly committed group of individuals who care for not only the patient but for each other. EXPERIENCED NEW HIRES- ASK ABOUT ELIGIBILITY FOR $4,000 SIGN ON BONUS! Location: Tobey Hospital- Wareham, MA Hours: 32hrs (Part Time with Benefits) Shift: NIGHT, 11:15pm-7:15am with weekend & holiday rotations. Education and/or Experience Requirements: Graduate of an accredited School of Nursing or equivalent is required. Current RN Licensure in the Commonwealth of Massachusetts is required. Basic Life Support (BLS) certification is required. Three (3) months experience is preferred.
"""

# Function to generate Sentence Transformer embeddings from text
def generate_sentence_embeddings(text):
    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Example model, replace with your preferred model
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode([text])  # Generate embeddings for the entire text
    return embeddings[0]  # Take the embedding for the whole text
    
# Specify the collection name to search
collection_name = 'job_descriptionssentence'

# Search for similar embeddings in Milvus collection
collection = Collection(name=collection_name)  # Connect to the collection    

# Generate Sentence Transformer embedding for query
query_embedding = generate_sentence_embeddings(query_text)

# Search for similar embeddings in Milvus collection
search_params = {
    'metric_type': 'L2',
    'params': {'nprobe': 10}  # Adjust nprobe as needed for better accuracy
}

# Perform the search
results = collection.search(
    data=[query_embedding.tolist()],  # Query embedding as list
    anns_field='embedding',           # The name of the field that contains the embeddings
    param=search_params,              # Search parameters
    limit=10                          # Number of results to return
)

# Read job descriptions from CSV to lookup the descriptions by ID
df = pd.read_csv("jobpostings.csv")
id_to_description = dict(zip(df['Job Id'].astype(str), df['Job Description']))

# Process search results
for hits in results:
    for hit in hits:
        job_description = id_to_description.get(hit.id, "Description not found")
        print(f"Job Description ID: {hit.id}")
        print(f"Job Description: {job_description}")
        print(f"Similarity Score: {hit.distance}")
        print('---------------------------------------------')


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Job Description ID: 50e778aab9763d6938d222825b01fcee
Job Description: Work for One of the World#s Best Hospitals! THREE YEARS IN A ROW Newsweek has named Charlton Memorial, St. Luke#s, and Tobey among the Best Hospitals in the World! Join the team and find out why! We are searching for a Registered Nurse to perform a variety of duties involving coordinating total nursing care for patients, participating in patient and family teaching and providing leadership by working cooperatively with ancillary nursing and other patient team personnel in maintaining standards for professional nursing practice within an assigned patient care unit. Has knowledge and skills necessary to provide care, which is appropriate to the age group of patients regularly served.# May assume Charge responsibility for assigned personnel and activities as required or directed. Exciting opportunity on SK2, a 37 bed medical telemetry unit. We care for young adult patients through the continuum who experience the usual 

Evaluate the results Again:

In [18]:
import torch
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection
import pandas as pd

# Connect to Milvus with increased gRPC message size limits
connections.connect(
    alias='default',
    host='127.0.0.1',
    port='19530',
    options={
        'grpc.max_send_message_length': 512 * 1024 * 1024,  # 512MB
        'grpc.max_receive_message_length': 512 * 1024 * 1024  # 512MB
    }
)

# Example query (question)
query_text = """
Work for One of the World's Best Hospitals! THREE YEARS IN A ROW Newsweek has named Charlton Memorial, St. Luke's, and Tobey among the Best Hospitals in the World! Join the team and find out why! We are searching for a Registered Nurse to perform a variety of duties involving coordinating total nursing care for patients, participating in patient and family teaching and providing leadership by working cooperatively with ancillary nursing and other patient team personnel in maintaining standards for professional nursing practice within an assigned patient care unit. Has knowledge and skills necessary to provide care, which is appropriate to the age group of patients regularly served. May assume Charge responsibility for assigned personnel and activities as required or directed. Exciting opportunity on SK2, a 37 bed medical telemetry unit. We care for young adult patients through the continuum who experience the usual medical diagnoses. We are known for our team work and for the sense of community among the staff. The SK2 team is a highly committed group of individuals who care for not only the patient but for each other. EXPERIENCED NEW HIRES- ASK ABOUT ELIGIBILITY FOR $4,000 SIGN ON BONUS! Location: Tobey Hospital- Wareham, MA Hours: 32hrs (Part Time with Benefits) Shift: NIGHT, 11:15pm-7:15am with weekend & holiday rotations. Education and/or Experience Requirements: Graduate of an accredited School of Nursing or equivalent is required. Current RN Licensure in the Commonwealth of Massachusetts is required. Basic Life Support (BLS) certification is required. Three (3) months experience is preferred.
"""

# Function to generate Sentence Transformer embeddings from text
def generate_sentence_embeddings(text):
    # model = SentenceTransformer('paraphrase-MiniLM-L6-v2')  # Example model, replace with your preferred model
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = model.encode([text])  # Generate embeddings for the entire text
    return embeddings[0]  # Take the embedding for the whole text
    
# Specify the collection name to search
collection_name = 'job_descriptionssentence'

# Search for similar embeddings in Milvus collection
collection = Collection(name=collection_name)  # Connect to the collection    

# Generate Sentence Transformer embedding for query
query_embedding = generate_sentence_embeddings(query_text)

# Search for similar embeddings in Milvus collection
search_params = {
    'metric_type': 'IP',
    'params': {'nprobe': 10}  # Adjust nprobe as needed for better accuracy
}

# Perform the search
results = collection.search(
    data=[query_embedding.tolist()],  # Query embedding as list
    anns_field='embedding',           # The name of the field that contains the embeddings
    param=search_params,              # Search parameters
    limit=10                          # Number of results to return
)

# Read job descriptions from CSV to lookup the descriptions by ID
df = pd.read_csv("jobpostings.csv")
id_to_description = dict(zip(df['Job Id'].astype(str), df['Job Description']))

# Process search results
for hits in results:
    for hit in hits:
        job_description = id_to_description.get(hit.id, "Description not found")
        print(f"Job Description ID: {hit.id}")
        print(f"Job Description: {job_description}")
        print(f"Similarity Score: {hit.distance}")
        print('---------------------------------------------')


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Job Description ID: 50e778aab9763d6938d222825b01fcee
Job Description: Work for One of the World#s Best Hospitals! THREE YEARS IN A ROW Newsweek has named Charlton Memorial, St. Luke#s, and Tobey among the Best Hospitals in the World! Join the team and find out why! We are searching for a Registered Nurse to perform a variety of duties involving coordinating total nursing care for patients, participating in patient and family teaching and providing leadership by working cooperatively with ancillary nursing and other patient team personnel in maintaining standards for professional nursing practice within an assigned patient care unit. Has knowledge and skills necessary to provide care, which is appropriate to the age group of patients regularly served.# May assume Charge responsibility for assigned personnel and activities as required or directed. Exciting opportunity on SK2, a 37 bed medical telemetry unit. We care for young adult patients through the continuum who experience the usual 

Collection Updated with IP Evaluation Metric:

In [17]:
from pymilvus import connections, Collection, utility

# Connect to Milvus with increased gRPC message size limits
connections.connect(
    alias='default',
    host='127.0.0.1',
    port='19530',
    options={
        'grpc.max_send_message_length': 512 * 1024 * 1024,  # 512MB
        'grpc.max_receive_message_length': 512 * 1024 * 1024  # 512MB
    }
)

# Specify the collection name
collection_name = 'job_descriptionssentence'

# Connect to the collection
collection = Collection(name=collection_name)

# Check if the collection is loaded
if collection.is_empty:  # Check if collection is already loaded
    collection.load()  # Load the collection

# Now release the collection from memory
collection.release()    

# Drop existing index if it exists
if collection.has_index():
    collection.drop_index()

# Create IVF_FLAT index with IP metric for collection
index_params = {
    'metric_type': 'IP',  # Change to IP (Inner Product)
    'index_type': 'IVF_FLAT',
    'params': {'nlist': 2048}
}
collection.create_index(field_name='embedding', index_params=index_params)

# Load the collection to memory
collection.load()

print(f"Collection '{collection_name}' updated with IP metric.")


Collection 'job_descriptionssentence' updated with IP metric.


In [2]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Going Through Dataset Visualization:

In [32]:
import pandas as pd
import numpy as np
# Load your dataset
df = pd.read_csv('jobpostings.csv')
df.head()

Unnamed: 0,Job Id,Job Title,SOC Code,Job Description,Company Name,Skills,Qualification,City,State,Zipcode,Job Opening Date,Job Closing Date,Status,Website Url
0,89c41c519c3c491929e3082f0ee1d557,"Editor, Celebrations",27-3041.00,"<br/><br/>Gannett Co., Inc. (NYSE: GCI) is a s...",Gannett,"[Local Media, Editing, Journalism]","[Bachelor of Journalism (B.J.), Master of Jour...",Boston,Massachusetts,2108.0,2021-09-23,2021-09-30,CLOSED,https://www.gannett.com
1,ac0c91f394fa77a00ad72ee3440cb4b7,Software Engineer II,51-8021.00,Overview </b> <br/><br/>Reporting to the...,ERT,"[Java, Application Architecture, CI, Data Stru...",[Bachelor of Computer Science (B.C.S.)],Medford,Massachusetts,2155.0,2021-04-29,2021-09-30,CLOSED,http://www.ert.com
2,4b5748411c4496f56ef33645a27840e0,Principal Software Architect,17-1011.00,<br/> <br/>Digital technology has forever chan...,"Sovos Compliance, LLC.","[Java, CSS, Government Compliance, Global Comp...",,Wilmington,Massachusetts,1887.0,2021-01-29,2021-09-30,CLOSED,https://sovos.com
3,7a7dac1bc98365216833008c0fbd063d,Strategy Program Manager,27-2012.03,<br/> <br/> <b>Build your future with Sovos</b...,"Sovos Compliance, LLC.","[Government Compliance, Global Compliance, Com...",,Wilmington,Massachusetts,1887.0,2021-08-27,2021-09-30,CLOSED,https://sovos.com
4,304aff90fd39fead183f48206f4070c7,Senior Contract Manager,11-9199.00,Description<br/> <br/>LaBella Associates was e...,LaBella Associates,"[Legal, Disciplinary]","[Bachelor of Engineering (B.E./B.Eng.), Any Ba...",Boston,Massachusetts,2108.0,2021-03-11,2021-09-30,CLOSED,http://www.labellapc.com


Installation Of Milvus Through Docker:

In [53]:
!wget https://github.com/milvus-io/milvus/releases/download/v2.2.8/milvus-standalone-docker-compose.yml -O docker-compose.yml


12935.18s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


E0615 23:04:43.784059945  139469 completion_queue.cc:738]              Kick failed: UNKNOWN:Bad file descriptor {syscall:"eventfd_write", os_error:"Bad file descriptor", errno:9, created_time:"2024-06-15T23:04:43.784003492+00:00"}
--2024-06-15 23:04:49--  https://github.com/milvus-io/milvus/releases/download/v2.2.8/milvus-standalone-docker-compose.yml
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/208728772/2344344b-d779-4031-928a-80acd20cc584?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240615T230449Z&X-Amz-Expires=300&X-Amz-Signature=2c5d65ebb3de2b1ecb3bbb7d75ad3a38ee7780dc70e20df56dca7327be998cb5&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=208728772&response-content-disposition=attachm

In [2]:
!docker compose up -d

[33mWARN[0m[0000] /teamspace/studios/this_studio/docker-compose.yml: `version` is obsolete 
[1A[1B[0G[?25l[+] Running 0/0
 [33m⠋[0m etcd Pulling                                                            [34m0.1s [0m
 [33m⠋[0m standalone Pulling                                                      [34m0.1s [0m
 [33m⠋[0m minio Pulling                                                           [34m0.1s [0m
[?25h[1A[1A[1A[1A[0G[?25l[+] Running 0/3
 [33m⠙[0m etcd Pulling                                                            [34m0.2s [0m
 [33m⠙[0m standalone Pulling                                                      [34m0.2s [0m
 [33m⠙[0m minio Pulling                                                           [34m0.2s [0m
[?25h[1A[1A[1A[1A[0G[?25l[+] Running 0/3
 [33m⠹[0m etcd Pulling                                                            [34m0.3s [0m
 [33m⠹[0m standalone [[32m⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀[0m] Pulling                               

In [1]:
!docker-compose ps


Name   Command   State   Ports
------------------------------


In [76]:
!docker logs milvus-standalone

15443.93s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


E0615 23:46:36.579615165  168638 backup_poller.cc:136]                 Run client channel backup poller: UNKNOWN:pollset_work {created_time:"2024-06-15T23:46:36.579535556+00:00", children:[UNKNOWN:Bad file descriptor {syscall:"epoll_wait", os_error:"Bad file descriptor", errno:9, created_time:"2024-06-15T23:46:36.57949216+00:00"}]}
[2024/06/15 23:45:52.178 +00:00] [INFO] [config/etcd_source.go:145] ["start refreshing configurations"]
[2024/06/15 23:45:52.178 +00:00] [INFO] [paramtable/quota_param.go:769] ["init disk quota"] [diskQuota(MB)=+inf]
[2024/06/15 23:45:52.178 +00:00] [INFO] [paramtable/quota_param.go:784] ["init disk quota per DB"] [diskQuotaPerCollection(MB)=1.7976931348623157e+308]
[2024/06/15 23:45:52.179 +00:00] [INFO] [paramtable/component_param.go:1550] ["init segment max idle time"] [value=10m0s]
[2024/06/15 23:45:52.179 +00:00] [INFO] [paramtable/component_param.go:1555] ["init segment min size from idle to sealed"] [value=16]
[2024/06/15 23:45:52.179 +00:00] [INFO] [

In [77]:
! python -m pip install -q pymilvus==2.2.11


15466.02s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


E0615 23:46:56.579541713  169050 backup_poller.cc:136]                 Run client channel backup poller: UNKNOWN:pollset_work {created_time:"2024-06-15T23:46:56.579452148+00:00", children:[UNKNOWN:Bad file descriptor {syscall:"epoll_wait", os_error:"Bad file descriptor", errno:9, created_time:"2024-06-15T23:46:56.579404237+00:00"}]}


In [82]:
!pip3 install pymilvus

15577.45s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


E0615 23:48:46.579530984  170990 backup_poller.cc:136]                 Run client channel backup poller: UNKNOWN:pollset_work {created_time:"2024-06-15T23:48:46.57945692+00:00", children:[UNKNOWN:Bad file descriptor {syscall:"epoll_wait", os_error:"Bad file descriptor", errno:9, created_time:"2024-06-15T23:48:46.579418762+00:00"}]}


In [30]:
%pip freeze > requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
