In [None]:
# !pip install FlagEmbedding

import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
import requests


# cache_dir = '/tmp/huggingface_cache'
# os.makedirs(cache_dir, exist_ok=True)
# os.chmod(cache_dir, 0o777)
# os.environ['TRANSFORMERS_CACHE'] = cache_dir

# Hugging Face Inference API settings for the sentence-embedding model.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
# Read the access token from the HF_TOKEN environment variable so a real
# secret never has to be written into the notebook. The literal "hf_token"
# placeholder stays as the fallback, so behaviour is unchanged when the
# variable is not set.
hf_token = os.environ.get("HF_TOKEN", "hf_token")
# Feature-extraction endpoint for the chosen model, plus the bearer-token
# header sent with every request.
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}


# Build (or reuse) the Spark session for this notebook. The Kafka SQL
# connector is requested through spark.jars.packages; the remaining settings
# size the driver, the Python worker memory and the collected-result limit.
spark = (
    SparkSession.builder
    .appName("KafkaRawCSVReader")
    .config("spark.driver.memory", "10g")
    .config("spark.executor.pyspark.memory", "2g")
    .config("spark.driver.maxResultSize", "2g")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    .getOrCreate()
)

# Streaming source: subscribe to the "csv_stream" topic on the Kafka broker,
# starting from the earliest available offsets.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "csv_stream")
    .option("startingOffsets", "earliest")
    .load()
)


# Keep only the message payload, cast from Kafka's 'value' column to a string.
kafka_string_df = kafka_df.selectExpr("CAST(value AS STRING)")


# Sentence template used to turn one CSV record into natural-language text
# before embedding. Each {placeholder} is filled via str.format with the
# matching record field.
semantic_schema = (
    "customer aged {Age} is a {Gender} "
    "with marital status {Marital_Status} is a {Occupation} "
    "with monthly income of {Monthly_Income} "
    "has completed {Educational_Qualifications} "
    "with family size of {Family_size} "
    "belongs to latitude {latitude} and "
    "longitude {longitude} with pincode {Pin_code} "
    "has given {Feedback} feedback"
)


def encode(final_sentence, timeout=60):
    """Request the embedding of ``final_sentence`` from the inference API.

    Posts the sentence to the module-level ``api_url`` using the module-level
    ``headers`` and asks the service to wait until the model is loaded.

    Args:
        final_sentence: Text to embed; sent as the ``inputs`` field.
        timeout: Seconds to wait for the HTTP call. Without a timeout a
            stalled connection would block the calling streaming task
            indefinitely.

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status
            (for example an invalid token). Previously the error body was
            returned as if it were an embedding.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    payload = {"inputs": final_sentence, "options": {"wait_for_model": True}}
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
    # Fail loudly on HTTP errors instead of passing an error payload downstream.
    response.raise_for_status()
    return response.json()


def process_row(row):
    """Turn one streamed CSV record into a sentence and print its embedding.

    The first column of ``row`` is expected to hold a single comma-separated
    record. Its first eleven fields are mapped, in order, onto the
    placeholders of the module-level ``semantic_schema`` template; the
    resulting sentence is passed to ``encode`` and the result is printed.

    Records that are empty, cannot be parsed as CSV, or have fewer than
    eleven fields are reported and skipped, so one malformed message does not
    raise out of the streaming sink.

    Args:
        row: Object exposing ``asDict()``; the first value of that dict is
            the raw CSV text (or ``None``).

    Returns:
        None.
    """
    # Imported locally so the function carries its own dependency.
    import csv

    field_names = (
        "Age",
        "Gender",
        "Marital_Status",
        "Occupation",
        "Monthly_Income",
        "Educational_Qualifications",
        "Family_size",
        "latitude",
        "longitude",
        "Pin_code",
        "Feedback",
    )

    raw_line = next(iter(row.asDict().values()), None)
    if not raw_line:
        print("skipping empty record")
        return

    # csv.reader honours quoting, so a quoted field may contain commas; the
    # trailing line terminator is dropped so it does not end up in the last
    # field.
    try:
        csv_values = next(csv.reader([raw_line.rstrip("\r\n")]), [])
    except csv.Error as parse_error:
        print("skipping unparsable record", repr(raw_line), parse_error)
        return

    if len(csv_values) < len(field_names):
        print("skipping record with too few fields", repr(raw_line))
        return

    # zip stops at the shorter sequence, so any extra trailing fields are
    # ignored, as before.
    data = dict(zip(field_names, csv_values))
    final_sentence = semantic_schema.format(**data)

    embedded_sentence = encode(final_sentence)
    print("embedded_sentence", embedded_sentence)
    

# Start the streaming query: each streamed row is handed to process_row.
query = (
    kafka_string_df.writeStream
    .foreach(process_row)
    .start()
)




In [10]:
# !pip uninstall backports
# !pip install backports
!pip install --force-reinstall -v "setuptools<70"

Using pip 22.3 from /opt/conda/lib/python3.10/site-packages/pip (python 3.10)
Collecting setuptools<70
  Downloading setuptools-69.5.1-py3-none-any.whl (894 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 894.6/894.6 kB 3.4 MB/s eta 0:00:00
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 72.0.0
    Uninstalling setuptools-72.0.0:
      Removing file or directory /opt/conda/lib/python3.10/site-packages/_distutils_hack/
      Removing file or directory /opt/conda/lib/python3.10/site-packages/distutils-precedence.pth
      Removing file or directory /opt/conda/lib/python3.10/site-packages/pkg_resources/
      Removing file or directory /opt/conda/lib/python3.10/site-packages/setuptools-72.0.0.dist-info/
      Removing file or directory /opt/conda/lib/python3.10/site-packages/setuptools/
      Successfully uninstalled setuptools-72.0.0
Successfully installed setuptools-69.5.1

In [18]:
# !pip install pymilvus==2.4.4

from pymilvus import connections, db

# Register a connection to the Milvus server.
milvus_endpoint = dict(host="standalone", port=19530)
conn = connections.connect(**milvus_endpoint)

# connect() gave back None in the recorded run (see the cell output), so this
# print only shows that the call completed without raising.
print(conn)

# database = db.create_database("my_database2")

# Show the databases visible through the connection.
print(db.list_database())

# print(database)

None
['default', 'my_database', 'my_database2']


In [None]:
# # Step 1: Define the semantic schema
# semantic_schema = ("customer aged {Age} is a {Gender} with marital status {Marital_Status} is a {Occupation} with {Monthly_Income} "
#                    "has completed {Educational_Qualifications} with family size of {Family_size} belongs to latitude {latitude} and "
#                    "longitude {longitude} with pincode {Pin_code} has given {Feedback} feedback")

# # Step 2: Define the CSV values as a tuple
# csv_values = ("26", "Male", "Married", "Employee", "More than 50000", "Graduate", "4", "12.9048", "77.6821", "560036", "Positive")

# # Step 3: Create a dictionary to map placeholder names to CSV values
# data = {
#     "Age": csv_values[0],
#     "Gender": csv_values[1],
#     "Marital_Status": csv_values[2],
#     "Occupation": csv_values[3],
#     "Monthly_Income": csv_values[4],
#     "Educational_Qualifications": csv_values[5],
#     "Family_size": csv_values[6],
#     "latitude": csv_values[7],
#     "longitude": csv_values[8],
#     "Pin_code": csv_values[9],
#     "Feedback": csv_values[10]
# }

# # Step 4: Use the .format() method to populate the template
# final_sentence = semantic_schema.format(
#     Age=data["Age"],
#     Gender=data["Gender"],
#     Marital_Status=data["Marital_Status"],
#     Occupation=data["Occupation"],
#     Monthly_Income=data["Monthly_Income"],
#     Educational_Qualifications=data["Educational_Qualifications"],
#     Family_size=data["Family_size"],
#     latitude=data["latitude"],
#     longitude=data["longitude"],
#     Pin_code=data["Pin_code"],
#     Feedback=data["Feedback"]
# )

# print(final_sentence)

In [None]:
# !pip install FlagEmbedding

# import os
# cache_dir = '/tmp/huggingface_cache'
# os.makedirs(cache_dir, exist_ok=True)
# os.chmod(cache_dir, 0o777)
# os.environ['TRANSFORMERS_CACHE'] = cache_dir

# from FlagEmbedding import BGEM3FlagModel
# model = BGEM3FlagModel('BAAI/bge-m3',  
#                        use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

# def encode(final_sentence):
#     sentence_embeddings = model.encode(final_sentence, 
#                             batch_size=12, 
#                             max_length=8192, # If you don't need such a long length, you can set a smaller value to speed up the encoding process.
#                             )['dense_vecs']
#     print(sentence_embeddings)
#     return sentence_embeddings

# encode(final_sentence)