In [1]:
!pip install pyspark
!pip install -U sentence-transformers



In [2]:
from pyspark.sql import SparkSession
from sentence_transformers import SentenceTransformer, util

In [3]:
# Obtain the Spark session for this notebook, registered under the
# application name "first-model".
spark = (
    SparkSession.builder
    .appName("first-model")
    .getOrCreate()
)

23/04/23 11:34:52 WARN Utils: Your hostname, LAPTOP-A0VE4E4B resolves to a loopback address: 127.0.1.1; using 172.17.156.210 instead (on interface eth0)
23/04/23 11:34:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/23 11:34:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/23 11:35:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Only these two fields of each exported message are needed downstream.
model_columns = ["client_msg_id", "text"]

# Read the JSON export with the "multiline" option enabled and keep just the
# columns listed above.
df = (
    spark.read
    .option("multiline", "true")
    .json("resources/first-model/2023-04-22.json")
    .select(model_columns)
)

# Quick sanity check: column types, then the first rows.
df.printSchema()
df.show()

                                                                                

root
 |-- client_msg_id: string (nullable = true)
 |-- text: string (nullable = true)



                                                                                

+--------------------+--------------------+
|       client_msg_id|                text|
+--------------------+--------------------+
|                null|<@U054KAGRES0> ha...|
|671a0279-0402-428...|This is for testi...|
|24a98ea5-bf88-46e...|Question to be fo...|
|faa65dc9-b890-417...|Hopefully it can ...|
+--------------------+--------------------+



In [5]:
# Discard every row that has a null in at least one of the model columns.
df = df.dropna(how="any", subset=model_columns)

In [6]:
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained Sentence Transformers model used to embed the messages.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Share of the rows to embed (1.0 keeps everything).
FRACTION_OF_DATA=1.0
# Fixed seed so that a fraction below 1.0 selects the same rows on every run.
SAMPLE_SEED = 42

# Spellings of the HTML line-break tag to strip. They are lower case because
# they are matched after the text has been lower-cased.
BR_TAGS = ("<br />", "<br/>", "</br>", "<br>")


def clean_sentence(text):
    """Normalise one message before it is embedded.

    Lower-cases the text, replaces whole line-break tags with a space (so the
    words on either side are not glued together), then removes any remaining
    angle brackets and backslashes.

    Only complete tags are removed: ordinary words that merely contain the
    letters "br" (e.g. "library") are left intact.

    Args:
        text: Raw message text.

    Returns:
        The cleaned, lower-cased string.
    """
    cleaned = text.lower()
    for tag in BR_TAGS:
        cleaned = cleaned.replace(tag, " ")
    # No separate backslash+slash step is needed: once every backslash has
    # been removed that two-character sequence cannot occur.
    for char in ("<", ">", "\\"):
        cleaned = cleaned.replace(char, "")
    return cleaned


# Pull the (optionally sampled) text column to the driver and clean each row;
# each collected row holds a single field, the message text.
sampled_rows = (
    df.select("text")
    .sample(fraction=FRACTION_OF_DATA, seed=SAMPLE_SEED)
    .collect()
)
sentences = [clean_sentence(row[0]) for row in sampled_rows]

# Show the cleaned sentences and how many there are.
print(sentences, f'\n\nLength Of Data {len(sentences)}')

['this is for testing purposes and real data creation.', 'question to be found in chats: how to download data from here?', 'hopefully it can see it.'] 

Length Of Data 3


                                                                                

In [7]:
# Query to look up in the corpus.
our_sentence = 'How to download data from slack?'

# Embed the query and every corpus sentence with the same model.
my_embedding = model.encode(our_sentence)
embeddings = model.encode(sentences)

# Cosine similarity between the query and each corpus sentence.
cos_sim = util.cos_sim(my_embedding, embeddings)

# Pair every sentence with its similarity score.
winners = [
    [sentences[idx], score]
    for row in cos_sim
    for idx, score in enumerate(row)
]

# Rank all pairs from most to least similar (no cut-off is applied).
final_winners = sorted(winners, key=lambda pair: pair[1], reverse=True)

# Report each sentence together with its score, best match first.
for sentence, score in final_winners:
    print(f'\nScore : \n\n  {score}')
    print(f'\nSentence : \n\n {sentence}')


Score : 

  0.6016475558280945

Sentence : 

 question to be found in chats: how to download data from here?

Score : 

  0.17851555347442627

Sentence : 

 this is for testing purposes and real data creation.

Score : 

  0.04939895123243332

Sentence : 

 hopefully it can see it.
