In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import random

import time

spark = SparkSession \
        .builder \
        .appName("Phone_Similarity") \
        .master("local[*]") \
        .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

In [2]:
db = 'myFirstDatabase'
tab = 'products'
df = spark.read.format('com.mongodb.spark.sql.DefaultSource')\
               .option('spark.mongodb.input.uri','mongodb://admin:123@project-shard-00-00.u9pno.mongodb.net:27017,project-shard-00-01.u9pno.mongodb.net:27017,project-shard-00-02.u9pno.mongodb.net:27017/myFirstDatabase?ssl=true&replicaSet=atlas-pna2hx-shard-0&authSource=admin&retryWrites=true&w=majority') \
               .option('spark.mongodb.input.database',db) \
               .option('spark.mongodb.input.collection',tab).load()

In [3]:
from pyspark.sql.functions import col,isnan, when, count
from pyspark.sql.types import *
df1 = df.withColumn('_id', col('_id').cast(StringType()))
data = df1.withColumn('_id', split(col('_id'), ']').getItem(0))
data = data.withColumn('_id', expr("substring(_id, 2, length(_id))"))
phone_data = data.select('_id',
                                'title',
                                'category',
                                'color',
                                'memory',
                                'pin',
                                'ram',
                                'screenSize',
                                'status',
                                'price')

In [4]:
import pyspark.sql.functions as F 
categ = phone_data.select('category').distinct().rdd.flatMap(lambda x:x).collect()
exprs = [F.when(F.col('category') == cat,1).otherwise(0).alias(str(cat)) for cat in categ]
phone_data = phone_data.select(exprs + phone_data.columns)

In [5]:
categ = phone_data.select('color').distinct().rdd.flatMap(lambda x:x).collect()
exprs = [F.when(F.col('color') == cat,1).otherwise(0).alias(str(cat)) for cat in categ]
phone_data = phone_data.select(exprs + phone_data.columns)
phone_data.columns

['Shiny Black',
 'Turquoise',
 'Silver',
 'Green',
 'Purple',
 'Blue',
 'White',
 'Gold',
 'Mint Green',
 'Black',
 'Red',
 'Pink',
 '6194877b0327b0eef3a53fe9',
 '61947f86613ccbeacb59e5b8',
 '619487730327b0eef3a53fe4',
 '61947f8e613ccbeacb59e5bd',
 '_id',
 'title',
 'category',
 'color',
 'memory',
 'pin',
 'ram',
 'screenSize',
 'status',
 'price']

In [6]:
categ = phone_data.select('status').distinct().rdd.flatMap(lambda x:x).collect()
exprs = [F.when(F.col('status') == cat,1).otherwise(0).alias(str(cat)) for cat in categ]
phone_data = phone_data.select(exprs + phone_data.columns)
phone_data.columns

['99',
 'New',
 'Shiny Black',
 'Turquoise',
 'Silver',
 'Green',
 'Purple',
 'Blue',
 'White',
 'Gold',
 'Mint Green',
 'Black',
 'Red',
 'Pink',
 '6194877b0327b0eef3a53fe9',
 '61947f86613ccbeacb59e5b8',
 '619487730327b0eef3a53fe4',
 '61947f8e613ccbeacb59e5bd',
 '_id',
 'title',
 'category',
 'color',
 'memory',
 'pin',
 'ram',
 'screenSize',
 'status',
 'price']

In [7]:
from pyspark.sql.types import DoubleType
changedTypedf = phone_data.withColumn("screenSize", phone_data["screenSize"].cast(DoubleType()))
changedTypedf.head(5)

[Row(99=0, New=1, Shiny Black=0, Turquoise=0, Silver=0, Green=0, Purple=0, Blue=0, White=0, Gold=0, Mint Green=0, Black=0, Red=0, Pink=1, 6194877b0327b0eef3a53fe9=0, 61947f86613ccbeacb59e5b8=1, 619487730327b0eef3a53fe4=0, 61947f8e613ccbeacb59e5bd=0, _id='6194895c30e6b7130bb06add', title='iphone 13 pink', category='61947f86613ccbeacb59e5b8', color='Pink', memory=128, pin=3095, ram=6, screenSize=6.1, status='New', price=1100),
 Row(99=0, New=1, Shiny Black=0, Turquoise=0, Silver=0, Green=0, Purple=0, Blue=0, White=1, Gold=0, Mint Green=0, Black=0, Red=0, Pink=0, 6194877b0327b0eef3a53fe9=0, 61947f86613ccbeacb59e5b8=1, 619487730327b0eef3a53fe4=0, 61947f8e613ccbeacb59e5bd=0, _id='61948b652d9fa1d9e7da2d3a', title='iphone 13 pro max white', category='61947f86613ccbeacb59e5b8', color='White', memory=256, pin=3300, ram=6, screenSize=6.3, status='New', price=1300),
 Row(99=0, New=1, Shiny Black=0, Turquoise=0, Silver=0, Green=0, Purple=0, Blue=0, White=0, Gold=0, Mint Green=0, Black=1, Red=0, Pi

In [8]:
from pyspark.ml.feature import VectorAssembler
assemble=VectorAssembler(inputCols=['99',
 'New',
 'Shiny Black',
 'Turquoise',
 'Silver',
 'Green',
 'Purple',
 'Blue',
 'White',
 'Gold',
 'Mint Green',
 'Black',
 'Red',
 'Pink',
 '6194877b0327b0eef3a53fe9',
 '61947f86613ccbeacb59e5b8',
 '619487730327b0eef3a53fe4',
 '61947f8e613ccbeacb59e5bd',
 'memory',
 'pin',
 'ram',
 'screenSize',
 'price'], outputCol='features')
assembled_data=assemble.transform(changedTypedf)

In [9]:
from pyspark.ml.feature import StandardScaler

scale=StandardScaler(inputCol='features',outputCol='standardized')

data_scale=scale.fit(assembled_data)
data_scale_output=data_scale.transform(assembled_data)

data_scale_output.show(2)

+---+---+-----------+---------+------+-----+------+----+-----+----+----------+-----+---+----+------------------------+------------------------+------------------------+------------------------+--------------------+--------------------+--------------------+-----+------+----+---+----------+------+-----+--------------------+--------------------+
| 99|New|Shiny Black|Turquoise|Silver|Green|Purple|Blue|White|Gold|Mint Green|Black|Red|Pink|6194877b0327b0eef3a53fe9|61947f86613ccbeacb59e5b8|619487730327b0eef3a53fe4|61947f8e613ccbeacb59e5bd|                 _id|               title|            category|color|memory| pin|ram|screenSize|status|price|            features|        standardized|
+---+---+-----------+---------+------+-----+------+----+-----+----+----------+-----+---+----+------------------------+------------------------+------------------------+------------------------+--------------------+--------------------+--------------------+-----+------+----+---+----------+------+-----+--------

In [10]:
datad = data_scale_output.select('_id', 'title', 'category', 'color', 'memory', 'pin', 'ram', 'screenSize', 'status', 'price', 'standardized')
datf = datad.toPandas()

In [11]:
#RMSE
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

class PhoneSimilarity():
    def __init__(self, all_Data):
        self.all_Data_ = all_Data
    
    def phone_similarity(self, phone_id, amount=1):
        distances = []
        phone = self.all_Data_[(self.all_Data_._id == phone_id)].head(1).values[0]
        phone_row = self.all_Data_[(self.all_Data_._id == phone_id)].head(1)
        phone_row['distance'] = 0
        current_standardized_vector = phone[10].toArray()
        res_data = self.all_Data_[self.all_Data_._id != phone_id]
        countElement = 23 #23 of vector and 1 of predict
        for r_phone in tqdm(res_data.values):
            dist = 0
            standardized_vector = r_phone[10].toArray()
            for col in np.arange(23):
                dist = dist + np.square(float(current_standardized_vector[col]) - float(standardized_vector[col]))
            dist = dist / countElement
            dist = np.sqrt(dist)
            distances.append(dist)
        res_data['distance'] = distances
        res_data = res_data.sort_values('distance')
        bigdata = pd.concat([phone_row, res_data], ignore_index=True, sort=False)
        columns = ['_id', 'title', 'category', 'color', 'memory', 'pin', 'ram', 'screenSize', 'status', 'price','distance']
        return bigdata[columns][:amount]

In [None]:
#euclidean
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

class PhoneSimilarity():
    def __init__(self, all_Data):
        self.all_Data_ = all_Data
    
    def phone_similarity(self, phone_id, amount=1):
        amount = amount + 1
        distances = []
        phone = self.all_Data_[(self.all_Data_._id == phone_id)].head(1).values[0]
        phone_row = self.all_Data_[(self.all_Data_._id == phone_id)].head(1)
        current_standardized_vector = np.array(phone[10].toArray())
        res_data = self.all_Data_[self.all_Data_._id != phone_id]
        countElement = 23 #23 of vector and 1 of predict
        for r_phone in tqdm(res_data.values):
            dist = 0
            standardized_vector = np.array(r_phone[10].toArray())
            dist = np.linalg.norm(current_standardized_vector-standardized_vector)
            distances.append(dist)
        res_data['distance'] = distances
        phone_row['distance'] = 0
        res_data = res_data.sort_values('distance')
        bigdata = pd.concat([phone_row, res_data], ignore_index=True, sort=False)
        columns = ['_id', 'title', 'category', 'color', 'memory', 'pin', 'ram', 'screenSize', 'status', 'price','distance']
        return bigdata[columns][:amount]

In [12]:
#test PhoneSimilarity
similarity = PhoneSimilarity(datf)
x = '6194c722bb6b5b34d3a62769'
similarity_phones = similarity.phone_similarity(x, 10)

100%|████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 8647.43it/s]


In [13]:
import kafka
kafka_topic_name = "clickcount"
kafka_bootstrap_servers = 'localhost:9092'

# Construct a streaming DataFrame that reads from topic
flower_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
        .option("subscribe", kafka_topic_name) \
        .option("startingOffsets", "latest") \
        .load()

In [14]:
def get_database():
    from pymongo import MongoClient
    import pymongo

    # Provide the mongodb atlas url to connect python to mongodb using pymongo
    CONNECTION_STRING = "mongodb://admin:123@project-shard-00-00.u9pno.mongodb.net:27017,project-shard-00-01.u9pno.mongodb.net:27017,project-shard-00-02.u9pno.mongodb.net:27017/myFirstDatabase?ssl=true&replicaSet=atlas-pna2hx-shard-0&authSource=admin&retryWrites=true&w=majority"

    # Create a connection using MongoClient. You can import MongoClient or use pymongo.MongoClient
    from pymongo import MongoClient
    client = MongoClient(CONNECTION_STRING)

    # Create the database for our example (we will use the same database throughout the tutorial
    return client['myFirstDatabase']

# Get the database
dbname = get_database()
similarities_collection = dbname["similarities"]
similarities_collection

Collection(Database(MongoClient(host=['project-shard-00-01.u9pno.mongodb.net:27017', 'project-shard-00-02.u9pno.mongodb.net:27017', 'project-shard-00-00.u9pno.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, replicaset='atlas-pna2hx-shard-0', authsource='admin', retrywrites=True, w='majority', tls=True), 'myFirstDatabase'), 'similarities')

In [15]:
def foreach_batch_function(df, epoch_id,similarity_f,similarities_coll):
    if df.first() is not None:
        row = df.first()
        value = row['value'].decode("utf-8")
        first_element = value.split(',')[0]
        similarity_phones = similarity_f.phone_similarity(first_element, 10)
        listId = []
        for i in range(1,6):
            listId.append(similarity_phones['_id'][i])
        query = {"idProduct":first_element}
        dict1 = {"idProduct":first_element, "listId":listId}
#         similarities_collection.insert_one(dict1)
        update = {"$set": dict1}
        similarities_coll.update_one(query,update,upsert=True)
        print(dict1)
        print(similarity_phones)
    pass

# query3 = flower_df.writeStream.foreachBatch(foreach_batch_function).start()
query3 = flower_df.writeStream.foreachBatch(lambda df,epochId: foreach_batch_function(df,epochId,similarity,similarities_collection)).start()
#streamingDF.writeStream.foreachBatch(lambda df,epochId: foreach_batch_function(df,epochId,similarity_f,similarities_collection)).start()

100%|███████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 14497.59it/s]


{'idProduct': '61a363be0813e6445a3f7fa3', 'listId': ['6194cc8cbb6b5b34d3a627a3', '6194cbb3bb6b5b34d3a62795', '6194c233bb6b5b34d3a62720', '6194c9f0bb6b5b34d3a62786', '6194c722bb6b5b34d3a62769']}
                        _id                        title  \
0  61a363be0813e6445a3f7fa3         xiaomi mi 11 lite 5g   
1  6194cc8cbb6b5b34d3a627a3  xiaomi mi 11 lite 5g yellow   
2  6194cbb3bb6b5b34d3a62795   xiaomi mi 11 lite 5g black   
3  6194c233bb6b5b34d3a62720         xiaomi mi 11 lite 4g   
4  6194c9f0bb6b5b34d3a62786        oppo reno6 z 5g black   
5  6194c722bb6b5b34d3a62769          oppo reno6 5g black   
6  6194b8b0bb6b5b34d3a626b1    samsung galaxy z flip3 5g   
7  6194bce4bb6b5b34d3a626e4     samsung galaxy a72 white   
8  6194ca4ebb6b5b34d3a6278e       oppo reno6 z 5g silver   
9  6194c059bb6b5b34d3a6270c       iphone 13 pro max gold   

                   category        color  memory   pin  ram  screenSize  \
0  6194877b0327b0eef3a53fe9   Mint Green     128  4250    8         6.

100%|███████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 14628.36it/s]


{'idProduct': '61a363be0813e6445a3f7fa3', 'listId': ['6194cc8cbb6b5b34d3a627a3', '6194cbb3bb6b5b34d3a62795', '6194c233bb6b5b34d3a62720', '6194c9f0bb6b5b34d3a62786', '6194c722bb6b5b34d3a62769']}
                        _id                        title  \
0  61a363be0813e6445a3f7fa3         xiaomi mi 11 lite 5g   
1  6194cc8cbb6b5b34d3a627a3  xiaomi mi 11 lite 5g yellow   
2  6194cbb3bb6b5b34d3a62795   xiaomi mi 11 lite 5g black   
3  6194c233bb6b5b34d3a62720         xiaomi mi 11 lite 4g   
4  6194c9f0bb6b5b34d3a62786        oppo reno6 z 5g black   
5  6194c722bb6b5b34d3a62769          oppo reno6 5g black   
6  6194b8b0bb6b5b34d3a626b1    samsung galaxy z flip3 5g   
7  6194bce4bb6b5b34d3a626e4     samsung galaxy a72 white   
8  6194ca4ebb6b5b34d3a6278e       oppo reno6 z 5g silver   
9  6194c059bb6b5b34d3a6270c       iphone 13 pro max gold   

                   category        color  memory   pin  ram  screenSize  \
0  6194877b0327b0eef3a53fe9   Mint Green     128  4250    8         6.

In [None]:

query3.stop()


In [None]:
similarities_collection.delete_many({})