# Разработка Spark приложения. Домашнее задание 2.
Дубровин Е.Н. ИУ6-32М

In [1]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

#### Подключение к исполнителю

In [2]:
# Spark configuration: application name and the URL of the cluster master.
conf = (
    pyspark.SparkConf()
    .setAppName("Assignment2")
    .setMaster("spark://spark-master:7077")
)

# Obtain a session built from that configuration (an existing one is reused),
# and keep a handle on its underlying SparkContext.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

## Задача 1. Средний рейтинг
Реализуйте подсчет среднего рейтинга продуктов. Результат сохранить в HDFS в файле "avg_rating.csv". Формат каждой записи: ProdId,Rating

In [3]:
# Task 1 input: HDFS path of the reviews dataset, loaded below with the JSON reader.
avg_input = "hdfs://namenode:9000/reviews_Electronics_5.json"

In [4]:
# Task 1 output: HDFS path the per-product average ratings are written to as CSV.
avg_output = "hdfs://namenode:9000/avg_rating"

In [5]:
# Load the reviews dataset from HDFS with the JSON data source.
avg_df = spark.read.format("json").load(avg_input)

In [6]:
# Keep only the two columns the average needs: the product id and the rating.
avg_count_df = avg_df.select("asin", "overall")

In [7]:
# One row per product with the mean of its ratings; the aggregate column is
# named "avg(overall)".
avg_result_df = avg_count_df.groupBy("asin").agg(F.avg("overall"))

In [8]:
# Preview the first five aggregated rows.
avg_result_df.show(n=5)

+----------+-----------------+
|      asin|     avg(overall)|
+----------+-----------------+
|B00000J3Q1|              4.0|
|B00001W0DC|              4.2|
|B00003OPEV|4.333333333333333|
|B00005853W|              4.0|
|B00005Q5U5|4.787878787878788|
+----------+-----------------+
only showing top 5 rows



In [9]:
# Save the averages to HDFS as CSV with a header row.
# mode="overwrite" replaces the output of a previous run; with the default save
# mode the write raises an error once the target path already exists, so the
# cell could not be re-executed.
avg_result_df.write.csv(avg_output, mode="overwrite", header=True)

## Задача 2. Добавление наименования продукта
Напишите программу, которая каждому ProdId из "avg_rating.csv" ставит в соответствие название продукта. Результат сохранить в HDFS в файле "prodname_avg_rating.csv": ProdId,Name,Rating

In [10]:
# Task 2 input: the average ratings CSV (same HDFS path as avg_output in Task 1).
prodname_input_avg = "hdfs://namenode:9000/avg_rating"

In [11]:
# Task 2 input: HDFS path of the product metadata, read below as JSON; its
# "asin" and "title" fields are the ones used.
prodname_input_meta = "hdfs://namenode:9000/meta_Electronics.json"

In [12]:
# Task 2 output: HDFS path the (asin, title, average rating) rows are written to as CSV.
prodname_output = "hdfs://namenode:9000/prodname_avg_rating"

In [13]:
# Read the average ratings back from HDFS; the first line of each file is the header.
prodname_avg_df = (
    spark.read.format("csv")
    .option("header", True)
    .load(prodname_input_avg)
)

In [14]:
# Preview the first five rows of the loaded averages.
prodname_avg_df.show(n=5)

+----------+------------------+
|      asin|      avg(overall)|
+----------+------------------+
|B00000J1F3|               4.5|
|B00000JCT8|3.6923076923076925|
|B00004TDLD| 4.944444444444445|
|B00004WLJ2| 4.304347826086956|
|B00004Z6XS| 4.041666666666667|
+----------+------------------+
only showing top 5 rows



In [15]:
# Load the product metadata from HDFS with the JSON data source.
prodname_meta_df = spark.read.format("json").load(prodname_input_meta)

In [16]:
# Only the product id and its title are needed; rows with a null in either
# column are dropped.
prodname_meta_usefull_df = prodname_meta_df.select("asin", "title").na.drop()

In [17]:
# Preview the first five (asin, title) rows.
prodname_meta_usefull_df.show(n=5)

+----------+--------------------+
|      asin|               title|
+----------+--------------------+
|0132793040|Kelby Training DV...|
|0321732944|Kelby Training DV...|
|0439886341|Digital Organizer...|
|0558835155|Polaroid Pbm2200 ...|
|0594012015|Barnes &amp; Nobl...|
+----------+--------------------+
only showing top 5 rows



In [18]:
# Attach the average rating to every titled product. The inner join keeps only
# the asin values present on both sides.
prodname_merged_df = prodname_meta_usefull_df.join(
    other=prodname_avg_df,
    on=["asin"],
    how="inner",
)

In [19]:
# Preview the first five joined rows.
prodname_merged_df.show(n=5)

+----------+--------------------+------------------+
|      asin|               title|      avg(overall)|
+----------+--------------------+------------------+
|0594451647|Barnes &amp; Nobl...|               4.2|
|0594481813|Barnes &amp; Nobl...|               4.0|
|1400532620|Barnes &amp; Nobl...|3.6097560975609757|
|1400532736|Nook Simple Touch...| 3.230769230769231|
|1400532655|Barnes &amp; Nobl...|3.8073394495412844|
+----------+--------------------+------------------+
only showing top 5 rows



In [20]:
# Save the joined rows to HDFS as CSV with a header row.
# mode="overwrite" replaces the output of a previous run; with the default save
# mode the write raises an error once the target path already exists, so the
# cell could not be re-executed.
prodname_merged_df.write.csv(prodname_output, mode="overwrite", header=True)

## Задача 3. Поиск среднего рейтинга по названию продукта
Напишите программу, которая выводит средние рейтинги всех продуктов из "prodname_avg_rating.csv", в названии которых встречается введенное при запуске слово: ProdId,Name,Rating

In [21]:
# SQL LIKE pattern for the title search: "%" matches any run of characters, so
# this selects titles containing "adapter". It is compared against the
# lower-cased title.
search_text = "%adapter%"

In [22]:
# Task 3 input: the titled average ratings CSV (same HDFS path as prodname_output in Task 2).
search_input = "hdfs://namenode:9000/prodname_avg_rating"

In [23]:
# Task 3 output: HDFS path the rows matching the search pattern are written to as CSV.
search_output = "hdfs://namenode:9000/searched_avg_rating"

In [24]:
# Read the titled averages back from HDFS; the first line of each file is the header.
search_df = (
    spark.read.format("csv")
    .option("header", True)
    .load(search_input)
)

In [25]:
# Preview the first five rows available for searching.
search_df.show(n=5)

+----------+--------------------+------------------+
|      asin|               title|      avg(overall)|
+----------+--------------------+------------------+
|B0019H58GY|THREE BOTTLES - R...| 4.666666666666667|
|B0019HGU0M|HP Pavilion TX251...|4.0588235294117645|
|B0019HGTLC|Samsung Touch Of ...| 4.277777777777778|
|B0019HDAP0|Samsung T200HD 20...|4.3478260869565215|
|B0019HLE7Q|Kingwin USB 2.0 t...| 4.157894736842105|
+----------+--------------------+------------------+
only showing top 5 rows



In [26]:
# Case-insensitive title search. The title is lower-cased before matching, so
# the LIKE pattern is lower-cased as well; otherwise a pattern containing an
# upper-case letter (e.g. "%Adapter%") could never match any row.
search_result_df = search_df.filter(
    F.lower(F.col("title")).like(search_text.lower())
)

In [27]:
# Preview the first five matching rows.
search_result_df.show(n=5)

+----------+--------------------+------------------+
|      asin|               title|      avg(overall)|
+----------+--------------------+------------------+
|B0019HLE7Q|Kingwin USB 2.0 t...| 4.157894736842105|
|B0019SI266|2.4GHz Bluetooth ...|               3.6|
|B0019RVX3Q|Xgear 3.5mm Headp...|3.5714285714285716|
|B0019SVUQK|eForCity USB Soun...|            3.6875|
|B0019SSSMY|EASYCAP USB 2.0 A...|2.8735632183908044|
+----------+--------------------+------------------+
only showing top 5 rows



In [28]:
# Save the matching rows to HDFS as CSV with a header row.
# mode="overwrite" replaces the result of a previous search; with the default
# save mode the write raises an error once the target path already exists, so
# a second search could not be stored.
search_result_df.write.csv(search_output, mode="overwrite", header=True)