In [1]:
import math

from pyspark.sql.functions import *
from pyspark.sql import * 
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import StringType, LongType, DoubleType
from pyspark.sql.functions import length, size

In [2]:
# Создание сессии

spark =  SparkSession.builder \
    .master("local") \
    .appName("TF-IDF") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

# 1. Загрузка данных

In [3]:
# Чтение данных в DataFrame

df = spark.read \
  .option("header", "true") \
  .option("inferSchema", "true") \
  .csv("tripadvisor_hotel_reviews.csv")

# 2. Очистка и токенизация отзывов

In [4]:
# Удаление колонки с рейтингом

df_droped = df.drop(col("Rating"))

In [5]:
# Функция для очистки и токенизации текста

def text_cleaner(string: str):
    string = trim(string)
    reg_rep_1 = regexp_replace(string, "[-+.^':;,&#!?/\\|*|[0-9]]", "")
    reg_rep_2 = regexp_replace(reg_rep_1, "  ", " ")
    return split(reg_rep_2, " ")

In [6]:
# Токенизация текста

df_clean = df_droped.select("Review", text_cleaner(col("Review")).alias("document")).drop("Review")

In [7]:
df_clean.show()

+--------------------+
|            document|
+--------------------+
|[nice, hotel, exp...|
|[ok, nothing, spe...|
|[nice, rooms, not...|
|[unique, great, s...|
|[great, stay, gre...|
|[love, monaco, st...|
|[cozy, stay, rain...|
|[excellent, staff...|
|[hotel, stayed, h...|
|[excellent, staye...|
|[poor, value, sta...|
|[nice, value, sea...|
|[nice, hotel, goo...|
|[nice, hotel, not...|
|[great, hotel, ni...|
|[horrible, custom...|
|[disappointed, sa...|
|[fantastic, stay,...|
|[good, choice, ho...|
|[hmmmmm, say, rea...|
+--------------------+
only showing top 20 rows



In [8]:
# Токены одного из отзывов

df_clean.select("document").collect()[1]

Row(document=['ok', 'nothing', 'special', 'charge', 'diamond', 'member', 'hilton', 'decided', 'chain', 'shot', 'th', 'anniversary', 'seattle', 'start', 'booked', 'suite', 'paid', 'extra', 'website', 'description', 'not', 'suite', 'bedroom', 'bathroom', 'standard', 'hotel', 'room', 'took', 'printed', 'reservation', 'desk', 'showed', 'said', 'things', 'like', 'tv', 'couch', 'ect', 'desk', 'clerk', 'told', 'oh', 'mixed', 'suites', 'description', 'kimpton', 'website', 'sorry', 'free', 'breakfast', 'got', 'kidding', 'embassy', 'suits', 'sitting', 'room', 'bathroom', 'bedroom', 'unlike', 'kimpton', 'calls', 'suite', 'day', 'stay', 'offer', 'correct', 'false', 'advertising', 'send', 'kimpton', 'preferred', 'guest', 'website', 'email', 'asking', 'failure', 'provide', 'suite', 'advertised', 'website', 'reservation', 'description', 'furnished', 'hard', 'copy', 'reservation', 'printout', 'website', 'desk', 'manager', 'duty', 'did', 'not', 'reply', 'solution', 'send', 'email', 'trip', 'guest', 'su

# 3. Добавление уникального id для каждого документа

In [9]:
# Назначение id в диапазоне [0, count(tokens) - 1]

df_token = df_clean.withColumn("doc_id", monotonically_increasing_id())

In [10]:
df_token.show()

+--------------------+------+
|            document|doc_id|
+--------------------+------+
|[nice, hotel, exp...|     0|
|[ok, nothing, spe...|     1|
|[nice, rooms, not...|     2|
|[unique, great, s...|     3|
|[great, stay, gre...|     4|
|[love, monaco, st...|     5|
|[cozy, stay, rain...|     6|
|[excellent, staff...|     7|
|[hotel, stayed, h...|     8|
|[excellent, staye...|     9|
|[poor, value, sta...|    10|
|[nice, value, sea...|    11|
|[nice, hotel, goo...|    12|
|[nice, hotel, not...|    13|
|[great, hotel, ni...|    14|
|[horrible, custom...|    15|
|[disappointed, sa...|    16|
|[fantastic, stay,...|    17|
|[good, choice, ho...|    18|
|[hmmmmm, say, rea...|    19|
+--------------------+------+
only showing top 20 rows



# 4. Сопоставление каждого токена с документом, в котором он встречается

In [11]:
# "Разворот" каждого документа на его токены

columns = df_token.select("document", "doc_id", explode_outer("document").alias("token"))

In [12]:
columns.show()

+--------------------+------+-----------+
|            document|doc_id|      token|
+--------------------+------+-----------+
|[nice, hotel, exp...|     0|       nice|
|[nice, hotel, exp...|     0|      hotel|
|[nice, hotel, exp...|     0|  expensive|
|[nice, hotel, exp...|     0|    parking|
|[nice, hotel, exp...|     0|        got|
|[nice, hotel, exp...|     0|       good|
|[nice, hotel, exp...|     0|       deal|
|[nice, hotel, exp...|     0|       stay|
|[nice, hotel, exp...|     0|      hotel|
|[nice, hotel, exp...|     0|anniversary|
|[nice, hotel, exp...|     0|    arrived|
|[nice, hotel, exp...|     0|       late|
|[nice, hotel, exp...|     0|    evening|
|[nice, hotel, exp...|     0|       took|
|[nice, hotel, exp...|     0|     advice|
|[nice, hotel, exp...|     0|   previous|
|[nice, hotel, exp...|     0|    reviews|
|[nice, hotel, exp...|     0|        did|
|[nice, hotel, exp...|     0|      valet|
|[nice, hotel, exp...|     0|    parking|
+--------------------+------+-----

# 5. Подсчёт количества вхождений слов в документы (TF)

In [13]:
df_tf = columns.groupBy("doc_id", "token").agg((count("document")).alias("tf"))

In [14]:
df_tf.show()

+------+----------+---+
|doc_id|     token| tf|
+------+----------+---+
|     0|      room|  3|
|     1|    better|  2|
|     6|attractive|  1|
|     6|  positive|  1|
|     7| concierge|  2|
|    10|        nt|  2|
|    12|     clean|  1|
|    12|   concert|  1|
|    15|      stay|  2|
|    16|      desk|  6|
|    19|       bed|  1|
|    30| excellent|  1|
|    32|    really|  1|
|    44| cringeshe|  1|
|    46|      mind|  1|
|    51|    pretty|  1|
|    52|     steer|  1|
|    54|     tacky|  1|
|    58|   staying|  1|
|    63|       etc|  1|
+------+----------+---+
only showing top 20 rows



# 6. Подсчет  числа документов из коллекции, в которых встречается данное слово (DF)

In [15]:
# Расчёт df

df_df = columns.groupBy("token").agg(countDistinct("doc_id").alias("df"))

In [16]:
df_df.show()

+-----------+----+
|      token|  df|
+-----------+----+
|     travel|1330|
|   priority|  75|
|        art| 312|
|       hope| 541|
|      pools| 819|
|    barrier| 164|
| lifeguards|  19|
|  standards| 578|
|   everyday| 514|
|  timetable|   6|
|     online| 360|
|  traveling| 436|
|hereevening|   1|
|     gloria|   6|
|     waters|  96|
|  connected| 154|
|  smokefree|   8|
|handicapped|  24|
|    jewelry|  56|
|   cabinish|   1|
+-----------+----+
only showing top 20 rows



## 6.1 Отбор 100 самых встречаемых слов

In [17]:
# Сортировка по убыванию параметра df и отбор 100 слов с самым большим df

df_df_100 = df_df.orderBy(col("df").desc()).limit(100)

In [18]:
df_df_100.show()

+---------+-----+
|    token|   df|
+---------+-----+
|    hotel|16323|
|     room|14052|
|      not|12124|
|    staff|11526|
|    great|11021|
|     stay|10095|
|     good| 9280|
|   stayed| 8552|
|       nt| 8383|
|    rooms| 8339|
| location| 8170|
|     just| 7736|
|    clean| 7650|
|     nice| 7419|
|      did| 7207|
|breakfast| 7113|
|       no| 6818|
|    night| 6476|
|  service| 6231|
|     time| 6151|
+---------+-----+
only showing top 20 rows



# 7. Расчет инверсии частоты, с которой каждое слово встречается в документах коллекции (IDF)

In [19]:
# Функция для расчета idf, по df и количеству документов - d

text_count = df_droped.count()

def calc_idf(df, d=text_count):
    return math.log((d + 1) / (df + 1))

In [20]:
# Создание udf-функции из функции для расчета idf

calc_idf_udf = udf(calc_idf, DoubleType())

In [21]:
# Расчёт idf

df_idf = df_df_100.withColumn("idf", calc_idf_udf(col("df")))

In [22]:
df_idf.show()

+---------+-----+-------------------+
|    token|   df|                idf|
+---------+-----+-------------------+
|    hotel|16323| 0.2273981485333966|
|     room|14052|0.37719866995834783|
|      not|12124| 0.5247651292534089|
|    staff|11526| 0.5753424564580804|
|    great|11021| 0.6201412906159123|
|     stay|10095| 0.7078952602780986|
|     good| 9280| 0.7920652664637001|
|   stayed| 8552| 0.8737524674783113|
|       nt| 8383| 0.8937094384982694|
|    rooms| 8339| 0.8989713497063005|
| location| 8170| 0.9194432656743855|
|     just| 7736| 0.9740205505096384|
|    clean| 7650|  0.985198207827241|
|     nice| 7419| 1.0158555088976666|
|      did| 7207|  1.044843045770919|
|breakfast| 7113| 1.0579698925506924|
|       no| 6818|   1.10032173253844|
|    night| 6476| 1.1517771258761758|
|  service| 6231| 1.1903372575085087|
|     time| 6151|  1.203257333873613|
+---------+-----+-------------------+
only showing top 20 rows



# 8. Расчёт TF-IDF

In [23]:
# Получение только тех документов и tf, где встречаются 100 самых частых токенов

df_tf_100 = df_tf.join(df_df_100, "token")

In [24]:
df_tf_100.show()

+---------+------+---+-----+
|    token|doc_id| tf|   df|
+---------+------+---+-----+
|     room|     0|  3|14052|
|   better|     1|  2| 3244|
|       nt|    10|  2| 8383|
|    clean|    12|  1| 7650|
|     stay|    15|  2|10095|
|     desk|    16|  6| 3192|
|      bed|    19|  1| 3781|
|excellent|    30|  1| 4423|
|   really|    32|  1| 4883|
|   street|    70|  1| 2768|
|      bed|    80|  1| 3781|
|      day|   116|  1| 5778|
|     just|   125|  1| 7736|
|   little|   133|  1| 4403|
|   hotels|   146|  1| 3243|
|      did|   153|  1| 7207|
|  walking|   173|  1| 2567|
|       no|   176|  1| 6818|
| bathroom|   189|  2| 4253|
|      day|   207|  1| 5778|
+---------+------+---+-----+
only showing top 20 rows



In [25]:
# Расчёт tf-idf

df_tf_idf = df_tf_100.join(df_idf, ["token"], "left").withColumn("tf_idf", col("tf") * col("idf"))

In [26]:
df_tf_idf.show()

+---------+------+---+-----+-----+-------------------+------------------+
|    token|doc_id| tf|   df|   df|                idf|            tf_idf|
+---------+------+---+-----+-----+-------------------+------------------+
|     room|     0|  3|14052|14052|0.37719866995834783|1.1315960098750435|
|   better|     1|  2| 3244| 3244| 1.8429192159209025| 3.685838431841805|
|       nt|    10|  2| 8383| 8383| 0.8937094384982694| 1.787418876996539|
|    clean|    12|  1| 7650| 7650|  0.985198207827241| 0.985198207827241|
|     stay|    15|  2|10095|10095| 0.7078952602780986|1.4157905205561971|
|     desk|    16|  6| 3192| 3192|  1.859073652344311|11.154441914065867|
|      bed|    19|  1| 3781| 3781|   1.68978159584069|  1.68978159584069|
|excellent|    30|  1| 4423| 4423| 1.5329903018569222|1.5329903018569222|
|   really|    32|  1| 4883| 4883| 1.4340700098284975|1.4340700098284975|
|   street|    70|  1| 2768| 2768|  2.001548321888131| 2.001548321888131|
|      bed|    80|  1| 3781| 3781|   1

In [27]:
# Удаление ненужных столбцов

df_tf_idf = df_tf_idf.drop("df", "tf", "idf")

In [28]:
df_tf_idf.show()

+---------+------+------------------+
|    token|doc_id|            tf_idf|
+---------+------+------------------+
|     room|     0|1.1315960098750435|
|   better|     1| 3.685838431841805|
|       nt|    10| 1.787418876996539|
|    clean|    12| 0.985198207827241|
|     stay|    15|1.4157905205561971|
|     desk|    16|11.154441914065867|
|      bed|    19|  1.68978159584069|
|excellent|    30|1.5329903018569222|
|   really|    32|1.4340700098284975|
|   street|    70| 2.001548321888131|
|      bed|    80|  1.68978159584069|
|      day|   116|1.2658039087413149|
|     just|   125|0.9740205505096384|
|   little|   133|1.5375213472165223|
|   hotels|   146|1.8432274298237894|
|      did|   153| 1.044843045770919|
|  walking|   173| 2.076907180249241|
|       no|   176|  1.10032173253844|
| bathroom|   189| 3.144349698597821|
|      day|   207|1.2658039087413149|
+---------+------+------------------+
only showing top 20 rows



In [29]:
# Объединение датасетов, для получения столбца с токенами документов

df_join = df_token.join(df_tf_idf, df_token.doc_id == df_tf_idf.doc_id)

In [30]:
# Добавление колонки с количество токеном, для перевода tf в относительную шкалу

df_join = df_join.withColumn('length_tokens', size("document"))

In [31]:
# Деление tf-idf на количество токенов

df_join = df_join.withColumn('tf_idf_right', col("tf_idf") / col("length_tokens"))

In [32]:
df_join.show()

+--------------------+------+---------+------+-------------------+-------------+--------------------+
|            document|doc_id|    token|doc_id|             tf_idf|length_tokens|        tf_idf_right|
+--------------------+------+---------+------+-------------------+-------------+--------------------+
|[loved, stayed, w...|    26|    staff|    26| 0.5753424564580804|           42| 0.01369862991566858|
|[loved, stayed, w...|    26|    loved|    26| 2.0945858964180353|           42|0.049871092771857986|
|[loved, stayed, w...|    26|     days|    26|  2.013172136018414|           42| 0.04793266990520033|
|[loved, stayed, w...|    26| bathroom|    26| 1.5721748492989105|           42| 0.03743273450711691|
|[loved, stayed, w...|    26|     stay|    26| 0.7078952602780986|           42| 0.01685464905424044|
|[loved, stayed, w...|    26|    think|    26| 2.1681756943621897|           42|0.051623230818147375|
|[loved, stayed, w...|    26|  service|    26| 1.1903372575085087|           42|0.

In [33]:
# Удаление лишних столбцов

df_join = df_join.drop("length_tokens", "doc_id", "doc_id", "tf_idf", "df")

In [34]:
df_join.show()

+--------------------+---------+--------------------+
|            document|    token|        tf_idf_right|
+--------------------+---------+--------------------+
|[loved, stayed, w...|    staff| 0.01369862991566858|
|[loved, stayed, w...|    loved|0.049871092771857986|
|[loved, stayed, w...|     days| 0.04793266990520033|
|[loved, stayed, w...| bathroom| 0.03743273450711691|
|[loved, stayed, w...|     stay| 0.01685464905424044|
|[loved, stayed, w...|    think|0.051623230818147375|
|[loved, stayed, w...|  service|0.028341363274012113|
|[loved, stayed, w...|    hotel|0.005414241631747538|
|[loved, stayed, w...|    clean| 0.02345710018636288|
|[loved, stayed, w...|     room|0.008980920713293997|
|[loved, stayed, w...| friendly| 0.05978452460835978|
|[loved, stayed, w...|    going| 0.04925625869779894|
|[loved, stayed, w...|   really| 0.03414452404353566|
|[loved, stayed, w...|wonderful| 0.09089691247158928|
|[loved, stayed, w...|   stayed| 0.02080363017805503|
|[good, hotel, not...| locat

In [35]:
# Пайвотинг таблицы, для перевода каждого из 100 слов в столбцы

df_final = df_join.groupBy("document").pivot("token").agg(first("tf_idf_right"))

In [36]:
# Заполнение значений "null" на 0

df_final = df_final.na.fill(0)

In [37]:
df_final.show(2)

+--------------------+---+-------------------+-------+-------------------+---+--------+-------------------+---------+-------------------+----+----+------+--------------------+---+------+--------------------+------+-----+----+-------------------+-----+--------------------+---+----+----------+----+---+--------------------+--------------------+-------------------+-----+----+------------------+--------+-----+--------------------+---+--------------------+-------+--------------------+--------------------+--------------------+-------------------+--------------------+------+-----+-------------------+---+-----+------+----+-------+-------+----+-------------------+--------------------+-------------------+------+---+--------------------+--------------------+------+-------+-------------------+--------------------+-----+-------+-----+-----+--------------------+---------+------+----------+--------------------+-------+--------------------+--------------------+--------------------+-------------------+-

In [38]:
df_final.select("document", "hotel", "location").show()

+--------------------+--------------------+--------------------+
|            document|               hotel|            location|
+--------------------+--------------------+--------------------+
|[fantastic, value...|0.004502933634324685| 0.00910339867004342|
|[nice, hotel, int...| 0.02014920303460476|                 0.0|
|[really, unique, ...|0.007579938284446554|0.015324054427906426|
|[neat, comfortabl...|0.014212384283337287|                 0.0|
|[average, problem...|3.954750409276462...|                 0.0|
|[okay, not, great...|                 0.0|                 0.0|
|[little, flashy, ...|                 0.0| 0.02138240152731129|
|[nice, place, jus...|0.008976242705265655|0.006048968853120958|
|[good, trip, stay...| 0.00879814265158975|                 0.0|
|[clean, bit, far,...|0.009345129391783422| 0.01259511322841624|
|[solid, hotel, sp...|0.010336279478790756|                 0.0|
|[dump, christmas,...|                 0.0|                 0.0|
|[need, tender, lo...|0.0