In [1]:
import sys, getpass

import pandas as pd
import numpy as np

sys.path.insert(0, '/home/jovyan/k8s-qiwidata/git_projects/ml')
sys.path.insert(0, '/home/jovyan/k8s-qiwidata/git_projects/ml/scoring')
from scoring_main_v2 import prepare_data_for_scoring, scoring_bank 
from prepare_data_for_scoring_v8 import ParquetCreator
import utils_ml
from qiwi_ml import cv_auto_iterations, base_cv
from utils_spark import stop_and_init_spark_session

sys.path.insert(0, '/home/jovyan/k8s-qiwidata/git_projects/ml/scoring_pipeline/customization_platform')
import feature_collection

In [14]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [3]:
# Display the active Spark session object as the cell output.
# NOTE(review): `spark` is not created in any visible cell — presumably it is
# injected by the kernel or set up via utils_spark; confirm before Restart & Run All.
spark

In [6]:
# Load the raw CSV; the first line of the file is treated as the column header.
df = spark.read.option("header", True).csv("/user/vi.nechaeva/data.csv")

In [16]:
# Term frequency (TF): how many times a token occurs in a document
# divided by the total number of tokens in that document.
# compute_TF
compute_TF = (
    df.select("Review")
    # Synthetic per-row id; used below as the document key.
    .withColumn("RowID", F.monotonically_increasing_id())
    .withColumn("Review", F.lower(F.col("Review")))
    # NOTE(review): inside a character class "|" is a literal, so this pattern
    # strips "*", ".", "'", "," and also "|". Kept as-is to preserve results.
    .withColumn("Review", F.regexp_replace(F.col("Review"), "[*|.|'|,]", ""))
    # One row per token occurrence.
    .withColumn("Token", F.explode(F.split(F.col("Review"), " ")))
    # Splitting on a single space yields empty tokens for consecutive spaces;
    # drop them before aggregating so they are never counted.
    .filter(F.col("Token") != "")
    .groupBy("RowID", "Token")
    .agg(F.count("Token").alias("TokensInDoc"))
    # Total number of (non-empty) tokens in the document.
    .withColumn("SumInDoc", F.sum("TokensInDoc").over(Window.partitionBy("RowID")))
    .withColumn("TF", F.col("TokensInDoc") / F.col("SumInDoc"))
)

# Sort only for display: a global sort placed before the window stage would be
# undone by the window's repartitioning, so it is not part of the pipeline.
compute_TF.orderBy("RowID").show()

+-----+-----------+-----------+--------+--------------------+
|RowID|      Token|TokensInDoc|SumInDoc|                  TF|
+-----+-----------+-----------+--------+--------------------+
|    0|       room|          3|      87|0.034482758620689655|
|    0|      night|          2|      87|0.022988505747126436|
|    0|       took|          1|      87|0.011494252873563218|
|    0|        bed|          1|      87|0.011494252873563218|
|    0|      taken|          1|      87|0.011494252873563218|
|    0|     people|          1|      87|0.011494252873563218|
|    0|    staying|          1|      87|0.011494252873563218|
|    0|      music|          1|      87|0.011494252873563218|
|    0|       high|          1|      87|0.011494252873563218|
|    0|        got|          1|      87|0.011494252873563218|
|    0|      doors|          1|      87|0.011494252873563218|
|    0|   products|          1|      87|0.011494252873563218|
|    0|       woke|          1|      87|0.011494252873563218|
|    0| 

In [29]:
# Inverse document frequency (IDF):
# log2(number of documents / number of documents that contain the token).
compute_IDF = (
    compute_TF
    .groupBy("Token")
    # compute_TF holds one row per (RowID, Token), so counting RowID per token
    # gives the number of documents containing that token.
    .agg(F.count("RowID").alias("TokenInCorpus"))
    # df.count() runs a Spark job eagerly and counts every row of the input,
    # including rows that produced no tokens.
    .withColumn("Documents", F.lit(df.count()))
    .withColumn("IDF", F.log2(F.col("Documents") / F.col("TokenInCorpus")))
)

# Restored: the saved cell output below comes from this call.
compute_IDF.show()

+---------------+-------------+---------+------------------+
|          Token|TokenInCorpus|Documents|               IDF|
+---------------+-------------+---------+------------------+
|         travel|         1330|    20491| 3.945492241506015|
|         online|          317|    20491|  6.01436374175516|
|          pools|          818|    20491| 4.646745738952818|
|      traveling|          436|    20491|5.5545184471176405|
|        jewelry|           56|    20491| 8.515347849836964|
|          spoil|           59|    20491| 8.440059722532727|
|        barrier|          163|    20491| 6.973974617663489|
|      standards|          575|    20491|  5.15528462606283|
|           jamb|            2|    20491|13.322702771894567|
|           hope|          540|    20491| 5.245887174843737|
|reconditionning|            1|    20491|14.322702771894567|
|    handicapped|           24|    20491| 9.737740271173411|
|            675|            1|    20491|14.322702771894567|
|    formalities|       

In [31]:
# TF-IDF score per (token, document); keep the 100 highest-scoring pairs.
tf_idf = (
    compute_TF
    # compute_IDF is derived from compute_TF, so every token matches on both
    # sides; an inner join returns the same rows as a right join followed by
    # dropping null rows, without the extra null scan.
    .join(compute_IDF, on="Token", how="inner")
    .withColumn("TFIDF", F.col("TF") * F.col("IDF"))
    .select("Token", "RowID", "TFIDF")
    # Many pairs share the same score, so add tie-break keys to make the
    # top-100 cut reproducible between runs.
    .orderBy(F.col("TFIDF").desc(), F.col("Token"), F.col("RowID"))
    .limit(100)
)


In [32]:
# Wide view of the selected pairs: one row per token, one column per RowID
# value, each cell holding the first TFIDF value for that (token, document).
tfidf_matrix = (
    tf_idf
    .groupBy("Token")
    .pivot("RowID")
    .agg(F.first("TFIDF"))
)
tfidf_matrix.show()

+------------+----+----+-----------------+----+----+----+----+----+----+------------------+----+------------------+------------------+------------------+----+----+------------------+----+----+------------------+----+----+----+-----------------+------------------+----+----+----+----+----+----+----+----+----+----+----+------------------+----+----+----+----+-----+-----+-----+-----------------+------------------+-----+-----+-----+------------------+-----+------------------+-----+-----+-----------------+-----+-----+-----+------------------+-----+-----+-----+-----+------------------+-----------------+-----+-----+-----+-----+-----+-----+-----+------------------+-----+------------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|       Token|  69| 217|              480| 527| 936|1096|1188|1501|1566|              1881|2419|              2473|              2536|              2687|3355|4164|              4271|4524|4740|              5096|5174|5186|5192|             5642|             