# Задание 4

Условие см. <a href="https://docs.google.com/document/d/1Y2DCQ0WxmLFtyu33ddhCQpAxhGmo8tjYhbuQ-sdlhuQ/edit#">здесь</a>

In [124]:
# Locations of the Amazon review datasets; all three files live in one directory.
DATA_DIR = "/datasets/amazon"
BIG_TRAIN = DATA_DIR + "/all_reviews_5_core_train.json"  # large training set, roughly 20 million records
SMALL_TRAIN = DATA_DIR + "/all_reviews_5_core_train_small.json"  # small training set, 1 million records
TEST = DATA_DIR + "/all_reviews_5_core_test_features.json"  # test set, roughly 83 million records
# File read by the loading cell below.
TRAIN = SMALL_TRAIN

In [3]:
import os
import sys

# Point this Python process at the Spark installation and the interpreter
# PySpark should use, then make the bundled PySpark/py4j archives importable.
SPARK_HOME = "/usr/hdp/current/spark2-client"
PYSPARK_PYTHON = "/opt/conda/envs/dsenv/bin/python"
PYSPARK_HOME = os.path.join(SPARK_HOME, "python/lib")

os.environ.update({"PYSPARK_PYTHON": PYSPARK_PYTHON, "SPARK_HOME": SPARK_HOME})

# Prepend both archives in one step; pyspark.zip ends up ahead of the py4j archive.
sys.path[:0] = [
    os.path.join(PYSPARK_HOME, archive_name)
    for archive_name in ("pyspark.zip", "py4j-0.10.7-src.zip")
]

from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Limit Spark's log output to warnings and errors.
spark.sparkContext.setLogLevel('WARN')


In [5]:
from pyspark.ml import Estimator, Transformer
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.types import *

In [6]:
# Explicit schema for the review records.
# Types follow what Spark infers from the file: `overall` is a double (5.0)
# and `verified` is a boolean. The `id` column is included because the
# train/test split joins on it.
# NOTE(review): nothing in this notebook passes `schema` to the reader yet;
# `spark.read.json(path, schema=schema)` would skip the inference pass.
schema = StructType([
    StructField("id", LongType()),  # integer record id; LongType assumed - TODO confirm
    StructField("overall", DoubleType()),
    StructField("vote", StringType()),
    StructField("verified", BooleanType()),
    StructField("reviewTime", StringType()),
    StructField("reviewerID", StringType()),
    StructField("asin", StringType()),
    StructField("reviewerName", StringType()),
    StructField("reviewText", StringType()),
    StructField("summary", StringType()),
    StructField("unixReviewTime", IntegerType())
])

Некоторые столбцы вряд ли понадобятся

In [125]:
# Load the reviews and discard columns that are unlikely to be needed.
# `id` is deliberately kept for now - it is still needed further down.
dataset = (
    spark.read.json(TRAIN)
    .drop("image", "reviewerName", "unixReviewTime")
    .cache()
)

In [54]:
# Peek at two records, one field per line, to see which columns are present.
dataset.show(2, vertical=True)

-RECORD 0--------------------------
 asin       | B00005MDZ8           
 id         | 6500                 
 overall    | 5.0                  
 reviewText | quick shipping, g... 
 reviewTime | 10 23, 2014          
 reviewerID | AEZ4DZCUL021H        
 summary    | great product        
 verified   | true                 
 vote       | null                 
-RECORD 1--------------------------
 asin       | B000DZE0XK           
 id         | 42580                
 overall    | 5.0                  
 reviewText | Most delicious Ever! 
 reviewTime | 02 13, 2016          
 reviewerID | A3UPMJ5WQFHGLN       
 summary    | Five Stars           
 verified   | true                 
 vote       | null                 
only showing top 2 rows



In [23]:
# Inspect the column types Spark inferred when reading the JSON file.
dataset.schema

StructType(List(StructField(asin,StringType,true),StructField(overall,DoubleType,true),StructField(reviewText,StringType,true),StructField(reviewTime,StringType,true),StructField(reviewerID,StringType,true),StructField(summary,StringType,true),StructField(verified,BooleanType,true),StructField(vote,StringType,true)))

Здесь всяческие преобразования над датасетом. Как минимум — конвертируем vote и verified в Int.

In [126]:
from pyspark.sql import functions as f

# Turn `verified` and `vote` into integer columns.
# verified: true -> 1, false or missing -> 0.
verified = f.when(dataset.verified, 1).otherwise(0)
# vote is a string column. A value written with thousands separators
# (e.g. "1,234") would cast to null, so commas are stripped before the cast.
# Missing or still-unparsable values become 0, so the column holds no nulls.
vote = f.coalesce(
    f.regexp_replace(dataset.vote, ",", "").cast(IntegerType()),
    f.lit(0),
)

dataset = dataset.withColumn("verified", verified)
dataset = dataset.withColumn("vote", vote)
dataset.show(1)

KeyboardInterrupt: 

Продолжим с цинизмом. Удалим всё лишнее, кроме чисел и reviewText

In [None]:
# Keep only the numeric columns and the review text; cache the slimmer frame.
unused_columns = ["asin", "reviewTime", "reviewerID", "summary"]
dataset = dataset.drop(*unused_columns).cache()
dataset.show(1)

## Обработаем тексты

In [None]:
from pyspark.ml.feature import *
# Split the review text on non-word characters, keeping only tokens that are
# at least two characters long; the tokens go into the "words" column.
tokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="words",
    pattern='\\W',
    minTokenLength=2,
)
dataset2 = tokenizer.transform(dataset)

In [None]:
# Show the token arrays of two rows, untruncated and one field per line.
dataset2.select("words").show(2, False, True)

In [None]:
# Number of rows in the tokenised dataset (count() is an action, so this triggers the computation).
dataset2.count()

In [None]:
%%time
# Bag-of-words features: learn a vocabulary capped at 50 000 terms from the
# token column and encode every review as a term-count vector in column "cv".
# The cap used to be written as `5 * 10e3`, which is the float 50000.0
# (10e3 == 1e4); `vocabSize` is an integer parameter, so the same value is
# now given as an int literal.
cv = CountVectorizer(vocabSize=50000, inputCol=tokenizer.getOutputCol(), outputCol="cv")
cv_model = cv.fit(dataset2)
dataset2 = cv_model.transform(dataset2)

In [None]:
# Show the count vectors of two rows, untruncated and one field per line.
dataset2.select("cv").show(2, False, True)

# ML

In [115]:
# Class balance: number of reviews per rating value.
dataset2.groupBy("overall").count().show()

+-------+--------+
|overall|   count|
+-------+--------+
|    1.0| 1781920|
|    4.0| 3215822|
|    3.0| 1604121|
|    2.0| 1054679|
|    5.0|12996336|
+-------+--------+



In [116]:
# Stratified split by rating: sample `train_frac` of every rating class for
# training, then send every row whose id was not sampled to the test set.
# The variable was previously called `test_frac`, but 0.8 is the share that
# goes to the TRAINING set; the test set receives the remaining ~20%.
train_frac = 0.8
rating_values = [1.0, 2.0, 3.0, 4.0, 5.0]
train = dataset2.sampleBy(
    "overall",
    fractions={rating: train_frac for rating in rating_values},
    seed=5757,
)
# Anti-join on `id` keeps exactly the rows that did not land in `train`.
test = dataset2.join(train, on="id", how="leftanti")

In [117]:
# Per-rating row counts of the train split, then of the test split.
train.groupBy("overall").count().show()
test.groupBy("overall").count().show()

+-------+--------+
|overall|   count|
+-------+--------+
|    1.0| 1425736|
|    4.0| 2572497|
|    3.0| 1282758|
|    2.0|  843440|
|    5.0|10397881|
+-------+--------+

+-------+-------+
|overall|  count|
+-------+-------+
|    1.0| 356184|
|    4.0| 643325|
|    3.0| 321363|
|    2.0| 211239|
|    5.0|2598455|
+-------+-------+



In [118]:
# The id column was only needed for the split; drop it and cache both parts.
train = train.drop("id").cache()
# Bring the test split down to the same number of partitions as the train split.
n_train_partitions = train.rdd.getNumPartitions()
test = test.drop("id").coalesce(n_train_partitions).cache()
n_train_partitions, test.rdd.getNumPartitions()

(100, 100)

In [69]:
from pyspark.ml.regression import LinearRegression

In [119]:
# Linear regression from the count-vector column to the star rating, capped at 25 iterations.
lr = LinearRegression(featuresCol=cv.getOutputCol(), labelCol="overall", maxIter=25)

In [120]:
%%time
# Fit the regression on the training split.
lr_model = lr.fit(train)

CPU times: user 90.2 ms, sys: 38.5 ms, total: 129 ms
Wall time: 9min 46s


In [121]:
# Score the held-out split with the fitted model.
predictions = lr_model.transform(test)

In [122]:
# Compare the true rating with the model output for five rows.
predictions.select("overall", "prediction").show(5, False, True)

-RECORD 0------------------------
 overall    | 1.0                
 prediction | 2.909890589945199  
-RECORD 1------------------------
 overall    | 5.0                
 prediction | 4.310055731339988  
-RECORD 2------------------------
 overall    | 5.0                
 prediction | 4.1937918153804254 
-RECORD 3------------------------
 overall    | 5.0                
 prediction | 4.2805712601666395 
-RECORD 4------------------------
 overall    | 4.0                
 prediction | 4.520410780292606  
only showing top 5 rows



In [123]:
from pyspark.ml.evaluation import RegressionEvaluator
# Root-mean-square error of the predicted rating against the true rating.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="overall",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
rmse

0.9731297882932891

# Pipeline

In [127]:
from pyspark.ml import Pipeline

# Chain the three stages so raw review text goes in and a rating prediction
# comes out: tokenise -> count-vectorise -> linear regression.
pipeline_stages = [tokenizer, cv, lr]
pipeline = Pipeline(stages=pipeline_stages)

In [133]:
%%time

# End-to-end run on the large dataset: load, clean, split, fit the pipeline,
# then report RMSE on the held-out part (the last expression is the cell output).
TRAIN = BIG_TRAIN

# Load and drop the unused columns. Only the final projection below is cached:
# caching this intermediate frame as well would store a second copy of the
# data that no action ever reads on its own.
dataset = spark.read.json(TRAIN).drop("image", "reviewerName", "unixReviewTime")

# verified: true -> 1, false or missing -> 0.
verified = f.when(dataset.verified, 1).otherwise(0)
# vote is a string column; strip thousands separators so a value such as
# "1,234" casts to 1234 instead of null. Missing or unparsable values become 0.
vote = f.coalesce(
    f.regexp_replace(dataset.vote, ",", "").cast(IntegerType()),
    f.lit(0),
)

dataset = dataset.withColumn("verified", verified)
dataset = dataset.withColumn("vote", vote)
dataset = dataset.drop("asin", "reviewTime", "reviewerID", "summary").cache()

# Stratified split by rating. 0.8 is the share sampled into the TRAINING set
# (the variable was previously misnamed `test_frac`); the rest becomes the test set.
train_frac = 0.8
rating_values = [1.0, 2.0, 3.0, 4.0, 5.0]
train = dataset.sampleBy(
    "overall",
    fractions={rating: train_frac for rating in rating_values},
    seed=5757,
)
# Anti-join on `id` keeps exactly the rows that did not land in `train`.
test = dataset.join(train, on="id", how="leftanti")

# The id column was only needed for the split.
train = train.drop("id").cache()
test = test.drop("id").coalesce(train.rdd.getNumPartitions()).cache()

pipeline_model = pipeline.fit(train)
predictions = pipeline_model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="overall", predictionCol="prediction", metricName="rmse")
evaluator.evaluate(predictions)

CPU times: user 158 ms, sys: 48.9 ms, total: 207 ms
Wall time: 9min 27s


0.9730072412432708

In [134]:
# Display which training file TRAIN currently points to.
TRAIN

'/datasets/amazon/all_reviews_5_core_train.json'

In [135]:
# Persist the fitted pipeline, replacing any previous copy at this path.
pipeline_model.write().overwrite().save('pipeline_model.mdl')

In [None]:
# Shut down the Spark session created at the top of the notebook.
# `sc` is never defined here (only `spark` is), so `sc.stop()` would raise
# NameError on a fresh kernel; stopping the session also stops its context.
spark.stop()