# Stage 3
## 1. Running Spark Apps

In [1]:
from pyspark.sql import SparkSession

# Add here your team number teamx
team = "team31"

# location of your Hive database in HDFS
warehouse = "project/hive/warehouse"

spark = SparkSession.builder \
    .appName("{} - spark ML".format(team)) \
    .master("yarn") \
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883") \
    .config("spark.sql.warehouse.dir", warehouse) \
    .config("spark.sql.avro.compression.codec", "snappy") \
    .enableHiveSupport() \
    .getOrCreate()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/pyspark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.6/site-packages/pyspark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib64/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
db_name = f"{team}_projectdb"

spark.sql("SHOW DATABASES").show()
spark.sql(f"USE {db_name}").show()
spark.sql("SHOW TABLES").show()

## 2. Read Hive tables

In [None]:
print(spark.catalog.listTables(db_name))

In [None]:
mints = spark.read.format("avro").table(f"{db_name}.mints").select("token_id", "timestamp",
                                                                   "nft_address",
                                                                   "transaction_value")
mints.printSchema()
mints.show(5)
mints.count()

In [None]:
transfers = spark.read.format("avro").table(f"{db_name}.transfers").select("token_id", "timestamp",
                                                                           "transaction_value")
transfers.printSchema()
transfers.show(5)
transfers.count()

In [None]:
nfts = spark.read.format("avro").table(f"{db_name}.nfts").select("address", "name")
nfts.printSchema()
nfts.show(5)
nfts.count()

## 3. ML Modeling
### 3.1 Feature Selection & Feature Engineering

In [None]:
from pyspark.sql import functions as F

contract_stats = (
    mints
    .groupBy("nft_address")
    .agg(
        F.countDistinct("token_id").alias("num_tokens"),
        F.avg("transaction_value").alias("avg_mint_price"),
        F.max("transaction_value").alias("max_mint_price"),
        F.min("transaction_value").alias("min_mint_price"),
        F.min("timestamp").alias("first_mint_date"),
        F.max("timestamp").alias("last_mint_date"),
    )
)
contract_stats.show(5)

In [None]:
import pyspark.sql.types as T


def _get_last_n_txs(arr, n):
    return arr[-n:]


get_last_n_txs = F.udf(_get_last_n_txs, T.ArrayType(T.StructType([
    T.StructField("timestamp", T.LongType()),
    T.StructField("transaction_value", T.DoubleType()),
])))

LAST_N = 10

nft_history = (
    transfers
    .groupBy("token_id")
    .agg(
        F.sort_array(F.collect_list(F.struct("timestamp", "transaction_value"))).alias("tx_data"),
    )
    .withColumn("tx_data_except_last", F.expr("slice(tx_data, 1, size(tx_data) - 1)"))
    .withColumn("last_tx", F.element_at("tx_data", -1))
    .withColumn("last_tx_value", F.col("last_tx.transaction_value"))
    .drop("last_tx")
    .drop("tx_data")
    # Overall stats (excluding last tx)
    .withColumn("tx_count", F.size("tx_data_except_last"))
    .withColumn("min_tx_value", F.array_min("tx_data_except_last.transaction_value"))
    .withColumn("max_tx_value", F.array_max("tx_data_except_last.transaction_value"))
    .withColumn("first_tx_timestamp", F.element_at("tx_data_except_last.timestamp", 1))
    .withColumn("last_tx_timestamp", F.element_at("tx_data_except_last.timestamp", -1))
    .withColumn("last_n_transactions", get_last_n_txs("tx_data_except_last", F.lit(LAST_N)))
    .drop("tx_data_except_last")
    # Stats for the last N transactions
    .withColumn("min_n_tx_value", F.array_min("last_n_transactions.transaction_value"))
    .withColumn("max_n_tx_value", F.array_max("last_n_transactions.transaction_value"))
    .withColumn("first_n_tx_timestamp", F.element_at("last_n_transactions.timestamp", 1))
    .drop("last_n_transactions")
)
nft_history.show(5)

In [None]:
features = (
    mints.selectExpr("nft_address", "token_id", "timestamp as mint_timestamp",
                     "transaction_value as mint_tx_value")
    .join(contract_stats, on="nft_address", how="left")
    .join(nft_history, on="token_id", how="left")
    .join(nfts, on=(mints.nft_address == nfts.address), how="left")
    .drop("nft_address", "address")
)
print(features.count())

In [None]:
features.show(5)

In [None]:
features = features.withColumnRenamed("last_tx_value", "label")

In [None]:
filtered_features = features.na.drop()

In [None]:
filtered_features.count()

### 3.2 Feature Extraction Pipeline

In [None]:
date_cols = ["mint_timestamp", "first_mint_date", "last_mint_date", "first_tx_timestamp", "last_tx_timestamp", "first_n_tx_timestamp"]
text_cols = ["name"]
numerical_cols = ["num_tokens", "avg_mint_price", "max_mint_price", "min_mint_price", "tx_count", "min_tx_value", "max_tx_value", "min_n_tx_value", "max_n_tx_value"]

In [None]:
filtered_features_with_dt = filtered_features.selectExpr(
    *(f"from_unixtime({col}) as {col}" if col in date_cols else col for col in
      filtered_features.columns))
filtered_features_with_dt.show(5)

In [None]:
from pyspark.ml.feature import Tokenizer, Word2Vec, VectorAssembler, StandardScaler

# Collection name encoding
tokenizer = Tokenizer(inputCol=text_cols[0], outputCol=text_cols[0] + "_tokens")
word2vec = Word2Vec(vectorSize=16, minCount=1, inputCol=tokenizer.getOutputCol(),
                    outputCol=text_cols[0] + "_w2v")

In [None]:
import math

from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable


# Date encoding
class DateCyclicalEncodingTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable,
                                      DefaultParamsWritable):
    input_col = Param(Params._dummy(), "input_col", "input column name.",
                      typeConverter=TypeConverters.toString)
    output_col = Param(Params._dummy(), "output_col", "output column name.",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, input_col: str = "input", output_col: str = "output"):
        super(DateCyclicalEncodingTransformer, self).__init__()
        self._setDefault(input_col=None, output_col=None)
        kwargs = self._input_kwargs
        self.set_params(**kwargs)

    @keyword_only
    def set_params(self, input_col: str = "input", output_col: str = "output"):
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def get_input_col(self):
        return self.getOrDefault(self.input_col)

    def get_output_col(self):
        return self.getOrDefault(self.output_col)

    def _transform(self, df):
        input_col = self.get_input_col()
        output_col = self.get_output_col()
        df = df.withColumn(output_col + "_year", F.year(F.col(input_col)))
        for col, val_count in (
                ("month", 12), ("day", 31), ("hour", 24), ("minute", 60), ("second", 60)):
            df = (
                df
                .withColumn(output_col + f"_{col}_sin",
                            F.sin(2 * math.pi * F.expr(f"{col}({input_col})") / val_count))
                .withColumn(
                    output_col + f"_{col}_cos",
                    F.cos(2 * math.pi * F.expr(f"{col}({input_col})") / val_count)
                )
            )
        return df

    def get_all_column_names(self):
        output_col = self.get_output_col()
        return [output_col + "_year"] + [output_col + f"_{col}_sin" for col in
                                         ("month", "day", "hour", "minute", "second")] + [
            output_col + f"_{col}_cos" for col in ("month", "day", "hour", "minute", "second")]

In [None]:
date_transformers = [DateCyclicalEncodingTransformer(input_col=col, output_col="encoded_" + col) for
                     col in date_cols]

In [None]:
cols_to_assemble = [text_cols[0] + "_w2v"] + sum(
    (dt.get_all_column_names() for dt in date_transformers), []) + numerical_cols

assembler = VectorAssembler(inputCols=cols_to_assemble, outputCol="raw_features")
scaler = StandardScaler(inputCol="raw_features", outputCol="features", withMean=True, withStd=True)

In [None]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[tokenizer, word2vec] + date_transformers + [assembler, scaler])

In [None]:
pipeline_model = pipeline.fit(filtered_features_with_dt)

In [None]:
transformed_features = pipeline_model.transform(filtered_features_with_dt).select("features",
                                                                                  "label")
transformed_features.show(5)

### 3.3 Train-Test Split

In [None]:
(train_data, test_data) = transformed_features.randomSplit([0.7, 0.3], seed=42)

In [None]:
train_data \
    .coalesce(1) \
    .write \
    .mode("overwrite") \
    .format("json") \
    .save("project/data/train")

In [None]:
test_data \
    .coalesce(1) \
    .write \
    .mode("overwrite") \
    .format("json") \
    .save("project/data/test")

In [3]:
!hdfs dfs -cat project/data/train/*.json > ../data/train.json

cat: `project/data/train/*.json': No such file or directory


In [4]:
!hdfs dfs -cat project/data/test/*.json > ../data/test.json

In [8]:
!gzip -c -9 ../data/train.json > ../data/train.json.gz

In [9]:
!gzip -c -9 ../data/test.json > ../data/test.json.gz

### 3.4 First Model

In [31]:
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors, VectorUDT

to_vector = F.udf(lambda vs: Vectors.dense(vs), VectorUDT())

train_data = spark.read.json("project/data/train").selectExpr("features.values as features",
                                                              "label").withColumn("features",
                                                                                  to_vector(
                                                                                      "features"))
test_data = spark.read.json("project/data/test").selectExpr("features.values as features",
                                                            "label").withColumn("features",
                                                                                to_vector(
                                                                                    "features"))

In [32]:
train_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)


In [33]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression()

lr_model = lr.fit(train_data)

In [34]:
training_summary = lr_model.summary
print("Number of iterations:", training_summary.totalIterations)
print(f"RMSE: {training_summary.rootMeanSquaredError:.2f}")
print(f"R2 score: {training_summary.r2:.5f}")

Number of iterations: 64
RMSE: 1626087951446105856.00
R2 score: 0.01945


In [35]:
predictions = lr_model.transform(test_data)
predictions.show(5)

+--------------------+-------------+--------------------+
|            features|        label|          prediction|
+--------------------+-------------+--------------------+
|[-4.7039155659731...|1.29999995E16|2.517081314455933...|
|[-4.7039155659731...|          0.0|1.089464638468397...|
|[-4.7039155659731...|5.50000009E17|6.157348480292416E16|
|[-4.7039155659731...| 8.0000002E16|8.944542372326313...|
|[-4.7039155659731...| 5.8799999E17|5.286698603387879...|
+--------------------+-------------+--------------------+


In [36]:
from pyspark.ml.evaluation import RegressionEvaluator

rmse_evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction",
                                     metricName="rmse")
r2_evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

In [37]:
rmse = rmse_evaluator.evaluate(predictions)
r2_score = r2_evaluator.evaluate(predictions)
print(f"RMSE: {rmse:.2f}")
print(f"R2 score: {r2_score:.5f}")

RMSE: 1713754268755438592.00
R2 score: 0.02456


In [38]:
import numpy as np
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

grid = ParamGridBuilder()
grid = grid.addGrid(
    lr_model.aggregationDepth, [2, 3, 4]) \
    .addGrid(lr_model.regParam, np.logspace(1e-3, 1e-1)
             ) \
    .build()

cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=r2_evaluator,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

LinearRegressionModel: uid=LinearRegression_4ff1acd7785e, numFeatures=91

In [39]:
from pprint import pprint

pprint(bestModel.extractParamMap())

{Param(parent='LinearRegression_4ff1acd7785e', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_4ff1acd7785e', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LinearRegression_4ff1acd7785e', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LinearRegression_4ff1acd7785e', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35,
 Param(parent='LinearRegression_4ff1acd7785e', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 3,
 Param(parent='LinearRegression_4ff1acd7785e', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'): 'squaredError',
 Param(parent='LinearRegression_4ff1acd7785e', name='maxIter', doc='max number of iterations (>= 

In [40]:
best_predictions = bestModel.transform(test_data)
best_predictions.show(5)

+--------------------+-------------+--------------------+
|            features|        label|          prediction|
+--------------------+-------------+--------------------+
|[-4.7039155659731...|1.29999995E16|2.516882016039310...|
|[-4.7039155659731...|          0.0|1.089342662935894...|
|[-4.7039155659731...|5.50000009E17|6.155812403158675...|
|[-4.7039155659731...| 8.0000002E16|8.942848096075625...|
|[-4.7039155659731...| 5.8799999E17|5.286557720543809...|
+--------------------+-------------+--------------------+


In [41]:
best_rmse = rmse_evaluator.evaluate(best_predictions)
best_r2_score = r2_evaluator.evaluate(best_predictions)
print(f"Best RMSE: {best_rmse:.2f}")
print(f"Best R2 score: {best_r2_score:.5f}")

Best RMSE: 1713754246732851200.00
Best R2 score: 0.02456


In [42]:
print(f"R2 difference: {best_r2_score - r2_score:.10f}")

R2 difference: 0.0000000251


In [43]:
print(f"RMSE difference: {rmse - best_rmse:.2f}")
print(f"RMSE improvement: {(rmse - best_rmse) / rmse:.10%}")

RMSE difference: 22022587392.00
RMSE improvement: 0.0000012850%


In [64]:
print(f"Best RMSE, Gwei: {best_rmse / 10 ** 9:.5f}")
print(f"Best RMSE, ETH {best_rmse / 10 ** 18:.5f}")

Best RMSE, Gwei: 1713754246.73285
Best RMSE, ETH 1.71375


In [44]:
model1 = bestModel
model1.write().overwrite().save("project/models/model1")

In [45]:
!hdfs dfs -get project/models/model1 ../models/model1

In [46]:
best_predictions.select("label", "prediction") \
    .coalesce(1) \
    .write \
    .mode("overwrite") \
    .format("csv") \
    .option("sep", ",") \
    .option("header", "true") \
    .save("project/output/model1_predictions.csv")

In [47]:
!hdfs dfs -cat project/output/model1_predictions.csv/*.csv > ../output/model1_predictions.csv

### 3.5 Second Model

In [48]:
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(seed=42)

gbt_model = gbt.fit(train_data)

In [49]:
predictions = gbt_model.transform(test_data)
predictions.show(5)

+--------------------+-------------+--------------------+
|            features|        label|          prediction|
+--------------------+-------------+--------------------+
|[-4.7039155659731...|1.29999995E16|1.441254833912304...|
|[-4.7039155659731...|          0.0|8.441977145012555...|
|[-4.7039155659731...|5.50000009E17|1.445230285815225...|
|[-4.7039155659731...| 8.0000002E16|1.444734375480371...|
|[-4.7039155659731...| 5.8799999E17|3.863333462341892...|
+--------------------+-------------+--------------------+


In [50]:
rmse = rmse_evaluator.evaluate(predictions)
r2_score = r2_evaluator.evaluate(predictions)
print(f"RMSE: {rmse:.2f}")
print(f"R2 score: {r2_score:.5f}")

RMSE: 863532489246270464.00
R2 score: 0.75234


In [51]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

grid = ParamGridBuilder()
grid = (grid.addGrid(gbt_model.maxDepth, [2, 5, 10, 15])
        .addGrid(gbt_model.maxBins, [32, 128])
        .build())

cv = CrossValidator(estimator=gbt,
                    estimatorParamMaps=grid,
                    evaluator=r2_evaluator,
                    parallelism=5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

GBTRegressionModel: uid=GBTRegressor_0a672806dfda, numTrees=20, numFeatures=91

In [52]:
from pprint import pprint

pprint(bestModel.extractParamMap())

{Param(parent='GBTRegressor_0a672806dfda', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number of features. When n is in the range (1, number of features), use n features). default = 'auto'"): 'all',
 Param(parent='GBTRegressor_0a672806dfda', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,
 Param(parent='GBTRegressor_0a672806dfda', name='cacheNodeIds', d

In [53]:
best_predictions = bestModel.transform(test_data)
best_predictions.show(5)

+--------------------+-------------+--------------------+
|            features|        label|          prediction|
+--------------------+-------------+--------------------+
|[-4.7039155659731...|1.29999995E16|1.592359448321518...|
|[-4.7039155659731...|          0.0|1.365887221229401...|
|[-4.7039155659731...|5.50000009E17|1.332288089284610...|
|[-4.7039155659731...| 8.0000002E16|2.203297352529219...|
|[-4.7039155659731...| 5.8799999E17|3.056335138725716...|
+--------------------+-------------+--------------------+


In [54]:
best_rmse_2 = rmse_evaluator.evaluate(best_predictions)
best_r2_score_2 = r2_evaluator.evaluate(best_predictions)
print(f"Best RMSE: {best_rmse_2:.2f}")
print(f"Best R2 score: {best_r2_score_2:.5f}")

Best RMSE: 831145378236056576.00
Best R2 score: 0.77057


In [55]:
print(f"R2 difference: {best_r2_score_2 - r2_score:.10f}")
print(f"R2 improvement: {(best_r2_score_2 - r2_score) / r2_score:.5%}")

R2 difference: 0.0182289766
R2 improvement: 2.42298%


In [56]:
print(f"RMSE difference: {rmse - best_rmse_2:.2f}")
print(f"RMSE improvement: {(rmse - best_rmse_2) / rmse:.10%}")

RMSE difference: 32387111010213888.00
RMSE improvement: 3.7505376362%


In [63]:
print(f"Best RMSE, Gwei: {best_rmse_2 / 10 ** 9:.5f}")
print(f"Best RMSE, ETH {best_rmse_2 / 10 ** 18:.5f}")

Best RMSE, Gwei: 831145378.23606
Best RMSE, ETH 0.83115


In [57]:
model2 = bestModel
model2.write().overwrite().save("project/models/model2")

In [58]:
!hdfs dfs -get project/models/model2 ../models/model2

In [59]:
best_predictions.select("label", "prediction")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/model2_predictions.csv")

In [60]:
!hdfs dfs -cat project/output/model2_predictions.csv/*.csv > ../output/model2_predictions.csv

### 3.6 Comparing Models

In [65]:
model1_predictions = spark.read.csv("project/output/model1_predictions.csv", header=True, inferSchema=True)
model2_predictions = spark.read.csv("project/output/model2_predictions.csv", header=True, inferSchema=True)

In [66]:
best_rmse = rmse_evaluator.evaluate(model1_predictions)
best_rmse_2 = rmse_evaluator.evaluate(model2_predictions)

In [67]:
best_r2_score = r2_evaluator.evaluate(model1_predictions)
best_r2_score_2 = r2_evaluator.evaluate(model2_predictions)

In [68]:
models = [[str(model1), best_rmse, best_r2_score], [str(model2), best_rmse_2, best_r2_score_2]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2"])
df.show(truncate=False)

# Save it to HDFS
df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/output/evaluation.csv")

+------------------------------------------------------------------------------+----------------------+--------------------+
|model                                                                         |RMSE                  |R2                  |
+------------------------------------------------------------------------------+----------------------+--------------------+
|LinearRegressionModel: uid=LinearRegression_4ff1acd7785e, numFeatures=91      |1.71375424673285222E18|0.024561243833836843|
|GBTRegressionModel: uid=GBTRegressor_0a672806dfda, numTrees=20, numFeatures=91|8.3114537823604992E17 |0.7705664592343757  |
+------------------------------------------------------------------------------+----------------------+--------------------+


In [69]:
!hdfs dfs -cat project/output/evaluation.csv/*.csv > ../output/evaluation.csv