In [1]:
!pip install pyspark findspark



In [5]:
def warn(*args, **kwargs):
    """Accept any positional/keyword arguments, do nothing, and return None.

    Used in this notebook as a silent replacement for ``warnings.warn``.
    """
    return None
# Silence every Python warning for the rest of the session: warnings.warn is
# replaced by the no-op `warn` defined above, and a blanket "ignore" filter is installed.
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')
# Initialise findspark before pyspark is imported in the following cell.
import findspark
findspark.init()

In [6]:
from pyspark.ml.classification import RandomForestClassificationModel
from pyspark.sql import SparkSession

# Initialise the SparkSession (master "local") and load the persisted
# random-forest classification model from HDFS.
model_path = "hdfs://namenode:9000/user/root/random_forest_model"
spark = (
    SparkSession.builder
    .appName("rfmodel")
    .master("local")
    .getOrCreate()
)
rf_model = RandomForestClassificationModel.load(model_path)

In [9]:
# Read the test set from HDFS (first row is the header, column types are
# inferred) and preview the first five rows.
df_test = spark.read.csv(
    "hdfs://namenode:9000/user/root/test.csv",
    header=True,
    inferSchema=True,
)
df_test.show(5)

+---------------+--------------------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+------------------+--------------+
|    Artist Name|          Track Name|Popularity|danceability|energy| key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_in min/ms|time_signature|
+---------------+--------------------+----------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+------------------+--------------+
|    David Bowie|Space Oddity - 20...|      73.0|        0.31| 0.403|NULL| -13.664|   1|     0.0326|      0.0726|         9.27E-5|   0.139|  0.466| 134.48|          318027.0|             4|
|    Crimson Sun| Essence of Creation|      34.0|       0.511| 0.955| 1.0|  -5.059|   1|      0.129|      4.0E-4|          8.7E-6|   0.263|  0.291|151.937|          220413.0|             4|
|           P!nk|    Raise Your Glass|      78.0| 

In [12]:
from pyspark.sql.functions import when, col,sqrt, cbrt,log, expr
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
# Normalise the column names: lower-case them and replace spaces with underscores.
# The loop variable is deliberately not called `col`, to avoid confusion with
# pyspark.sql.functions.col.
new_columns = [
    raw_name.lower().replace(" ", "_")
    for raw_name in df_test.columns
]
df_test = df_test.toDF(*new_columns)
# Bring the duration column onto a single unit: any value below 30 is
# multiplied by 60000, every other value is kept as-is.
# NOTE(review): presumably values < 30 are minutes and the rest milliseconds — confirm.
duration_col = col("duration_in_min/ms")
df_test = df_test.withColumn(
    "duration_in_min/ms",
    when(duration_col < 30, duration_col * 60000).otherwise(duration_col),
)
# Remove duplicates
# Two rows count as duplicates when they agree on every column other than "class".
columns_to_check = [name for name in df_test.columns if name != "class"]
df_test = df_test.dropDuplicates(subset=columns_to_check)
# Replace nulls with the median
# Cast popularity to float, then take the approximate median (relative error
# 0.001) of popularity and instrumentalness from the data frame itself.
df_test = df_test.withColumn("popularity", col("popularity").cast("float"))
popularity_median = df_test.approxQuantile("popularity", [0.5], 0.001)[0]
instrumentalness_median = df_test.approxQuantile("instrumentalness", [0.5], 0.001)[0]

# Nulls in the two numeric columns take their median; a null key becomes -1.
df_test = (
    df_test
    .fillna({
        "popularity": popularity_median,
        "instrumentalness": instrumentalness_median,
    })
    .fillna({"key": -1})
)

In [16]:
# Drop "energy" and append one transformed column per skewed raw feature.
# The withColumn order is kept as-is because later cells build the feature
# vector from df_test.columns in column order.
# epsilon is added before the reciprocal and the logarithms so an input of
# exactly 0 does not produce a division by zero or log(0).
epsilon = 1e-6
df_test = (
    df_test.drop("energy")
    .withColumn("duration_in_ms_trans", sqrt(col("duration_in_min/ms")))
    .withColumn("loudness_trans", cbrt(col("loudness")))
    .withColumn("speechiness_trans", expr(f"1 / (speechiness + {epsilon})"))
    .withColumn("acousticness_trans", cbrt(col("acousticness")))
    .withColumn("instrumentalness_trans", log(col("instrumentalness") + epsilon))
    .withColumn("liveness_trans", log(col("liveness") + epsilon))
    .withColumn("tempo_trans", cbrt(col("tempo")))
)
# Index the two string columns: each one gets a "<name>_index" companion
# column produced by a StringIndexer fitted on df_test.
columns = ["artist_name", "track_name"]
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in columns
]
pipeline = Pipeline(stages=indexers)
df_test = pipeline.fit(df_test).transform(df_test)

In [17]:
# Print the column names and types of df_test after the cleaning and
# feature-engineering cells above.
df_test.printSchema()

root
 |-- artist_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: float (nullable = false)
 |-- danceability: double (nullable = true)
 |-- key: double (nullable = false)
 |-- loudness: double (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = false)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- duration_in_min/ms: double (nullable = true)
 |-- time_signature: integer (nullable = true)
 |-- duration_in_ms_trans: double (nullable = true)
 |-- loudness_trans: double (nullable = true)
 |-- speechiness_trans: double (nullable = true)
 |-- acousticness_trans: double (nullable = true)
 |-- instrumentalness_trans: double (nullable = true)
 |-- liveness_trans: double (nullable = true)
 |-- tempo_trans: double (nullable = true)
 |-- artist_name_index

In [20]:
# Drop the artist/track columns with their indices, and every raw column that
# was replaced by a *_trans column earlier.
df_test = df_test.drop(
    "artist_name", "track_name", "artist_name_index", "track_name_index",
    "duration_in_min/ms", "instrumentalness", "loudness", "speechiness",
    "liveness", "tempo", "acousticness",
)
# Replace danceability by its StringIndexer index.
# NOTE(review): danceability is a double column and the indexer is fitted on
# the test data here — confirm this matches the preprocessing used in training.
indexer = StringIndexer(inputCol="danceability", outputCol="danceability_index")
df_test = indexer.fit(df_test).transform(df_test).drop("danceability")

In [21]:
from pyspark.ml.feature import VectorAssembler, StandardScaler 
from pyspark.sql.functions import col
# Collect every column except the label as a model feature.
# The comparison must be an equality test: `name not in "class"` is a
# substring check against the string "class" and would also silently drop
# columns named e.g. "c", "la" or "ass".
feature_cols = [name for name in df_test.columns if name != "class"]

# Pack the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features_unscaled")
df_vec = assembler.transform(df_test)

# Standardise the vector (zero mean, unit standard deviation) into "features".
# NOTE(review): the scaler is fitted on the test data itself rather than
# reusing the statistics from training — confirm this is intended.
scaler = StandardScaler(
    inputCol="features_unscaled",
    outputCol="features",
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(df_vec)
df_scaled = scaler_model.transform(df_vec)

In [22]:
# Score the scaled test rows with the loaded random-forest model; the result
# carries the extra rawPrediction, probability and prediction columns shown below.
predictions = rf_model.transform(df_scaled)
predictions.show()

+----------+----+----+-------+--------------+--------------------+-------------------+------------------+--------------------+----------------------+-------------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|popularity| key|mode|valence|time_signature|duration_in_ms_trans|     loudness_trans| speechiness_trans|  acousticness_trans|instrumentalness_trans|     liveness_trans|       tempo_trans|danceability_index|   features_unscaled|            features|       rawPrediction|         probability|prediction|
+----------+----+----+-------+--------------+--------------------+-------------------+------------------+--------------------+----------------------+-------------------+------------------+------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      61.0|10.0|   0|  0.544|             4|  401.95024567724795| -1.678033413122095|3.831

In [24]:
# Number of rows assigned to each predicted class, smallest class first.
(
    predictions
    .groupBy("prediction")
    .count()
    .orderBy("count")
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|       2.0|   30|
|       1.0|   33|
|       9.0|   71|
|       3.0|  134|
|       7.0|  233|
|      10.0|  453|
|       8.0|  911|
|       6.0| 1004|
|       5.0| 4519|
+----------+-----+

