In [None]:
# Mount Google Drive at /content/gdrive so the project CSV stored under MyDrive can be read.
# Requires the Google Colab runtime (the `google.colab` package).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Force a runtime restart by sending signal 9 to the current kernel process.
# NOTE(review): this kills the kernel mid-notebook, so a top-to-bottom "Run all"
# stops here; run this cell manually only when a restart is actually needed.
import os
os.kill(os.getpid(), 9)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, mean
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import numpy as np

In [None]:
# Start (or reuse) a local Spark session with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("MovieDataProcessing")
    .getOrCreate()
)

# Read the movie dataset; the first row is the header and column types are inferred.
file_path = "./gdrive/MyDrive/CS5344_AY2425Sem2_Project/final_1.csv"
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

In [None]:
# Redistribute the rows across 4 partitions.
df=df.repartition(4)

In [None]:
# Display the partition count of the repartitioned DataFrame.
df.rdd.getNumPartitions()

4

In [None]:
# Preview the first 5 rows without truncating long cell values.
df.show(5,truncate=False)

+-----+----------+------------+----------+-------+-------+-------+------------+-------------+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Row count of the loaded data, before any filtering.
df.count()

72449

In [None]:
# Keep only rows whose poster_features string is not the empty list "[]".
df_cleaned = df.filter(col("poster_features") != "[]")

In [None]:
# Row count after dropping rows with empty poster features.
df_cleaned.count()

72134

In [None]:
# Replace zeros in revenue and budget with the mean of that column's strictly positive values.
# NOTE(review): the mean is taken over df_cleaned as it stands in this cell — confirm
# that computing it before any train/test split is intended.
for col_name in ("revenue", "budget"):
    positive_rows = df_cleaned.filter(col(col_name) > 0)
    avg_value = positive_rows.select(mean(col_name)).first()[0]
    imputed = when(col(col_name) == 0, avg_value).otherwise(col(col_name))
    df_cleaned = df_cleaned.withColumn(col_name, imputed)

In [None]:
# Cyclic encoding of the release month: map it to an angle with a 12-month period
# and store the sine and cosine of that angle as two columns.
from pyspark.sql.functions import sin, cos, lit
month_angle = 2 * np.pi * col("release_month") / 12
df_cleaned = (
    df_cleaned
    .withColumn("month_sin", sin(month_angle))
    .withColumn("month_cos", cos(month_angle))
)

In [None]:
# 将str变为vector
from pyspark.ml.linalg import Vectors, VectorUDT

def str_to_vector(s):
    """Parse a bracketed, comma-separated string such as "[0.1, 0.2]" into a dense vector.

    Surrounding '[' / ']' characters are stripped, the rest is split on commas,
    and each token is converted to float. A blank token becomes 0.0, so the
    string "[]" yields the one-element vector [0.0].
    """
    tokens = (piece.strip() for piece in s.strip("[]").split(","))
    return Vectors.dense([float(tok) if tok != '' else 0.0 for tok in tokens])

# Wrap the parser as a Spark UDF whose return type is an ML vector.
str_to_vector_udf = udf(str_to_vector, VectorUDT())

# Parse the three stringified embedding columns into vector columns.
df_vectorized = (
    df_cleaned
    .withColumn("poster_vec", str_to_vector_udf("poster_features"))
    .withColumn("title_vec", str_to_vector_udf("title_vec_str"))
    .withColumn("text_vec", str_to_vector_udf("processed_text_vec_str"))
)

#df_vectorized.select("poster_vec", "title_vec", "text_vec").show(1, truncate=False)

# 0. Dataset split & normalization

In [None]:
# Split the data into train and test sets with weights 0.8 / 0.2, using a fixed seed.
train_df, test_df = df_vectorized.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Numeric columns to standardise together.
cols_to_normalize = [
    "popularity", "vote_count", "revenue",
    "budget", "runtime", "release_year",
]

# Pack those columns into a single vector column on both splits.
assembler = VectorAssembler(
    inputCols=cols_to_normalize,
    outputCol="features_vec",
)
train_df_assembled = assembler.transform(train_df)
test_df_assembled = assembler.transform(test_df)

# Fit the scaler on the training split only, then apply the same fitted
# model to both splits.
scaler = StandardScaler(
    inputCol="features_vec",
    outputCol="features_scaled",
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(train_df_assembled)

train_df_scaled, test_df_scaled = (
    scaler_model.transform(split)
    for split in (train_df_assembled, test_df_assembled)
)

# df_scaled.show(5)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is never executed.
# It would standardise `popularity` into a `popularity_scaled` column using the
# training-set mean and standard deviation.
'''
from pyspark.sql.functions import col, mean, stddev

# 获取均值和标准差
stats = train_df_scaled.select(mean("popularity"), stddev("popularity")).first()
mean_pop, std_pop = stats

# 直接生成数值型的 scaled 标签列
train_df_scaled = train_df_scaled.withColumn("popularity_scaled", (col("popularity") - mean_pop) / std_pop)
test_df_scaled = test_df_scaled.withColumn("popularity_scaled", (col("popularity") - mean_pop) / std_pop)
'''

In [None]:
# Multimodal feature vector: scaled numerics, cyclic month, language flag,
# and the poster / title / text vectors, concatenated in this order.
all_features = [
    "features_scaled",
    "month_sin",
    "month_cos",
    "original_language_en",
    "poster_vec",
    "title_vec",
    "text_vec",
]

final_assembler = VectorAssembler(
    inputCols=all_features,
    outputCol="multimodal_features",
)

# Apply the same assembler to both splits.
train_multimodal_df, test_multimodal_df = (
    final_assembler.transform(split)
    for split in (train_df_scaled, test_df_scaled)
)

train_multimodal_df.select("multimodal_features").show(3, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Feature dimension of the first training sample.
train_multimodal_df.select("multimodal_features").head(1)[0][0].size # 6 + 2 + 1 + 256 + 100 + 50 = 415

415

In [None]:
# Baseline feature vector without the image and text embeddings:
# scaled numerics, cyclic month and the language flag only.
features_no_image_text = [
    "features_scaled",
    "month_sin",
    "month_cos",
    "original_language_en",
]

final_assembler1 = VectorAssembler(
    inputCols=features_no_image_text,
    outputCol="unimodal_features",
)

# Apply the same assembler to both splits.
train_unimodal_df, test_unimodal_df = (
    final_assembler1.transform(split)
    for split in (train_df_scaled, test_df_scaled)
)

train_unimodal_df.select("unimodal_features").show(3, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|unimodal_features                                                                                                                                                           |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.008283826457737748,-0.020238418427619663,-3.3469069546529094E-4,9.567172383198434E-4,-0.5647522759837335,-0.37239990273513285,-0.8660254037844386,0.5000000000000001,0.0]|
|[-0.07287257058153666,-0.004876169312001701,-3.3469069546529094E-4,9.567172383198434E-4,-0.5360764931698466,-0.45230292471113237,-0.8660254037844386,0.5000000000000001,0.0]|
|[0.7453778933653715,3.8381117278739016,-0.5013581272915679,-1.191920645493475,0.7543337334550623,-2.2501209191711213,0.86602

In [None]:
# Feature dimension of the first training sample.
train_unimodal_df.select("unimodal_features").head(1)[0][0].size # 6 + 2 + 1 = 9

9

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is never executed.
# NOTE(review): it refers to `train_final_df`, which is not defined in the visible
# notebook — presumably an older name for train_multimodal_df; confirm before re-enabling.
'''
from pyspark.sql.functions import size, col
from pyspark.ml.functions import vector_to_array  # Spark 3.0+
train_final_df.select(size(vector_to_array("multimodal_features")).alias("dim")).distinct().show()
'''

+---+
|dim|
+---+
|415|
+---+



# 1. LinearRegression

## 1.0 Multimodal

In [None]:
from pyspark.ml.regression import LinearRegression

# Lasso-style linear regression on the multimodal vector:
# elasticNetParam=1.0 selects a pure L1 penalty, regParam sets its strength (tunable).
lr1 = LinearRegression(
    featuresCol="multimodal_features",
    labelCol="vote_average",
    regParam=0.01,
    elasticNetParam=1.0,
)

# Fit on the training split.
lr_multimodal = lr1.fit(train_multimodal_df)

In [None]:
# Training-set summary of the fitted multimodal model.
training_summary = lr_multimodal.summary

# Report the coefficients, intercept, R-squared and training RMSE, one per line.
for label, value in (
    ("Coefficients:", lr_multimodal.coefficients),
    ("Intercept:", lr_multimodal.intercept),
    ("R2:", training_summary.r2),
    ("Root Mean Squared Error (RMSE):", training_summary.rootMeanSquaredError),
):
    print(label, value)

Coefficients: [0.036211815430778685,0.11743222248983863,-0.003899850918107383,-0.018551812698645784,0.07838609839154106,-0.028264609567966174,-0.013103886982044531,0.0,-0.2973257976825829,0.022959337935982864,0.0,0.0,0.0,0.0,-0.07715871185563951,0.0,-0.01881869493085992,0.0,0.00011857106026220511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0686264242979499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07290943300011157,0.048716766162528694,0.02798853563432283,-0.029627891431983286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018749891732566346,0.0,0.002257370670881792,-0.054901946022847314,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07008903582546283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.04954187883982267,0.0,0.0,0.10130769888731447,0.0,0.0,0.0,-0.0004285825469469185,0.0,-0.02676771278015212,0.0,0.0,-0.037154590168950646,0.02860230777920719,0.0,0.0,0.0,-0.02437351850824598,0.03715765208472191,-0.01825881817359696,0.0,-0.023135194388867276,0.0,0.009143473849690274,0.0,0.0,0.0,-0.034385827410991276,0.0,0

In [None]:
import numpy as np

# Rank the inputs of the multimodal linear model. Scalar inputs are ranked by
# their coefficient; each embedding block is summarised by the L2 norm of its
# coefficients.
weight = lr_multimodal.coefficients

# Layout of `multimodal_features`: 9 scalar slots followed by the poster (256),
# title (100) and text (50) blocks. Every block starts exactly where the
# previous one ends, so no coefficient is counted twice or assigned to the
# wrong block.
n_scalar, poster_dim, title_dim, text_dim = 9, 256, 100, 50
poster_end = n_scalar + poster_dim   # 265
title_end = poster_end + title_dim   # 365
text_end = title_end + text_dim      # 415
assert len(weight) == text_end, (
    f"expected {text_end} coefficients, got {len(weight)}"
)

scalar_weight = weight[:n_scalar].tolist()

# Poster vector coefficients: indices 9-264.
poster_coefficients = weight[n_scalar:poster_end]
l2_norm0 = np.linalg.norm(poster_coefficients, ord=2)
scalar_weight.append(l2_norm0)

# Title vector coefficients: indices 265-364.
title_coefficients = weight[poster_end:title_end]
l2_norm1 = np.linalg.norm(title_coefficients, ord=2)
scalar_weight.append(l2_norm1)

# Text vector coefficients: indices 365-414.
text_coefficients = weight[title_end:text_end]
l2_norm2 = np.linalg.norm(text_coefficients, ord=2)
scalar_weight.append(l2_norm2)

feature_names = [
    "popularity", "vote_count", "revenue", "budget",
    "runtime", "release_year", "month_sin", "month_cos",
    "original_language_en", "poster_vec", "title_vec", "text_vec"
]

# Pair each name with its weight and sort by absolute value, largest first.
feature_importance = sorted(
    zip(feature_names, scalar_weight),
    key=lambda x: abs(x[1]),
    reverse=True
)

# Print the ranking.
print("特征重要性（按系数绝对值降序排列）：")
for name, coef in feature_importance:
    print(f"{name:>20}: {coef: .6f}")

特征重要性（按系数绝对值降序排列）：
            text_vec:  3.038327
           title_vec:  0.960210
          poster_vec:  0.416626
original_language_en: -0.297326
          vote_count:  0.117432
             runtime:  0.078386
          popularity:  0.036212
        release_year: -0.028265
              budget: -0.018552
           month_sin: -0.013104
             revenue: -0.003900
           month_cos:  0.000000


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the multimodal linear model on the test split.
predictions_lr = lr_multimodal.transform(test_multimodal_df)

# One evaluator per metric, both comparing `prediction` against `vote_average`.
evaluator_mse, evaluator_rmse = (
    RegressionEvaluator(labelCol="vote_average", predictionCol="prediction", metricName=metric)
    for metric in ("mse", "rmse")
)

mse_lr = evaluator_mse.evaluate(predictions_lr)
rmse_lr = evaluator_rmse.evaluate(predictions_lr)

# Report the test-set errors.
print(f"Mean Squared Error (MSE): {mse_lr}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr}")

Mean Squared Error (MSE): 0.7914551761844484
Root Mean Squared Error (RMSE): 0.8896376656731932


## 1.1 Unimodal

In [None]:
# Baseline linear regression on the features without image and text vectors.
# NOTE(review): unlike lr1, no regParam / elasticNetParam is passed here, so the
# two linear models are not fitted with the same settings — confirm this is
# intended for the multimodal vs. unimodal comparison.
lr2 = LinearRegression(featuresCol="unimodal_features", labelCol="vote_average")

# Fit on the training split.
lr_unimodal = lr2.fit(train_unimodal_df)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the unimodal linear model on the test split.
predictions_lr_uni = lr_unimodal.transform(test_unimodal_df)

# One evaluator per metric, both comparing `prediction` against `vote_average`.
evaluator_mse, evaluator_rmse = (
    RegressionEvaluator(labelCol="vote_average", predictionCol="prediction", metricName=metric)
    for metric in ("mse", "rmse")
)

mse_lr_uni = evaluator_mse.evaluate(predictions_lr_uni)
rmse_lr_uni = evaluator_rmse.evaluate(predictions_lr_uni)

# Report the test-set errors.
print(f"Mean Squared Error (MSE): {mse_lr_uni}")
print(f"Root Mean Squared Error (RMSE): {rmse_lr_uni}")

Mean Squared Error (MSE): 0.976209359918536
Root Mean Squared Error (RMSE): 0.9880330763281845


In [None]:
# Training-set summary of the fitted unimodal model.
training_summary1 = lr_unimodal.summary

# Report the coefficients, intercept, R-squared and training RMSE, one per line.
for label, value in (
    ("Coefficients:", lr_unimodal.coefficients),
    ("Intercept:", lr_unimodal.intercept),
    ("R2:", training_summary1.r2),
    ("Root Mean Squared Error (RMSE):", training_summary1.rootMeanSquaredError),
):
    print(label, value)

Coefficients: [0.06014786454508966,0.22574706295732525,-0.09031579696779088,-0.00034711962888952364,0.0811020079807665,-0.053038781819447614,-0.02795187177412439,0.019999829673116847,-0.30546081791494106]
Intercept: 6.334300964794619
R2: 0.06510122962903742
Root Mean Squared Error (RMSE): 0.9760099710251111


In [None]:
coefficients = lr_unimodal.coefficients

# One name per coefficient of the unimodal model.
feature_names = [
    "popularity", "vote_count", "revenue",
    "budget", "runtime", "release_year",
    "month_sin", "month_cos", "original_language_en",
]

# Pair each name with its coefficient and order by magnitude, largest first.
feature_importance = sorted(
    zip(feature_names, coefficients),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

# Print the ranking.
print("特征重要性（按系数绝对值降序排列）：")
for name, coef in feature_importance:
    print(f"{name:>20}: {coef: .6f}")

特征重要性（按系数绝对值降序排列）：
original_language_en: -0.305461
          vote_count:  0.225747
             revenue: -0.090316
             runtime:  0.081102
          popularity:  0.060148
        release_year: -0.053039
           month_sin: -0.027952
           month_cos:  0.020000
              budget: -0.000347


# 2. RandomForestRegressor

## 2.0 Multimodal

In [None]:
from pyspark.ml.regression import RandomForestRegressor

# Random forest on the multimodal vector, fitted on the training split.
rf1 = RandomForestRegressor(
    featuresCol="multimodal_features",
    labelCol="vote_average",
)
rf_multimodel = rf1.fit(train_multimodal_df)

In [None]:
# Score the multimodal random forest on the test split.
predictions = rf_multimodel.transform(test_multimodal_df)

from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator per metric, both comparing `prediction` against `vote_average`.
evaluator_mse, evaluator_rmse = (
    RegressionEvaluator(labelCol="vote_average", predictionCol="prediction", metricName=metric)
    for metric in ("mse", "rmse")
)

mse = evaluator_mse.evaluate(predictions)
rmse = evaluator_rmse.evaluate(predictions)

# Report the test-set errors.
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error (MSE): 0.8205453202957366
Root Mean Squared Error (RMSE): 0.9058395665324719


In [None]:
# Number of consecutive slots each named feature occupies in the multimodal
# feature vector, listed in vector order.
feature_structure = {
    # Nine scalar features, one slot each (indices 0-8).
    **dict.fromkeys(
        [
            "popularity", "vote_count", "revenue",
            "budget", "runtime", "release_year",
            "month_sin", "month_cos", "original_language_en",
        ],
        1,
    ),
    # Vector features.
    "poster_vec": 256,  # indices 9-264
    "title_vec": 100,   # indices 265-364
    "text_vec": 50,     # indices 365-414
}

In [None]:
from pyspark.ml.linalg import SparseVector
import numpy as np

# Feature importances of the fitted multimodal random forest.
feature_importances = rf_multimodel.featureImportances
# Alias under the name used for the parser call below.
sv = feature_importances  # the actual importance vector

def parse_feature_importance(sv, feature_structure):
    """Group a flat feature-importance vector by the named features it covers.

    Args:
        sv: Importance vector. Either a sparse vector exposing parallel
            ``indices`` / ``values`` sequences (every stored entry is used),
            or a dense one (an object with ``toArray()``, a ``numpy.ndarray``
            or a plain sequence), in which case only non-zero entries are used.
        feature_structure: Mapping of feature name to the number of consecutive
            slots it occupies. Its iteration order must match the order of the
            features inside the vector.

    Returns:
        dict keyed by feature name. A width-1 feature maps to its importance
        (``0.0`` when it has no entry). A wider feature maps to a dict with
        ``size``, ``non_zero_count``, ``avg_importance``, ``max_importance``,
        ``l2_norm``, ``indices`` (relative to the start of its block) and
        ``values``.
    """
    # Normalise the input to a list of (absolute index, value) pairs once,
    # so each feature block below only filters that list.
    if hasattr(sv, "indices") and hasattr(sv, "values"):
        entries = list(zip(sv.indices, sv.values))
    else:
        dense = np.asarray(sv.toArray() if hasattr(sv, "toArray") else sv, dtype=float)
        entries = [(idx, val) for idx, val in enumerate(dense) if val != 0]

    results = {}
    start_idx = 0

    for feature, dim in feature_structure.items():
        end_idx = start_idx + dim

        # Entries that fall inside this feature's [start_idx, end_idx) block.
        in_block = [(idx - start_idx, val) for idx, val in entries
                    if start_idx <= idx < end_idx]
        indices = [rel_idx for rel_idx, _ in in_block]
        values = [val for _, val in in_block]

        if dim == 1:  # scalar feature
            results[feature] = values[0] if values else 0.0
        else:         # vector feature
            results[feature] = {
                "size": dim,
                "non_zero_count": len(values),
                "avg_importance": np.mean(values) if values else 0.0,
                "max_importance": max(values) if values else 0.0,
                "l2_norm": np.linalg.norm(values) if values else 0.0,
                "indices": indices,
                "values": values
            }
        start_idx = end_idx

    return results

# Group the random-forest importances by named feature.
importance_dict = parse_feature_importance(sv, feature_structure)

In [None]:
print("=== 标量特征重要性 ===")
# The first 9 entries of feature_structure are the scalar features.
scalar_importances = {
    name: importance_dict[name]
    for name in list(feature_structure)[:9]
}

# Largest importance first.
sorted_importances = sorted(
    scalar_importances.items(),
    key=lambda item: abs(item[1]),
    reverse=True,
)

# Right-align names to the longest one so the values line up.
max_name_length = max(map(len, scalar_importances))
for feature, importance in sorted_importances:
    print(f"{feature:>{max_name_length}}: {importance:.4f}")

print("\n=== 向量特征统计 ===")
for feature in ("poster_vec", "title_vec", "text_vec"):
    stats = importance_dict[feature]
    print(
        f"{feature:>10}: avg={stats['avg_importance']:.4f}, "
        f"max={stats['max_importance']:.4f}, "
        f"l2={stats['l2_norm']:.4f}, "
        f"non-zero={stats['non_zero_count']}/{stats['size']} "
        f"({stats['non_zero_count']/stats['size']:.1%})"
    )

=== 标量特征重要性 ===
             runtime: 0.2522
          vote_count: 0.0876
original_language_en: 0.0629
        release_year: 0.0264
          popularity: 0.0125
              budget: 0.0045
           month_sin: 0.0000
             revenue: 0.0000
           month_cos: 0.0000

=== 向量特征统计 ===
poster_vec: avg=0.0029, max=0.0516, l2=0.0714, non-zero=66/256 (25.8%)
 title_vec: avg=0.0001, max=0.0002, l2=0.0004, non-zero=8/100 (8.0%)
  text_vec: avg=0.0103, max=0.1059, l2=0.1331, non-zero=35/50 (70.0%)


## 2.1 Unimodal

In [None]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Random forest on the features without image and text vectors.
rf2 = RandomForestRegressor(
    featuresCol="unimodal_features",
    labelCol="vote_average",
)
rf_unimodel = rf2.fit(train_unimodal_df)

# Predict on the test split.
predictions_uni = rf_unimodel.transform(test_unimodal_df)

# One evaluator per metric, both comparing `prediction` against `vote_average`.
evaluator_mse, evaluator_rmse = (
    RegressionEvaluator(labelCol="vote_average", predictionCol="prediction", metricName=metric)
    for metric in ("mse", "rmse")
)

mse_uni = evaluator_mse.evaluate(predictions_uni)
rmse_uni = evaluator_rmse.evaluate(predictions_uni)

# Report the test-set errors.
print(f"Mean Squared Error (MSE): {mse_uni}")
print(f"Root Mean Squared Error (RMSE): {rmse_uni}")

Mean Squared Error (MSE): 0.8725616180272369
Root Mean Squared Error (RMSE): 0.9341100674049269


In [None]:
# Display the indices stored in the unimodal forest's importance vector.
rf_unimodel.featureImportances.indices



array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=int32)

In [None]:
# Position in the unimodal feature vector -> feature name.
feature_index_mapping = dict(enumerate([
    "popularity",
    "vote_count",
    "revenue",
    "budget",
    "runtime",
    "release_year",
    "month_sin",
    "month_cos",
    "original_language_en",
]))

# Importance vector of the unimodal random forest.
sv_data = rf_unimodel.featureImportances

# (feature_name, importance) pairs for every stored entry.
feature_importances_uni = [
    (feature_index_mapping[idx], val)
    for idx, val in zip(sv_data.indices, sv_data.values)
]

# Largest importance first.
sorted_importances_uni = sorted(
    feature_importances_uni,
    key=lambda pair: abs(pair[1]),
    reverse=True,
)

# Right-align names to the longest one so the values line up.
max_name_length = max(map(len, feature_index_mapping.values()))
for feature, importance in sorted_importances_uni:
    print(f"{feature:>{max_name_length}}: {importance:.4f}")

             runtime: 0.3329
          vote_count: 0.2344
          popularity: 0.1378
        release_year: 0.1212
original_language_en: 0.0975
              budget: 0.0589
             revenue: 0.0151
           month_sin: 0.0012
           month_cos: 0.0011
