1、标签，rating大于3为正样本，其余的为负样本 \
2、将每个用户timestamp最大的前5个样本作为测试集，其后的5个样本作为验证集，其余的样本作为训练集 \
  注意：测试集有接近500个用户都是负样本，在评估排序模型时将这部分用户去掉，或者不去除，看效果如何。\
3、生成负样本

In [2]:
from pyspark.sql import SparkSession  
import pyspark.sql.functions as F
from pyspark.sql.window import Window  
from pyspark.ml.feature import StringIndexer, VectorAssembler


import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Create the SparkSession used by the following cells (or reuse a running one).
spark = (
    SparkSession.builder
    .appName("MovieLens Recommendation System")
    .getOrCreate()
)

24/08/31 14:30:24 WARN Utils: Your hostname, yvjie-Lenovo-Legion-Y7000-2019-1050 resolves to a loopback address: 127.0.1.1; using 172.24.70.50 instead (on interface wlp0s20f3)
24/08/31 14:30:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/08/31 14:30:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [35]:
# Locations of the three MovieLens-1M tables.
path_to_users_data = "./ml-1m/users.dat"
path_to_movies_data = "./ml-1m/movies.dat"
path_to_ratings_data = "./ml-1m/ratings.dat"

# Column names for each table; the files themselves are read without a header.
users_column_names = ["user_id", "gender", "age", "occupation", "zip_code"]
movies_column_names = ["movie_id", "title", "genres"]
ratings_column_names = ["user_id", "movie_id", "rating", "timestamp"]


def _read_dat(path, column_names):
    """Read a header-less '::'-separated file and name its columns."""
    raw = spark.read.csv(path, sep='::', header=False, inferSchema=True)
    return raw.toDF(*column_names)


# zip_code is dropped right after loading.
user_df = _read_dat(path_to_users_data, users_column_names).drop("zip_code")
movie_df = _read_dat(path_to_movies_data, movies_column_names)
rating_df = _read_dat(path_to_ratings_data, ratings_column_names)

                                                                                

In [17]:
import matplotlib.pyplot as plt

# Bring only the rating column to the driver as a pandas DataFrame.
ratings_pandas = rating_df.select('rating').toPandas()

# Histogram of the raw rating values.
plt.hist(ratings_pandas['rating'], alpha=0.75, bins=20)
plt.grid(True)
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.title('Rating Distribution')
plt.show()

In [36]:
# Attach each movie's title and genres to its rating rows; the left join keeps
# rating rows whose movie_id has no match in movie_df.
rating_df = rating_df.join(movie_df, on='movie_id', how='left')

# Re-encode movie_id with a StringIndexer fitted on the joined ratings.
indexer = StringIndexer(inputCol="movie_id", outputCol="movie_id_encoded")
indexed_model = indexer.fit(rating_df)
rating_df_encoded = indexed_model.transform(rating_df)

# Replace the raw id with the encoded one (cast to integer) and drop the title.
rating_df = (
    rating_df_encoded
    .drop("movie_id", "title")
    .withColumnRenamed("movie_id_encoded", "movie_id")
    .withColumn("movie_id", F.col("movie_id").cast("integer"))
)

# Movie table keyed by the re-encoded id.
movie_df = rating_df.select('movie_id', 'genres').distinct()

# Binary label: ratings above 2 are positives (1), everything else is 0.
# NOTE(review): the notes at the top of this file say "rating > 3" is the
# positive class, but the threshold used here is 2 - confirm which is intended.
rating_df = (
    rating_df
    .withColumn("label", F.when(F.col("rating") > 2, 1).otherwise(0))
    .select('user_id', 'movie_id', 'label', 'rating', 'timestamp')
)

In [37]:
# Per-user window, newest rating first. movie_id is a secondary sort key so that
# ratings sharing a timestamp always receive the same row number: test / val /
# train are three separate filters over this lazily evaluated frame, and an
# arbitrary tie order could differ between evaluations, putting one rating into
# two splits or into none.
# NOTE(review): assumes (user_id, movie_id) identifies a rating uniquely - confirm.
window_spec = Window.partitionBy("user_id").orderBy(F.desc("timestamp"), F.col("movie_id"))
# Rank each user's ratings; row_num == 1 is the most recent one.
data_spark_df = rating_df.withColumn("row_num", F.row_number().over(window_spec))
# Most recent 5 ratings per user -> test, the next 5 -> validation, the rest -> train.
test_df = data_spark_df.filter(F.col("row_num") <= 5).drop(*['timestamp', 'row_num'])
val_df = data_spark_df.filter((F.col("row_num") > 5) & (F.col("row_num") <= 10)).drop(*['timestamp', 'row_num'])
train_df = data_spark_df.filter(F.col("row_num") > 10).drop(*['timestamp', 'row_num'])

In [10]:
# Despite its name, this frame holds, per user, the number of validation rows
# whose label is 0.
users_with_label_1 = val_df.where(F.col("label") == 0).groupBy("user_id").count()
# Users with exactly 5 negative validation rows; the split above assigns at most
# 5 validation rows per user, so these users have no positive validation row.
# NOTE(review): the notes at the top of the file talk about the test set, while
# this is computed on val_df - confirm which split is meant.
users_without_label_1 = (
    users_with_label_1
    .where(F.col("count") == 5)
    .select("user_id")
    .collect()
)
users_without_label_1_list = [row.user_id for row in users_without_label_1]

                                                                                

In [12]:
import json

# Persist the collected user ids as a JSON array.
with open('./data/users_without_label_1_list.json', 'w') as file:
    file.write(json.dumps(users_without_label_1_list))

# # 从JSON文件读取列表
# with open('./data/users_without_label_1_list.json', 'r') as file:
#     my_list = json.load(file)



In [38]:
# from pyspark.ml.linalg import Vectors, VectorUDT


# # 创建一个用户评分向量的UDF
# def create_rating_vector(df):
#     # 初始化一个长度为5的向量，表示评分1到5
#     rating_vector = [0] * 5
#     # 遍历评分，对相应的位置进行计数
#     for rating in df:
#         rating_vector[rating - 1] += 1
#     # 返回向量
#     return Vectors.dense(rating_vector)

# Plain Python function (wrapped as a Spark UDF further down) that buckets an
# average rating into one of eight half-point segments.
def rating_segment(avg_rating):
    """Map an average rating in [1, 5] to a segment id from 1 to 8.

    Segments are half-open, half-point wide intervals: [1, 1.5) -> 1,
    [1.5, 2) -> 2, ..., [4, 4.5) -> 7; the last one is closed, [4.5, 5] -> 8.

    Args:
        avg_rating: the average rating, or None.

    Returns:
        The segment id, or None when the input is None, NaN or outside [1, 5].
    """
    # None would raise TypeError in the comparisons below; NaN fails the range
    # check because every comparison with NaN is False.
    if avg_rating is None or not 1 <= avg_rating <= 5:
        return None
    # Exclusive upper bounds of segments 1..7, in ascending order.
    upper_bounds = (1.5, 2, 2.5, 3, 3.5, 4, 4.5)
    for segment, upper_bound in enumerate(upper_bounds, start=1):
        if avg_rating < upper_bound:
            return segment
    return 8

# 注册UDF
# create_rating_vector_udf = F.udf(create_rating_vector, VectorUDT())
# Wrap rating_segment as a Spark UDF.
# NOTE(review): no return type is passed to F.udf; PySpark's documented default
# is a string type, so the segment ids are presumably stored as strings - confirm.
segmented_rating_udf = F.udf(rating_segment)

# NOTE(review): the averages below are computed over rating_df, i.e. over all
# ratings including the rows that were split off into val/test - confirm that
# this is intended for features used during evaluation.

# Per-user mean rating, then replaced in place by its segment id.
window_user = Window.partitionBy("user_id")
user_avg_rating_df = (
    rating_df
    .withColumn('user_avg_rating', F.avg('rating').over(window_user))
    .withColumn('user_avg_rating', segmented_rating_udf(F.col('user_avg_rating')))
)

# Same computation per movie.
window_movie = Window.partitionBy("movie_id")
movie_avg_rating_df = (
    rating_df
    .withColumn('movie_avg_rating', F.avg('rating').over(window_movie))
    .withColumn('movie_avg_rating', segmented_rating_udf(F.col('movie_avg_rating')))
)

# # 聚合每个用户的评分
# user_rating_vector_df = train_df.groupBy("user_id").agg(
#     create_rating_vector_udf(F.collect_list("rating")).alias("user_rating_vector"),
#     F.count("rating").alias("ratings_count")
# )

# # 计算最终的用户评分向量
# def normalize_vector(vector, count):
#     return Vectors.dense([x / count for x in vector])

# normalize_vector_udf = F.udf(normalize_vector, VectorUDT())

# user_rating_vector_df = user_rating_vector_df.withColumn(
#     "normalized_user_rating_vector",
#     normalize_vector_udf(F.col("user_rating_vector"), F.col("ratings_count"))
# )

# # 同理，计算电影评分向量
# movie_rating_vector_df = train_df.groupBy("movie_id").agg(
#     create_rating_vector_udf(F.collect_list("rating")).alias("movie_rating_vector"),
#     F.count("rating").alias("ratings_count")
# )

# movie_rating_vector_df = movie_rating_vector_df.withColumn(
#     "normalized_movie_rating_vector",
#     normalize_vector_udf(F.col("movie_rating_vector"), F.col("ratings_count"))
# )

# Keep only the key and its rating segment, de-duplicated.
user_avg_rating_df = user_avg_rating_df.select('user_id', 'user_avg_rating').distinct()
movie_avg_rating_df = movie_avg_rating_df.select('movie_id', 'movie_avg_rating').distinct()


In [39]:
# user_df = user_df.join(user_rating_vector_df, 'user_id', 'left')
# movie_df = movie_df.join(movie_rating_vector_df, 'movie_id', 'left')
# Add the rating-segment feature to the user and movie tables; the left joins
# keep every existing user / movie row.
user_df = user_df.join(user_avg_rating_df, on='user_id', how='left')
movie_df = movie_df.join(movie_avg_rating_df, on='movie_id', how='left')

In [15]:
# # 定义UDF来创建一个长度为5的零向量
# def create_zero_vector():
#     return Vectors.dense([0, 0, 0, 0, 0])

# # 注册UDF
# zero_vector_udf = F.udf(create_zero_vector, VectorUDT())

# # 使用UDF来填充normalized_movie_rating_vector列中为null的行
# movie_df = movie_df.withColumn(
#     "normalized_movie_rating_vector",
#     F.when(F.col("normalized_movie_rating_vector").isNull(), zero_vector_udf()).otherwise(
#         F.col("normalized_movie_rating_vector")
#     )
# )

In [41]:
# Persist the three splits as parquet, replacing any previous output.
train_df.write.parquet('./data/train_df.parquet', mode='overwrite')
val_df.write.parquet('./data/val_df.parquet', mode='overwrite')
test_df.write.parquet('./data/test_df.parquet', mode='overwrite')



24/08/31 13:05:12 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers




24/08/31 13:05:13 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

In [42]:
def OneHotEncoder(df, col_name):
    """Replace `col_name` with a one-hot vector column named `<col_name>Vector`.

    The distinct values of `col_name` are collected to the driver and sorted;
    each value gets a temporary 0/1 column named `str(value)`, the temporary
    columns are assembled in that sorted order, and then both the temporary
    columns and `col_name` itself are dropped.

    NOTE(review): sorting presumably fails if the column contains nulls mixed
    with other values - confirm the inputs are null-free.
    """
    categories = sorted(row[0] for row in df.select(col_name).distinct().collect())
    flag_names = [str(category) for category in categories]

    # One temporary indicator column per category.
    for category, flag_name in zip(categories, flag_names):
        indicator = F.when(F.col(col_name) == category, 1).otherwise(0)
        df = df.withColumn(flag_name, indicator)

    # Merge the indicator columns into a single vector column.
    assembler = VectorAssembler(inputCols=flag_names,
                                outputCol=col_name + 'Vector')
    encoded = assembler.transform(df)

    # Remove the temporary indicators and the source column.
    return encoded.drop(*flag_names).drop(col_name)

def MultiHotEncoder(df, col_name):
    """Replace a '|'-separated column with a multi-hot vector `<col_name>Vector`.

    The column is split on '|', the distinct tokens are collected to the driver
    and sorted, and each token gets a temporary 0/1 column that is 1 when the
    token is one of the row's '|'-separated values. The temporary columns are
    assembled in sorted token order and then dropped together with `col_name`.

    Sorting makes the vector layout reproducible across runs (the order of
    `distinct().collect()` is not guaranteed) and matches OneHotEncoder.
    Membership is tested on the split array rather than with a substring test,
    so a token that is a substring of another token cannot match by accident.

    Args:
        df: Spark DataFrame containing `col_name`.
        col_name: name of the string column holding '|'-separated tokens.

    Returns:
        The DataFrame with `col_name` replaced by `<col_name>Vector`.
    """
    tokens_col = F.split(F.col(col_name), "\\|")

    # Distinct tokens over the whole column; non-string and empty values are skipped.
    token_rows = df.select(F.explode(tokens_col).alias(col_name)).distinct().collect()
    tokens = sorted(
        row[0] for row in token_rows if isinstance(row[0], str) and row[0]
    )

    # One temporary indicator column per token; a null source value yields 0.
    for token in tokens:
        df = df.withColumn(
            token, F.when(F.array_contains(tokens_col, token), 1).otherwise(0))

    # Merge the indicator columns into a single vector column.
    assembler = VectorAssembler(inputCols=tokens,
                                outputCol=col_name + 'Vector')
    df = assembler.transform(df)

    # Remove the temporary indicators and the source column.
    return df.drop(*tokens).drop(col_name)
    
def user_data_processing(df):
    """Encode the user table's categorical columns.

    gender becomes 0 for "F" and 1 for anything else; age, occupation and
    user_avg_rating are each replaced by a one-hot vector via OneHotEncoder.
    """
    is_female = F.col("gender") == "F"
    df = df.withColumn("gender", F.when(is_female, 0).otherwise(1))

    for column in ('age', 'occupation', 'user_avg_rating'):
        df = OneHotEncoder(df, column)
    return df

def movie_data_processing(df):
    """Encode the movie table's categorical columns.

    genres is multi-hot encoded via MultiHotEncoder, then movie_avg_rating is
    one-hot encoded via OneHotEncoder.
    """
    genres_encoded = MultiHotEncoder(df, 'genres')
    return OneHotEncoder(genres_encoded, 'movie_avg_rating')

    

# Encode the user and movie tables in place; each call replaces the categorical
# columns of its table with vector columns.
user_df = user_data_processing(user_df)
movie_df = movie_data_processing(movie_df)


                                                                                

In [45]:
# Persist the encoded user and movie tables as parquet, replacing any previous output.
user_df.write.parquet('./data/user_df.parquet', mode='overwrite')
movie_df.write.parquet('./data/movie_df.parquet', mode='overwrite')

                                                                                

In [3]:
# Reload the encoded user and movie tables from parquet.
user_df = spark.read.parquet('./data/user_df.parquet')
movie_df = spark.read.parquet('./data/movie_df.parquet')

                                                                                

In [4]:
import pickle

# Helper that flattens a vector-like object into a plain Python list.
def vector_to_list(vector):
    """Return the elements of `vector` as a Python list.

    `vector` must expose `toArray()`, whose result in turn exposes `tolist()`.
    """
    as_array = vector.toArray()
    return as_array.tolist()

def _user_feature_row(row):
    """Map a user row to (user_id, [user_id, gender, *flattened vector columns])."""
    features = [row['user_id'], row['gender']]
    for vector_column in ('ageVector', 'occupationVector', 'user_avg_ratingVector'):
        features += vector_to_list(row[vector_column])
    return row['user_id'], features


def _movie_feature_row(row):
    """Map a movie row to (movie_id, [movie_id, *flattened vector columns])."""
    features = [row['movie_id']]
    for vector_column in ('genresVector', 'movie_avg_ratingVector'):
        features += vector_to_list(row[vector_column])
    return row['movie_id'], features


# Turn each table into an RDD of (id, flat feature list) pairs.
user_rdd = user_df.rdd.map(_user_feature_row)
movie_rdd = movie_df.rdd.map(_movie_feature_row)

# Collect the pairs to the driver as id -> feature-list dictionaries.
user_dict = user_rdd.collectAsMap()
movie_dict = movie_rdd.collectAsMap()

# Pickle both dictionaries to disk.
with open('./data/user_FeatureVector_dict.pickle', 'wb') as f:
    pickle.dump(user_dict, f)

with open('./data/movie_FeatureVector_dict.pickle', 'wb') as f:
    pickle.dump(movie_dict, f)

                                                                                

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window


# Obtain a SparkSession for the negative-sampling cells (an already running
# session is reused by getOrCreate).
spark = (
    SparkSession.builder
    .appName("NegativeSampling")
    .getOrCreate()
)


24/08/31 02:29:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [47]:
# Reload the training and validation splits saved earlier as parquet.
train_df = spark.read.parquet('./data/train_df.parquet')
val_df = spark.read.parquet('./data/val_df.parquet')
# All training and validation rows stacked into one frame.
df = train_df.union(val_df)

In [54]:
def generate_negative_samples(train_df, sampling_ratio=2):
    """Append popularity-weighted sampled negatives to `train_df`.

    For every user the target is `sampling_ratio * positives - negatives`
    extra negative rows (nothing is added when that number is <= 0). The
    candidates are all (user, movie) pairs over the users and movies of
    `train_df` that the user has NOT rated in `train_df` - neither positively
    nor negatively - so a sampled row never duplicates an existing one.

    Candidates are drawn without replacement with probability proportional to
    `count(movie) ** 0.75`. Each candidate gets the key
    `-log(1 - u) / weight` with `u` uniform in [0, 1), and the smallest keys per
    user are kept. Ordering by `rand()` first and the weight second would leave
    the weight as a tie-breaker only, i.e. uniform sampling.

    Args:
        train_df: DataFrame with columns user_id, movie_id, label, rating, in
            that order (the union below is positional).
        sampling_ratio: target number of negatives per positive for each user.

    Returns:
        `train_df` plus the sampled rows, which carry label 0 and rating 1.
    """
    rated_pairs = train_df.select("user_id", "movie_id")

    # Every user x movie combination seen in train_df.
    all_user_df = rated_pairs.select("user_id").distinct()
    all_movie_df = rated_pairs.select("movie_id").distinct()
    all_user_movie_df = all_user_df.crossJoin(all_movie_df)

    # Sampling weight of a movie: (number of rating rows) ** 0.75.
    movie_click_freq = train_df.groupBy("movie_id").count().select(
        "movie_id", F.pow(F.col("count"), 0.75).alias("sampling_weight"))

    # Number of negatives still to be generated for each user.
    positive_flag = F.when(F.col("label") == 1, 1).otherwise(0)
    negative_flag = F.when(F.col("label") == 0, 1).otherwise(0)
    user_counts = (
        train_df.groupBy("user_id")
        .agg(F.sum(positive_flag).alias("positive_count"),
             F.sum(negative_flag).alias("negative_count"))
        .select(
            "user_id",
            (sampling_ratio * F.col("positive_count")
             - F.col("negative_count")).alias("user_negative_count"))
    )

    # Unrated pairs, annotated with the user's quota and the movie's weight.
    candidates = (
        all_user_movie_df
        .join(rated_pairs, on=["user_id", "movie_id"], how="left_anti")
        .join(user_counts, "user_id", "left")
        .join(movie_click_freq, "movie_id", "left")
    )

    # Weighted sampling without replacement: smallest exponential key first.
    # 1 - u lies in (0, 1], so the logarithm is always defined.
    sampling_key = -F.log(1.0 - F.rand(42)) / F.col("sampling_weight")
    window_spec = Window.partitionBy("user_id").orderBy(sampling_key.asc_nulls_last())

    # Keep each user's first `user_negative_count` candidates and label them.
    sampled_negatives = (
        candidates
        .withColumn("row_num", F.row_number().over(window_spec))
        .filter(F.col("row_num") <= F.col("user_negative_count"))
        .select("user_id", "movie_id",
                F.lit(0).alias("label"), F.lit(1).alias("rating"))
    )

    return train_df.union(sampled_negatives)

# Extend the training split with sampled negatives, using the default sampling_ratio.
train_df = generate_negative_samples(train_df)

In [None]:
def generate_negative_samples(train_df, all_user_movie_df, user_positive_count, user_positive_movie, sampling_ratio=2):
    """Append popularity-weighted sampled negatives to `train_df`.

    Variant that takes the candidate universe and the positives from the
    caller, so they can cover more data than `train_df` itself.

    For every user the target is `sampling_ratio * positive_count - negatives`
    extra rows, where the negatives are counted in `train_df` (nothing is added
    when the target is <= 0). A candidate pair is dropped when it appears in
    `user_positive_movie` or is already present in `train_df`, so a sampled row
    never duplicates an existing one.

    Candidates are drawn without replacement with probability proportional to
    `count(movie in train_df) ** 0.75`, using the key `-log(1 - u) / weight`
    with `u` uniform in [0, 1) and keeping the smallest keys per user. Ordering
    by `rand()` first and the weight second would leave the weight as a
    tie-breaker only, i.e. uniform sampling. Movies that never occur in
    `train_df` have no weight; they sort last and are used only when a user
    runs out of weighted candidates.

    Args:
        train_df: DataFrame with columns user_id, movie_id, label, rating, in
            that order (the union below is positional).
        all_user_movie_df: candidate pairs with columns user_id, movie_id.
        user_positive_count: columns user_id, positive_count.
        user_positive_movie: columns user_id, movie_id; pairs that must never
            be sampled as negatives.
        sampling_ratio: target number of negatives per positive for each user.

    Returns:
        `train_df` plus the sampled rows, which carry label 0 and rating 1.
    """
    # Sampling weight of a movie: (number of rating rows in train_df) ** 0.75.
    movie_click_freq = train_df.groupBy("movie_id").count().select(
        "movie_id", F.pow(F.col("count"), 0.75).alias("sampling_weight"))

    # Existing negatives per user; a user missing on either side counts as 0.
    user_negative_count = (
        train_df.where(F.col('label') == 0)
        .groupBy("user_id").count()
        .withColumnRenamed("count", "negative_count")
    )
    user_counts = user_positive_count.join(user_negative_count, on="user_id", how="outer").fillna(0)

    # Number of negatives still to be generated for each user.
    user_counts = user_counts.select(
        "user_id",
        (sampling_ratio * F.col("positive_count")
         - F.col("negative_count")).alias("user_negative_count"))

    # Candidate pairs: not a known positive and not already a row of train_df,
    # annotated with the user's quota and the movie's weight.
    pair_keys = ["user_id", "movie_id"]
    candidates = (
        all_user_movie_df
        .join(user_positive_movie, on=pair_keys, how="left_anti")
        .join(train_df.select(*pair_keys), on=pair_keys, how="left_anti")
        .join(user_counts, 'user_id', 'left')
        .join(movie_click_freq, 'movie_id', 'left')
    )

    # Weighted sampling without replacement: smallest exponential key first.
    # 1 - u lies in (0, 1], so the logarithm is always defined. A missing
    # weight gives a null key, which is ordered last; the second sort key
    # keeps the order among those candidates random.
    sampling_key = -F.log(1.0 - F.rand(42)) / F.col("sampling_weight")
    window_spec = Window.partitionBy("user_id").orderBy(
        sampling_key.asc_nulls_last(), F.rand(42))

    # Keep each user's first `user_negative_count` candidates and label them.
    sampled_negatives = (
        candidates
        .withColumn("row_num", F.row_number().over(window_spec))
        .filter(F.col("row_num") <= F.col("user_negative_count"))
        .select('user_id', 'movie_id',
                F.lit(0).alias("label"), F.lit(1).alias('rating'))
    )

    return train_df.union(sampled_negatives)


# Candidate universe: every user paired with every movie seen in train + val.
all_user_df = df.select('user_id').distinct()
all_movie_df = df.select('movie_id').distinct()
all_user_movie_df = all_user_df.crossJoin(all_movie_df)

# Positives from train + val are excluded from the candidates, while the
# per-user positive count comes from the validation split only.
user_positive_movie = df.filter(F.col('label') == 1).select("user_id", "movie_id")
user_positive_count = (
    val_df.filter(F.col('label') == 1)
    .groupBy("user_id")
    .count()
    .withColumnRenamed("count", "positive_count")
)

# Extend the validation split with sampled negatives.
val_df = generate_negative_samples(val_df, all_user_movie_df, user_positive_count, user_positive_movie)

In [50]:
# Persist the negative-sampled training and validation sets, replacing any previous output.
train_df.write.parquet('./data/train_set.parquet', mode='overwrite')
val_df.write.parquet('./data/val_set.parquet', mode='overwrite')

In [5]:
# Shut down the Spark session now that all outputs have been written.
spark.stop()