# 零基础入门推荐系统
赛题链接：[天池 - 零基础入门推荐系统](https://tianchi.aliyun.com/competition/entrance/531842)。

# 初始化 & 读取数据集

In [1]:
import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, greatest, lit, abs
from configparser import ConfigParser
from pyspark.sql.functions import sum as spark_sum

# Select the JDK: JAVA_HOME is set before the SparkSession is created below.
os.environ["JAVA_HOME"] = "/lib/jvm/java-17-openjdk-amd64"

print(pyspark.__version__)

# Collect every *.jar under jars_dir into the comma-separated string passed to spark.jars.
jars_dir = "/home/jovyan/jars"
jars_list = [
    os.path.join(jars_dir, f) for f in os.listdir(jars_dir) if f.endswith(".jar")
]
jars_str = ",".join(jars_list)
print(jars_str)

# Read the MinIO endpoint and credentials from the user's config file.
config_path = os.path.expanduser("~/.minioconfig")
parser = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail fast here so a missing config file is reported by
# path instead of surfacing later as a NoSectionError from parser.get().
if not parser.read(config_path):
    raise OSError(f"Cannot read MinIO config file: {config_path}")
endpoint = parser.get("Credentials", "endpoint")
access_key_id = parser.get("Credentials", "accessKeyID")
access_key_secret = parser.get("Credentials", "accessKeySecret")

# Create a local SparkSession (local mode, using all available cores).
spark = (
    SparkSession.builder.appName("LocalPySparkExample")
    .config("spark.jars", jars_str)
    .master("local[*]")
    .config("spark.driver.memory", "10g")
    .config("spark.driver.maxResultSize", "4g")
    .config("spark.sql.shuffle.partitions", "100")
    .config("spark.default.parallelism", "100")
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:InitiatingHeapOccupancyPercent=35")
    # S3A settings
    .config("spark.hadoop.fs.s3a.endpoint", endpoint)                   # MinIO service endpoint
    .config("spark.hadoop.fs.s3a.access.key", access_key_id)            # MinIO user name (access key)
    .config("spark.hadoop.fs.s3a.secret.key", access_key_secret)        # MinIO password (secret key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")            # must be "true": MinIO uses path-style access
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

print('Spark Version: ', spark.version)

4.0.0
/home/jovyan/jars/hadoop-aliyun-3.4.1.jar,/home/jovyan/jars/hadoop-aws-3.4.1.jar,/home/jovyan/jars/bundle-2.32.24.jar,/home/jovyan/jars/aliyun-sdk-oss-3.18.2.jar,/home/jovyan/jars/paimon-oss-1.2.0.jar,/home/jovyan/jars/commons-configuration2-2.12.0.jar,/home/jovyan/jars/jdom2-2.0.6.1.jar
Spark Version:  4.0.0


In [16]:
# List the remote dataset directory with the mc client.
!/home/linuxbrew/.linuxbrew/bin/mc ls dxp/dataset/tianchi/tianchi-news-rec/
# Sync the data to the local machine (recursive copy into ~/dataset/).
!/home/linuxbrew/.linuxbrew/bin/mc cp -r dxp/dataset/tianchi/tianchi-news-rec ~/dataset/

]11;?\[6n[m[32m[2025-08-18 10:06:11 UTC][0m[33m 9.9MiB[0m [34mSTANDARD[0m[1m articles.csv[22m[m
[m[32m[2025-08-18 10:06:23 UTC][0m[33m 973MiB[0m [34mSTANDARD[0m[1m articles_emb.csv[22m[m
[m[32m[2025-08-18 10:06:12 UTC][0m[33m  20MiB[0m [34mSTANDARD[0m[1m testA_click_log.csv[22m[m
[m[32m[2025-08-18 10:06:12 UTC][0m[33m  44MiB[0m [34mSTANDARD[0m[1m train_click_log.csv[22m[m
...ck_log.csv: 1.02 GiB / 1.02 GiB ┃▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓┃ 269.72 MiB/s 3s[0;22m[0m[m[32;1m

In [1]:
# article_csv_path = 's3a://dataset/tianchi/tianchi-news-rec/articles.csv'
# article_df = spark.read.csv(article_csv_path, header=True)
# article_df.printSchema()

# article_emb_csv_path = 's3a://dataset/tianchi/tianchi-news-rec/articles_emb.csv'
# article_emb_df = spark.read.csv(article_emb_csv_path, header=True)
# article_emb_df.printSchema()

# train_click_csv_path = 's3a://dataset/tianchi/tianchi-news-rec/train_click_log.csv'
# train_click_df = spark.read.csv(train_click_csv_path, header=True)
# train_click_df.printSchema()

# testA_click_csv_path = 's3a://dataset/tianchi/tianchi-news-rec/testA_click_log.csv'
# testA_click_df = spark.read.csv(testA_click_csv_path, header=True)
# testA_click_df.printSchema()

# print(article_df.count(), article_emb_df.count())
# article_emb_df.show()
# article_df.show()

# 数据走读