In [None]:
import pyspark
from pyspark.sql import SparkSession


# Report the installed PySpark version so the runtime environment is visible
# in the cell output.
print(pyspark.__version__)

# Configure the builder first, then obtain the session from it.
session_builder = SparkSession.builder.appName("windowFunctionExample")
spark = session_builder.getOrCreate()

In [None]:
# Column names, listed in the same order as the fields of each row tuple.
columns = ["name", "department", "salary"]

# One (name, department, salary) tuple per row; a name may appear in more
# than one department.
data = [
    ("Alice", "Sales", 2000),
    ("Bob", "Sales", 1500),
    ("Alice", "Engineering", 2500),
    ("David", "Engineering", 3000),
    ("Bob", "Engineering", 2000),
]

df = spark.createDataFrame(data, schema=columns)


In [None]:
from pyspark.sql import functions as F

# Number of rows per employee name.
# GroupedData.count() already names its output column "count", so neither an
# explicit F.count('*') aggregate nor a rename of 'count' to 'count' is needed;
# the earlier duplicate assignment to df1 was dead code.
df1 = df.groupBy("name").count()
df1.show()

In [None]:
import torch
import torch.nn as nn

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"当前使用的设备是: {device}")

# When a GPU is present, report the name of device 0 and the device count.
if cuda_available:
    print(f"GPU 名称: {torch.cuda.get_device_name(0)}")
    print(f"GPU 数量: {torch.cuda.device_count()}")

# A minimal neural network: one fully connected layer.
class SimpleModel(nn.Module):
    """Single linear layer mapping 10 input features to 1 output value."""

    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(in_features=10, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.linear(x)
        return prediction

# Instantiate the model, then move its parameters to the selected device.
model = SimpleModel()
model = model.to(device)

# Run a single forward pass on random data of shape (batch_size, input_size).
batch_size, input_size = 5, 10
inputs = torch.randn(batch_size, input_size)
inputs = inputs.to(device)
outputs = model(inputs)

print(f"模型的输出是: {outputs}")


In [None]:
import torch

# Build a (1, 1, 1) id tensor and a (1, 1, 4) float feature tensor, then
# concatenate them along the last axis into a single (1, 1, 5) tensor.
a = 4
b = [0, 1, 0, 0]
a = torch.tensor(a, dtype=torch.long).unsqueeze(0).unsqueeze(1).unsqueeze(0)  # (1, 1, 1)
a = a.squeeze(1)  # (1, 1)
print(a.shape)
# unsqueeze() returns a new tensor and leaves `a` untouched, so the result has
# to be assigned back. Without the assignment `a` stays 2-D and the
# concatenation below fails because `b` is 3-D.
a = a.unsqueeze(-1)  # (1, 1, 1)
print(a.size())
b = torch.tensor(b, dtype=torch.float).unsqueeze(0).unsqueeze(0)  # (1, 1, 4)
print(b.size())
# `a` is long and `b` is float; the concatenation relies on torch.cat's
# dtype promotion.
res = torch.cat([a, b], dim=2)  # (1, 1, 5)
print(res)
print(res[:, :, 0:2])

import torch.nn as nn
import torch.nn.functional as F


class FeatureEmbedder(nn.Module):
    """Thin wrapper around ``nn.Embedding`` for a single categorical feature."""

    def __init__(self, num_embeddings, embedding_dim):
        """Create an embedding table of ``num_embeddings`` rows, each of size ``embedding_dim``."""
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim,
        )

    def forward(self, x):
        """Return the embedding rows selected by the integer indices in ``x``."""
        embedded = self.embedding(x)
        return embedded
    
# Embedding table with 6 rows (valid indices 0-5), each a 4-dimensional vector.
a_embedder = FeatureEmbedder(num_embeddings=6, embedding_dim=4)

# Reuse the concatenated tensor: slot 0 on the last axis is used as the
# embedding index, the remaining slots are carried through unchanged.
a_values = res
other_features = a_values[:, :, 1:]
print(other_features.shape)

# Look up the embedding for slot 0 (cast to long before the lookup).
category_ids = a_values[:, :, 0].long()
a_embeddings = a_embedder(category_ids)

# Show the embedding and its shape.
print("a 的嵌入向量表示:")
print(a_embeddings)
print(a_embeddings.shape)

# Put the embedding in front of the remaining features along the last axis.
z = torch.cat((a_embeddings, other_features), dim=2)
print(z)




In [None]:
import torch  
import torch.nn as nn
  
# A (3, 1) column of values to sort, and a (2, 3) matrix to iterate over.
x = torch.tensor([[3.0], [1.0], [2.0]])
y = torch.tensor([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0]])
print(y.shape)

# Print the first element of every row of y as a Python float.
for first_value in y[:, 0].tolist():
    print(first_value)

# Indices that would sort x in ascending order along dim 0.
ascending_indices = torch.argsort(x, dim=0, descending=False)
print(ascending_indices)

# Indices that would sort x in descending order along dim 0.
descending_indices = torch.argsort(x, dim=0, descending=True)
print(descending_indices)

In [None]:
import torch  
  
# Four row tensors, each of shape (1, 3).
t1 = torch.tensor([[1, 2, 3]])
t2 = torch.tensor([[7, 8, 9]])
t3 = torch.tensor([[13, 14, 15]])
t4 = torch.tensor([[17, 18, 19]])

# Join them pairwise along dim 1, giving two tensors of shape (1, 6).
t12 = torch.cat((t1, t2), dim=1)
t34 = torch.cat((t3, t4), dim=1)
print(t12.shape)

# Collect the two joined tensors and stack them along a new leading dim,
# which yields shape (2, 1, 6).
t = [t12, t34]
print(t)
stacked_tensors = torch.stack(t)

print(stacked_tensors.shape)


In [None]:
import torch 
import torch.nn.functional as F
import torch.nn as nn

batch_size = 8

# Two fixed integer sequences, gathered into a nested list and converted to a
# single 2-D tensor for display.
x = [1, 2, 3]
y = [2, 3, 4]
ans = [x, y]

print(torch.tensor(ans))

    



In [None]:
import torch
# Two (1, 2) row tensors, concatenated along dim 0 into a (2, 2) matrix.
a = torch.tensor([1, 2])[None, :]
b = torch.tensor([3, 4])[None, :]
c = torch.cat((a, b), dim=0)

c

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType

# Start the Spark session, or reuse the one that is already active.
spark = SparkSession.builder.appName("Rating Vectors Example").getOrCreate()

# Schema: three nullable integer columns.
rating_fields = ("userId", "movieId", "rating")
schema = StructType([
    StructField(field_name, IntegerType(), True) for field_name in rating_fields
])

# Sample ratings, written as (userId, movieId, rating) tuples and expanded
# into dicts keyed by column name.
rating_rows = [
    (123, 45, 4),
    (123, 56, 5),
    (789, 45, 3),
    (123, 45, 3),
    (789, 56, 4),
]
data = [dict(zip(rating_fields, row)) for row in rating_rows]

# Turn the Python records into a DataFrame with the explicit schema.
rating_df = spark.createDataFrame(data, schema)

# Display the DataFrame contents.
rating_df.show()


In [None]:

import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, IntegerType
from pyspark.ml.linalg import Vectors, VectorUDT



# Histogram builder for a list of ratings; wrapped as a Spark UDF below.
def create_rating_vector(ratings):
    """Count how many times each rating from 1 to 5 occurs.

    Args:
        ratings: Iterable of integer ratings, each expected to lie in 1..5.
            ``None`` is treated like an empty collection.

    Returns:
        A dense vector of length 5 whose element ``i`` is the number of
        ratings equal to ``i + 1``.

    Raises:
        ValueError: If a rating lies outside 1..5.
    """
    num_levels = 5
    rating_vector = [0] * num_levels
    if ratings is None:
        return Vectors.dense(rating_vector)
    for rating in ratings:
        # Without this check a rating of 0 (or a small negative value) turns
        # into a negative list index and is silently counted in the wrong
        # bucket, while a rating above 5 fails with a bare IndexError.
        if not 1 <= rating <= num_levels:
            raise ValueError(
                f"rating must be between 1 and {num_levels}, got {rating!r}"
            )
        rating_vector[rating - 1] += 1
    return Vectors.dense(rating_vector)

# Wrap the histogram builder as a Spark UDF that returns an ML vector.
create_rating_vector_udf = F.udf(create_rating_vector, returnType=VectorUDT())

# Per user: the histogram of the collected ratings, plus the number of
# ratings that were counted.
user_ratings = rating_df.groupBy("userId")
user_rating_vector_df = user_ratings.agg(
    create_rating_vector_udf(F.collect_list("rating")).alias("user_rating_vector"),
    F.count("rating").alias("ratings_count"),
)

# Turn a vector of counts into a vector of fractions.
def normalize_vector(vector, count):
    """Divide every element of ``vector`` by ``count``.

    Args:
        vector: Sized iterable of numbers (in this notebook, a rating
            histogram).
        count: Divisor (in this notebook, the number of ratings in the group).

    Returns:
        A dense vector of the element-wise quotients. When ``count`` is 0 or
        ``None`` there is nothing to normalise by, and a zero vector of the
        same length is returned instead of raising ZeroDivisionError.
    """
    if not count:
        # F.count("rating") skips nulls and the rating column is nullable, so
        # a group whose ratings are all null arrives here with count == 0.
        return Vectors.dense([0.0] * len(vector))
    return Vectors.dense([x / count for x in vector])

# Wrap the normaliser as a Spark UDF that returns an ML vector.
normalize_vector_udf = F.udf(normalize_vector, returnType=VectorUDT())

# Add the normalised histogram next to each user's raw histogram.
user_rating_vector_df = user_rating_vector_df.withColumn(
    "normalized_user_rating_vector",
    normalize_vector_udf(F.col("user_rating_vector"), F.col("ratings_count")),
)

# Same two steps for movies, written as one chain: aggregate the ratings per
# movie, then append the normalised histogram.
movie_rating_vector_df = (
    rating_df.groupBy("movieId")
    .agg(
        create_rating_vector_udf(F.collect_list("rating")).alias("movie_rating_vector"),
        F.count("rating").alias("ratings_count"),
    )
    .withColumn(
        "normalized_movie_rating_vector",
        normalize_vector_udf(F.col("movie_rating_vector"), F.col("ratings_count")),
    )
)

# Show both results without truncating the vector columns.
user_rating_vector_df.show(truncate=False)
movie_rating_vector_df.show(truncate=False)


In [None]:
import torch
import numpy as np

# 1-D tensor built from the three Python ints 1, 2, 3.
a = torch.tensor([1,2,3])


In [None]:
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F
    from pyspark.sql.window import Window

    # Start the Spark session, or reuse the one that is already active.
    spark = SparkSession.builder.appName("RandomAndWeightSort").getOrCreate()

    # Toy input: one row per (user, movie) pair together with the sampling
    # weight assigned to that pair.
    movie_click_freq = spark.createDataFrame([
        (1, 1, 0.5),
        (1, 2, 0.6),
        (2, 1, 0.7),
        (1, 3, 0.8),
        (1, 4, 0.3),
        (1, 5, 0.6),
        (1, 6, 0.7),
        (2, 2, 0.9),
        (2, 3, 0.8),
        (3, 3, 0.8),
        (4, 3, 0.4)
    ], ["user", "movie", "sampling_weight"])

    # Scale applied to rand() before it is truncated to an integer sort key.
    # NOTE(review): rand() yields values in [0, 1), so with a scale of 0.1 the
    # truncated key is always 0 and rows end up ordered by sampling_weight
    # alone. A scale above 1 (e.g. 10 for ten random buckets) is needed for the
    # random component to have any effect - confirm the intended value.
    random_key_scale = 0.1

    # Window specification: within each user, order rows by the random integer
    # key and then by descending sampling weight. PySpark's Column has no
    # .multiply() method (the call fails at runtime), so the scaling is done
    # with the * operator.
    random_key = (F.rand() * random_key_scale).cast("int")
    window_spec = Window.partitionBy("user").orderBy(random_key, F.col("sampling_weight").desc())

    # Number each user's rows according to the window ordering.
    user_negative_movies = movie_click_freq.withColumn("row_num", F.row_number().over(window_spec))

    # Show the result; a real pipeline could instead keep only rows whose
    # row_num is below some threshold and use them as negative samples.
    user_negative_movies.show()


In [None]:
# Stop the Spark session.
spark.stop()