In [None]:
from pyspark.sql import SparkSession
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

# Create (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("PySpark Dataloader")
    .getOrCreate()
)

# Load the CSV and keep only the model inputs (feature1, feature2, ..., featureN)
# plus the target variable.
# NOTE(review): test_pyspark.csv is written by a later cell of this notebook
# (df.to_csv), so on a fresh kernel that cell has to be run before this one.
df_spark = (
    spark.read
    .csv("test_pyspark.csv", header=True, inferSchema=True)
    .select("feature1", "feature2", "featureN", "target")
)

# Show the first 5 rows of the selected columns.
df_spark.show(5)

In [None]:
# Read every CSV matching the glob into one DataFrame; the first line of each
# file is the header and column types are inferred from the data.
sales_df = spark.read.csv(
    "daily_sales_*.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Preview up to 10 rows of the raw sales data.
sales_df.head(n=10)

In [None]:
from pyspark.sql import functions as F
# Select the rows whose customer_id is NULL. This only builds the filtered
# DataFrame; no action such as show()/head() is called on it here.
sales_df.where(F.col("customer_id").isNull())

In [None]:
# Keep one row per order_id, then replace missing customer_id / amount values
# with 0 and 0.0 respectively.
cleaned_df = (
    sales_df
    .drop_duplicates(["order_id"])
    .na.fill({"customer_id": 0, "amount": 0.0})
)

In [19]:
from pyspark.sql.functions import when, sum, col, count

# Bucket each order by amount: > 1000 is "high", > 500 is "medium", anything else "low".
sales_category = (
    when(col("amount") > 1000, "high")
    .when(col("amount") > 500, "medium")
    .otherwise("low")
)
categorized_df = cleaned_df.withColumn("sales_category", sales_category)

# Aggregate per (region, sales_category): total amount and number of orders.
# `sum` and `count` are the pyspark.sql.functions versions imported at the top of this cell.
transformed_df = categorized_df.groupBy("region", "sales_category").agg(
    sum("amount").alias("total_sales"),
    count("*").alias("order_count"),
)

In [21]:
# 5. Write the aggregated result as a parquet table partitioned by region.
# The default save mode raises an error when the table already exists, which
# made this cell fail on every re-run of the notebook; mode("overwrite")
# replaces the table instead so the cell is repeatable.
# NOTE(review): the SparkSession in this notebook is built without
# enableHiveSupport(), so whether this ends up in a Hive metastore depends on
# the cluster configuration -- confirm.
transformed_df.write \
    .mode("overwrite") \
    .partitionBy("region") \
    .format("parquet") \
    .saveAsTable("regional_sales")

                                                                                

In [22]:
# 6. Record table metadata (optional): compute statistics for the table just written.
spark.sql(
    "ANALYZE TABLE regional_sales COMPUTE STATISTICS"
)

DataFrame[]

In [31]:
# List the column names of the aggregated frame.
transformed_df.columns

['region', 'sales_category', 'total_sales', 'order_count']

In [41]:
# Load the salary data; header row supplies column names, types are inferred,
# and the double quote is used as the escape character.
df = spark.read.csv(
    "salary.csv",
    header=True,
    inferSchema=True,
    escape='"',
)

In [42]:
# Preview up to 10 rows of the salary data.
df.head(n=10)

[Row(id=1, name='c1', age=20, department='finance', salary=10),
 Row(id=2, name='c2', age=12, department='it', salary=15),
 Row(id=3, name='c3', age=32, department='finance', salary=12),
 Row(id=4, name='c4', age=18, department='it', salary=13)]

In [43]:
# Example 1: column operations + filtering
from pyspark.sql.functions import col, upper 

# Project an upper-cased name, age cast to integer, and salary scaled by 1.1,
# then keep only the rows with age strictly greater than 18.
transformed_df = (
    df.select(
        upper(col("name")).alias("name_upper"),
        col("age").cast("integer"),
        (col("salary") * 1.1).alias("adjusted_salary"),
    )
    .where(col("age") > 18)
)

In [44]:
# Preview up to 10 rows of the projected and filtered salary data.
transformed_df.head(n=10)

[Row(name_upper='C1', age=20, adjusted_salary=11.0),
 Row(name_upper='C3', age=32, adjusted_salary=13.200000000000001)]

In [45]:
# Example 2: aggregation
# Per department: average salary and number of ids. The dict form names the
# result columns "avg(salary)" and "count(id)"; only the former is renamed.
agg_df = (
    df.groupBy("department")
    .agg({"salary": "avg", "id": "count"})
    .withColumnRenamed("avg(salary)", "avg_salary")
)

In [46]:
# Preview up to 10 rows of the per-department aggregates.
agg_df.head(n=10)

[Row(department='finance', count(id)=2, avg_salary=11.0),
 Row(department='it', count(id)=2, avg_salary=14.0)]

In [24]:
import numpy as np
import pandas as pd

# Build a small synthetic dataset: 100 rows of standard-normal noise in 4 columns.
# A fixed seed keeps the generated file identical across kernel restarts.
rng = np.random.default_rng(42)
data = rng.standard_normal((100, 4))
df = pd.DataFrame(data, dtype=np.float32, columns=['feature1', 'feature2', 'featureN', 'target'])
# index=False: do not write the pandas row index as an extra unnamed first column.
df.to_csv('test_pyspark.csv', index=False)

In [None]:
# Use the pandas DataFrame already in memory as the training data.
# (Alternative left by the author: convert the Spark DataFrame instead,
#  i.e. df_pandas = df_spark.toPandas())
df_pandas = df

# Split into the feature matrix and the label vector.
X = df_pandas.loc[:, ["feature1", "feature2", "featureN"]].to_numpy()
y = df_pandas.loc[:, "target"].to_numpy()

# Convert both to float32 PyTorch tensors.
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)


In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Args:
        X: Indexable, sized container of features (e.g. a tensor); row ``i``
            is the input for sample ``i``.
        y: Indexable, sized container of labels with the same length as ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of samples.
    """

    def __init__(self, X, y):
        # Fail fast: a length mismatch would otherwise surface later as an
        # IndexError mid-iteration or silently ignore the surplus labels.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

# Create the dataset object from the feature and label tensors.
dataset = CustomDataset(X=X_tensor, y=y_tensor)

In [None]:
# Define the DataLoader: batches of up to 64 samples, reshuffled on each pass.
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

# Print the shape of every batch as a sanity check.
for batch in dataloader:
    batch_X, batch_y = batch
    print(batch_X.shape, batch_y.shape)


In [None]:
import torch.nn as nn
import torch.optim as optim

# Define a simple model
class SimpleModel(nn.Module):
    """Single linear layer mapping ``input_dim`` features to one output value."""

    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.fc(x)
        return prediction

# Instantiate the model; its input width is the second dimension of X_tensor.
model = SimpleModel(input_dim=X_tensor.size(1))

# Loss function (mean squared error) and optimizer (Adam, learning rate 0.001).
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Train the model.
# Changes from the earlier version of this cell:
#   * the per-batch shape print (debug leftover) is gone, so the output is one line per epoch;
#   * the reported loss is the sample-weighted mean over the whole epoch rather than
#     the loss of whichever (shuffled, possibly smaller) batch happened to come last;
#   * an empty dataloader prints nan instead of raising NameError on `loss`.
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0  # sum over batches of (mean batch loss * batch size)
    samples_seen = 0
    for batch_X, batch_y in dataloader:
        # Forward pass. The model emits one value per sample, so the labels get
        # a trailing dimension to match the output shape instead of broadcasting.
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.unsqueeze(1))

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion is nn.MSELoss() with its default mean reduction, so weight
        # each batch by its size to get a true per-sample epoch average.
        batch_size = batch_X.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')
