In [1]:
# Open question: I am still unclear on the difference between building an RDD
# from a file and building one from a Python list.
# RDD API reference:
#https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD
import findspark
# Run this before any pyspark import: findspark.init() adds the local Spark
# installation's pyspark package to sys.path.
findspark.init()


In [2]:
from collections import namedtuple

from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
import pyspark.sql.functions as func

In [3]:
# Build (or reuse) the SparkSession. Every setting goes through the builder so
# it is in place before the underlying SparkContext is started.
spark = (
    SparkSession.builder
    .appName("dat_frame")
    # Fixed key: the original "sparksql.warehouse.dir" (missing dot) is not the
    # key Spark reads, so the warehouse location was never applied.
    .config("spark.sql.warehouse.dir", "file:///c:/temp")
    # Executor memory is read when the context starts; setting it afterwards
    # with spark.conf.set(...) does not change it (Spark 3.x rejects the call).
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)
# Note: the session attribute is `sparkContext` (lowercase s), not the
# `SparkContext` class name.
sc = spark.sparkContext

In [4]:
# Load the raw CSV as an RDD of text lines.
rdd = sc.textFile("people.csv")

# Row factory: calling Person(...) with four values builds a Row that carries
# these field names.
Person = Row("first_name", "last_name", "gender", "age")


def line_to_person(line):
    """Turn one comma-separated line into a ``Person`` row.

    The fourth field (age) is converted to ``int``; all other fields are kept
    as the strings produced by the split.
    """
    fields = line.split(",")
    converted = fields[:3] + [int(fields[3])] + fields[4:]
    return Person(*converted)


# Parse every line of the file into a Person row.
peopleRDD = rdd.map(line_to_person)

In [5]:
# Fetch the first two parsed rows to check that the parsing worked.
peopleRDD.take(2)

[Row(first_name='Ededd', last_name='Shankj', gender='F', age=43),
 Row(first_name='Noem', last_name='Lii', gender='F', age=43)]

In [6]:
# Convert the RDD of Person rows into a DataFrame with toDF() and print its schema.
df = peopleRDD.toDF()
df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)



In [9]:
# Same conversion done through the session API. This rebinds `df`; the printed
# schema matches the one obtained from peopleRDD.toDF().
df = spark.createDataFrame(peopleRDD)
df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: long (nullable = true)



In [16]:
# Variant: build the DataFrame from namedtuple records instead of Row objects.
# Distinct names are used on purpose. Rebinding the global `Person` here would
# also change what the earlier Row-based line_to_person (behind the lazy
# peopleRDD) builds the next time peopleRDD is evaluated, and redefining
# line_to_person would silently shadow the Row-based parser.
PersonTuple = namedtuple("people", ["first_name", "last_name", "gender", "age"])


def line_to_person_tuple(line):
    """Parse one comma-separated line into a ``PersonTuple``; age is converted to int."""
    cells = line.split(",")
    return PersonTuple(cells[0], cells[1], cells[2], int(cells[3]))


personRDD1 = rdd.map(line_to_person_tuple)
df_1 = spark.createDataFrame(personRDD1)
# NOTE(review): the saved output for this cell lists `age: string`, which does
# not match the int(...) conversion above; it looks like stale output from an
# earlier version of the cell. Re-run to refresh it.
df_1.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)



In [17]:
df.show()# a row count can also be passed to limit how many rows are displayed

+----------+---------+------+---+
|first_name|last_name|gender|age|
+----------+---------+------+---+
|     Ededd|   Shankj|     F| 43|
|      Noem|      Lii|     F| 43|
|      Rgfj|       Wu|     M| 23|
|      Djgd|     Sjdh|     M| 44|
+----------+---------+------+---+



In [19]:
# Project two plain columns plus a computed boolean column (age > 23).
# NOTE(review): the saved output ends with "only showing top 1 row", which does
# not match show() being called without a row limit; it looks stale. Re-run to
# refresh it.
df.select(
    df.first_name,
    df.gender,
    df.age > 23,
).show()

+----------+------+----------+
|first_name|gender|(age > 23)|
+----------+------+----------+
|     Ededd|     F|      true|
+----------+------+----------+
only showing top 1 row



In [20]:
# Add an arithmetic column (age + 5) next to the boolean comparison.
df.select(
    df.first_name,
    df.gender,
    df.age > 23,
    df["age"] + 5,
).show()

+----------+------+----------+---------+
|first_name|gender|(age > 23)|(age + 5)|
+----------+------+----------+---------+
|     Ededd|     F|      true|       48|
|      Noem|     F|      true|       48|
|      Rgfj|     M|     false|       28|
|      Djgd|     M|      true|       49|
+----------+------+----------+---------+



In [21]:
# How to filter rows: keep people older than 25, then project two columns.
(
    df
    .filter(df["age"] > 25)
    .select("first_name", "age")
    .show()
)

+----------+---+
|first_name|age|
+----------+---+
|     Ededd| 43|
|      Noem| 43|
|      Djgd| 44|
+----------+---+



In [22]:
# Combine two conditions with `&`; each comparison is wrapped in its own
# parentheses.
(
    df
    .filter((df["age"] > 25) & (df["gender"] == 'F'))
    .select("first_name", "age")
    .show()
)

+----------+---+
|first_name|age|
+----------+---+
|     Ededd| 43|
|      Noem| 43|
+----------+---+



In [23]:
# where() called with the same condition as the filter() cell; the saved output
# is identical.
(
    df
    .where((df["age"] > 25) & (df["gender"] == 'F'))
    .select("first_name", "age")
    .show()
)

+----------+---+
|first_name|age|
+----------+---+
|     Ededd| 43|
|      Noem| 43|
+----------+---+



In [24]:
df.filter?  #c查看帮助

In [25]:
# Filter, then sort the full rows by first_name and then age.
(
    df
    .filter((df["age"] > 25) & (df["gender"] == 'F'))
    .orderBy("first_name", "age")
    .show()
)

+----------+---------+------+---+
|first_name|last_name|gender|age|
+----------+---------+------+---+
|     Ededd|   Shankj|     F| 43|
|      Noem|      Lii|     F| 43|
+----------+---------+------+---+



In [33]:
# Sort by age ascending, then first_name descending. The column list and the
# `ascending` list are two separate arguments, separated by a comma.
(
    df
    .filter(df["age"] > 25)
    .select("first_name", "age")
    .orderBy(["age", "first_name"], ascending=[1, 0])
    .show()
)

+----------+---+
|first_name|age|
+----------+---+
|      Noem| 43|
|     Ededd| 43|
|      Djgd| 44|
+----------+---+



In [37]:
# Give the computed boolean column a readable name with alias().
df.select(
    df.first_name,
    df.gender,
    (df.age < 24).alias("young"),
).show()

+----------+------+-----+
|first_name|gender|young|
+----------+------+-----+
|     Ededd|     F|false|
|      Noem|     F|false|
|      Rgfj|     M| true|
|      Djgd|     M|false|
+----------+------+-----+



In [7]:
# Expose `df` to Spark SQL under the name "names".
# registerTempTable() has been deprecated since Spark 2.0;
# createOrReplaceTempView() is its replacement.
df.createOrReplaceTempView("names")

In [36]:
# SQL version of a computed boolean column, run against the "names" temp table.
spark.sql(
    "SELECT first_name,last_name,age,age<30 AS young FROM names"
).show()

+----------+---------+---+-----+
|first_name|last_name|age|young|
+----------+---------+---+-----+
|     Ededd|   Shankj| 43|false|
|      Noem|      Lii| 43|false|
|      Rgfj|       Wu| 23| true|
|      Djgd|     Sjdh| 44|false|
+----------+---------+---+-----+



In [39]:
# Group by age and count the rows in each group.
(
    df
    .groupBy("age")
    .count()
    .show()
)

+---+-----+
|age|count|
+---+-----+
| 43|    2|
| 44|    1|
| 23|    1|
+---+-----+



In [40]:
# Per-age count written as SQL against the "names" temp table.
spark.sql(
    "SELECT age,count(age) FROM names GROUP BY age"
).show()

+---+----------+
|age|count(age)|
+---+----------+
| 43|         2|
| 44|         1|
| 23|         1|
+---+----------+



In [None]:
# TODO: the path argument is an empty-string placeholder and this cell has not
# been executed; supply the JSON file to read before running it.
df2 =spark.read.json('')