# !Getting started

## !!Starting Point: SparkSession

In [None]:
from pyspark.sql import SparkSession

#### 在spark2.0以后，SparkSession创建SQLContext，并封装了sparkContext,  config就是sparkconf

In [None]:
# Build the session, or reuse the one that is already running. The chain is
# parenthesised so no backslash line continuations are needed.
spark = (
    SparkSession.builder
    .appName("python spark sql example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

## !!creating dataFrames

In [3]:
df=spark.read.json("/user/gongxf/spark/examples/src/main/resources/people.json")

In [4]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



## !!Untyped Dataset Operations (aka DataFrame Operations)

In [None]:
df.printSchema()

In [None]:
df.select("name").show()

In [None]:
df.select(df['name'],df['age']+1).show()

In [None]:
df.filter(df['age']>21).show()

In [None]:
df.groupBy("age").count().show()

## !!Running SQL Queries Programmatically

In [None]:
df.createOrReplaceTempView("people")

In [None]:
sqlDF=spark.sql("select * from people")

In [None]:
sqlDF.show()

## !!Global Temporary view

In [None]:
# Use the replace variant so that re-running this cell does not fail because
# the global temp view "people" is already registered in this application.
df.createOrReplaceGlobalTempView("people")

In [None]:
spark.sql("select * from global_temp.people").show()

In [None]:
spark.newSession().sql("select * from global_temp.people").show()

## !! Interoperating with RDDs

### !!! inferring the schema using reflection

In [None]:
from pyspark.sql import Row

In [None]:
sc=spark.sparkContext

In [None]:
lines=sc.textFile("/user/gongxf/spark/examples/src/main/resources/people.txt")

In [None]:
parts=lines.map(lambda l:l.split(","))

In [None]:
people=parts.map(lambda p:Row(name=p[0],age=int(p[1])))

In [None]:
schemaPeople=spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

In [None]:
teenageres=spark.sql("select name from people where age>=13 and age<=19")

In [None]:
teenageres.show()

In [None]:
teenNames=teenageres.rdd.map(lambda p:"name"+p.name).collect()

In [None]:
for name in teenNames:
    print(name)

### !!! programmatically specifying the schema

In [None]:
from pyspark.sql.types import *

In [None]:
sc=spark.sparkContext

In [None]:
lines=sc.textFile("/user/gongxf/spark/examples/src/main/resources/people.txt")

In [None]:
parts=lines.map(lambda l:l.split(","))
people=parts.map(lambda p:(p[0],p[1].strip()))

In [None]:
schemaString="name age"

In [None]:
fields=[StructField(field_name,StringType(),True) for field_name in schemaString.split()]

In [None]:
schema=StructType(fields)

In [None]:
schemaPeople=spark.createDataFrame(people,schema)

In [None]:
schemaPeople.createOrReplaceTempView("people")

In [None]:
results=spark.sql("select name from people")

In [None]:
results.show()

# !Data Sources

## !! Generic load/save functions

In [None]:
df=spark.read.load("/user/gongxf/spark/examples/src/main/resources/users.parquet")

In [None]:
df.select("name","favorite_color").write.save("/user/gongxf/spark/examples/src/main/resources/namesAndFavColors.parquet")

### !!!Manually Specifying Options

In [None]:
df=spark.read.load("/user/gongxf/spark/examples/src/main/resources/people.json"
                  ,format="json")

In [None]:
df.select("name","age").write.save("namesAndAgess.parquet")

In [None]:
df=spark.read.load("/user/gongxf/spark/examples/src/main/resources/people.csv"
                  ,format='csv',sep=";",inferSchema='true',header="true")

In [None]:
df.show()

### !!!run sql on files directly

In [None]:
df=spark.sql("select * from parquet.`/user/gongxf/namesAndAgess.parquet`")

In [None]:
df.show()

### !!! save modes

### !!! Saving to Persistent tables

###  !!! bucketing,sorting and partitioning

In [None]:
df.show()

In [None]:
df.write.bucketBy(42,"name").sortBy("age").saveAsTable("people_bucketed1")

In [None]:
df=spark.read.parquet("/user/gongxf/spark/examples/src/main/resources/users.parquet")

In [None]:
df.write.partitionBy("favorite_color").bucketBy(42,"name")\
.saveAsTable("people_partitioned_bucketed")

## !!parquet files

### !!!loading data programmatically

In [None]:
peopleDF=spark.read.json("/user/gongxf/spark/examples/src/main/resources/people.json")

In [None]:
peopleDF.write.parquet("people.parquet")

In [None]:
parquetFile=spark.read.parquet("people.parquet")

In [None]:
parquetFile.createOrReplaceTempView("parquetFile")

In [None]:
teenagers=spark.sql("select name from parquetFile where age>=12 and age<=19")

In [None]:
# Show the result of the parquetFile query assigned to `teenagers` just above.
# The similarly named `teenageres` is the DataFrame from the earlier RDD section.
teenagers.show()

### !!!Schema Merging

In [None]:
from pyspark.sql import Row

In [None]:
sc=spark.sparkContext

In [None]:
squaresDF=spark.createDataFrame(sc.parallelize(range(1,6))\
            .map(lambda i:Row(single=i,double=i**2)))

In [None]:
squaresDF.write.parquet("data/test_table/key=1")

In [None]:
cubesDF=spark.createDataFrame(sc.parallelize(range(6,11))\
        .map(lambda i:Row(single=i,triple=i**3)))

In [None]:
cubesDF.write.parquet("data/test_table/key=2")

In [None]:
mergedDF=spark.read.option("mergeSchema","true").parquet("data/test_table")

In [None]:
mergedDF.printSchema()

## Hive metastore Parquet table conversion

#### Metadata Refreshing

In [None]:
# spark.catalog.refreshTable("aa")

### !!!json Datasets

In [None]:
sc=spark.sparkContext

In [None]:
path="/user/gongxf/spark/examples/src/main/resources/people.json"

In [None]:
peopleDF=spark.read.json(path)

In [None]:
peopleDF.printSchema()

In [None]:
peopleDF.createOrReplaceTempView("people")

In [None]:
teenagerNamesDF=spark.sql("select name from people where age between 13 and 19")

In [None]:
teenagerNamesDF.show()

In [None]:
jsonStrings=['{"name":"Yin","address":{"city":"columbus","state":"ohio"}}']

In [None]:
otherPeopleRDD=sc.parallelize(jsonStrings)

In [None]:
otherPeople=spark.read.json(otherPeopleRDD)

In [None]:
otherPeople.show()

## !!hive table

In [None]:
from os.path import expanduser,join,abspath

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [None]:
warehouse_localtion=abspath('spark-warehouse')

In [None]:
spark=SparkSession\
.builder\
.appName("python spark")\
.config("spark.sql.warehouse.dir",warehouse_localtion)\
.enableHiveSupport()\
.getOrCreate()

In [None]:
spark.sql('create database if not exists kk')
spark.sql("use kk")

In [None]:
spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")

In [None]:
spark.sql("load data local inpath 'data_examples/resources/kv1.txt' into table src")

In [None]:
spark.sql("select * from src").show()

In [None]:
spark.sql("select count(*) from src").show()

In [None]:
sqlDF=spark.sql("select key,value from src where key<10 order by key")

In [None]:
sqlDF.show()

In [None]:
stringsDS=sqlDF.rdd.map(lambda row:"key:%d,Value:%s" %(row.key,row.value))

In [None]:
for record in stringsDS.collect():
    print(record)

In [None]:
Record=Row("key","value")

In [None]:
# An empty list gives Spark nothing to infer a schema from, so the schema is
# passed explicitly; its columns match the Record("key","value") row type.
recordsDF=spark.createDataFrame([],"key int,value string")

In [None]:
recordsDF = spark.createDataFrame([Record(i, "val_" + str(i)) for i in range(1, 101)])

In [None]:
recordsDF.createOrReplaceTempView("records")

In [None]:
spark.sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show()

## !!PySpark Usage Guide for Pandas with Apache Arrow

### !!!Enabling for Conversion to/from Pandas

In [None]:
import numpy as np

In [None]:
import pandas as pd

In [None]:
spark.conf.set("spark.sql.execution.arrow.enabled","true")

In [None]:
pdf=pd.DataFrame(np.random.rand(100,3))

In [None]:
pdf.head(10)

In [None]:
df=spark.createDataFrame(pdf)

In [None]:
df.show()

In [None]:
result_pdf=df.select("*").toPandas()

In [None]:
result_pdf.head(3)

### !!!Pandas UDFs (a.k.a. Vectorized UDFs)

In [None]:
import pandas as pd

In [None]:
from pyspark.sql.functions import col,pandas_udf

In [None]:
from pyspark.sql.types import LongType

In [None]:
def multiply_func(a, b):
    """Return the product of ``a`` and ``b``.

    The ``*`` operator is applied as-is, so the result is whatever the
    operands define for multiplication (element-wise for pandas Series).
    """
    product = a * b
    return product

In [None]:
multiply=pandas_udf(multiply_func,returnType=LongType())

In [None]:
x=pd.Series([1,2,3])

In [None]:
x

In [None]:
print(multiply_func(x,x))

In [None]:
df=spark.createDataFrame(pd.DataFrame(x,columns=["x"]))

In [None]:
df.select(col("x")).show()

In [None]:
df.select(multiply(col("x"),col("x"))).show()

## !!Grouped map

In [None]:
from pyspark.sql.functions import pandas_udf,PandasUDFType

In [None]:
df=spark.createDataFrame(\
    [(1,1.0),(1,2.0),(2,3.0),(2,5.0),(2,10.0)],("id","v")\
)

In [None]:
@pandas_udf("id long,v double",PandasUDFType.GROUPED_MAP)
def substract_mean(pdf):
    """Return ``pdf`` with column ``v`` replaced by ``v`` minus its mean.

    The mean is taken over the ``v`` column of the frame passed in; all
    other columns are carried through unchanged by ``assign``.
    """
    centered = pdf.v - pdf.v.mean()
    return pdf.assign(v=centered)

In [None]:
df.groupby("id").apply(substract_mean).show()

In [None]:
df.show()