# 创建数据表

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the notebook's SparkSession with Hive support enabled.
# getOrCreate() hands back an already-running session if one exists,
# otherwise it starts a new one with the settings below.
spark = (
    SparkSession.builder
    .appName("spark-hive")
    .config("spark.ui.port", "4041")  # explicit port for the Spark web UI
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Load the sample people.json into a DataFrame; Spark infers the schema from the JSON records.
# NOTE(review): the path is relative, so where it resolves depends on the session's
# default filesystem / working directory — confirm before re-running elsewhere.
peopleDF = spark.read.json("spark/examples/src/main/resources/people.json")

In [5]:
# Print the loaded rows as a text table to sanity-check the columns and values.
peopleDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



### 会在 Spark SQL 里建立 people 数据表

In [9]:
# Persist peopleDF as the parquet-backed table "people".
# NOTE(review): "append" adds these rows again every time the cell is re-run,
# so repeated runs accumulate duplicate rows in the table.
(
    peopleDF.write
    .format("parquet")
    .mode("append")
    .saveAsTable("people")
)

## Basics

In [10]:
# Read README.md as plain text into a DataFrame with a single string column named `value`.
# NOTE(review): the path is hard-coded under one user's directory (/user/gongxf) — adjust per environment.
textFile=spark.read.text("/user/gongxf/spark/README.md")

In [12]:
# Persist the README lines as the parquet-backed table "text".
# NOTE(review): "append" adds the same lines again on every re-run of this cell.
(
    textFile.write
    .format("parquet")
    .mode("append")
    .saveAsTable("text")
)

In [11]:
# Number of rows in textFile; spark.read.text yields one row per input line,
# so this is the line count of the README.
textFile.count()

103

In [8]:
# Fetch the first row; it comes back as a Row whose only field is `value`.
textFile.first()

Row(value='# Apache Spark')

In [14]:
# Keep only the lines that mention "Spark" and preview them
# (show() prints the first 20 rows, truncating long values).
# Swap .show() for .count() or .first() to get the number of matches
# or just the first matching line.
mentions_spark = textFile["value"].contains("Spark")
textFile.where(mentions_spark).show()

+--------------------+
|               value|
+--------------------+
|      # Apache Spark|
|Spark is a fast a...|
|rich set of highe...|
|and Spark Streami...|
|You can find the ...|
|   ## Building Spark|
|Spark is built us...|
|To build Spark an...|
|You can build Spa...|
|["Building Spark"...|
|For general devel...|
|The easiest way t...|
|Spark also comes ...|
|    ./bin/run-exa...|
|    MASTER=spark:...|
|Testing first req...|
|Spark uses the Ha...|
|Hadoop, you must ...|
|in the online doc...|
|Please review the...|
+--------------------+



## More on Dataset Operations

In [15]:
from pyspark.sql.functions import *

In [17]:
# Largest number of whitespace-separated tokens found on any single line.
# The regex is a raw string: "\s" inside a normal string literal is an invalid
# escape sequence that Python warns about, while r"\s+" has the same value.
# `max` here is pyspark.sql.functions.max (brought in by the wildcard import),
# not the Python builtin.
(
    textFile
    .select(size(split(textFile.value, r"\s+")).alias("numWords"))
    .agg(max(col("numWords")))
    .collect()
)

[Row(max(numWords)=22)]

In [19]:
# Word frequency table: split each line on runs of whitespace, explode the
# resulting arrays into one row per token, then count occurrences per token.
# The regex is a raw string so that "\s" is not treated as an (invalid)
# Python escape sequence; the pattern passed to Spark is unchanged.
wordCounts = (
    textFile
    .select(explode(split(textFile.value, r"\s+")).alias("word"))
    .groupBy("word")
    .count()
)

In [22]:
# Materialise every (word, count) row on the driver as a list of Row objects.
# NOTE(review): collect() returns the full result; wordCounts.show() would print only a preview.
wordCounts.collect()

[Row(word='online', count=1),
 Row(word='graphs', count=1),
 Row(word='["Parallel', count=1),
 Row(word='["Building', count=1),
 Row(word='thread', count=1),
 Row(word='documentation', count=3),
 Row(word='command,', count=2),
 Row(word='abbreviated', count=1),
 Row(word='overview', count=1),
 Row(word='rich', count=1),
 Row(word='set', count=2),
 Row(word='-DskipTests', count=1),
 Row(word='name', count=1),
 Row(word='page](http://spark.apache.org/documentation.html).', count=1),
 Row(word='["Specifying', count=1),
 Row(word='stream', count=1),
 Row(word='run:', count=1),
 Row(word='not', count=1),
 Row(word='programs', count=2),
 Row(word='tests', count=2),
 Row(word='./dev/run-tests', count=1),
 Row(word='will', count=1),
 Row(word='[run', count=1),
 Row(word='particular', count=2),
 Row(word='option', count=1),
 Row(word='Alternatively,', count=1),
 Row(word='by', count=1),
 Row(word='must', count=1),
 Row(word='using', count=5),
 Row(word='you', count=4),
 Row(word='MLlib', count=