In [1]:
# Must be included at the beginning of each new notebook. Remember to change the app name.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BDAS').getOrCreate()

In [2]:
# create correct data schema.
from pyspark.sql.types import *
schema = StructType([StructField('No',IntegerType(),True),
               StructField('year',IntegerType(),True),
               StructField('month',IntegerType(),True),
               StructField('day',IntegerType(),True),
               StructField('hour',IntegerType(),True),
               StructField('PM2.5',FloatType(),True),
               StructField('PM10',FloatType(),True),
               StructField('SO2',FloatType(),True),
               StructField('NO2',FloatType(),True),
               StructField('CO',FloatType(),True),
               StructField('O3',FloatType(),True),
               StructField('TEMP',FloatType(),True),
               StructField('PRES',FloatType(),True),
               StructField('DEWP',FloatType(),True),
               StructField('RAIN',FloatType(),True),
               StructField('wd',StringType(),True),
               StructField('WSPM',FloatType(),True),
               StructField('station',StringType(),True)])

In [3]:
df_with_schema = spark.read.format("csv") \
      .option("header", True) \

      .load("../DataSet/PRSA_Data_Aotizhongxin_20130301-20170228.csv")
df_with_schema.show()

+---+----+-----+---+----+-----+----+----+----+-----+----+----+------+-----+----+---+----+------------+
| No|year|month|day|hour|PM2.5|PM10| SO2| NO2|   CO|  O3|TEMP|  PRES| DEWP|RAIN| wd|WSPM|     station|
+---+----+-----+---+----+-----+----+----+----+-----+----+----+------+-----+----+---+----+------------+
|  1|2013|    3|  1|   0|  4.0| 4.0| 4.0| 7.0|300.0|77.0|-0.7|1023.0|-18.8| 0.0|NNW| 4.4|Aotizhongxin|
|  2|2013|    3|  1|   1|  8.0| 8.0| 4.0| 7.0|300.0|77.0|-1.1|1023.2|-18.2| 0.0|  N| 4.7|Aotizhongxin|
|  3|2013|    3|  1|   2|  7.0| 7.0| 5.0|10.0|300.0|73.0|-1.1|1023.5|-18.2| 0.0|NNW| 5.6|Aotizhongxin|
|  4|2013|    3|  1|   3|  6.0| 6.0|11.0|11.0|300.0|72.0|-1.4|1024.5|-19.4| 0.0| NW| 3.1|Aotizhongxin|
|  5|2013|    3|  1|   4|  3.0| 3.0|12.0|12.0|300.0|72.0|-2.0|1025.2|-19.5| 0.0|  N| 2.0|Aotizhongxin|
|  6|2013|    3|  1|   5|  5.0| 5.0|18.0|18.0|400.0|66.0|-2.2|1025.6|-19.6| 0.0|  N| 3.7|Aotizhongxin|
|  7|2013|    3|  1|   6|  3.0| 3.0|18.0|32.0|500.0|50.0|-2.6|1026.5|-19.

In [4]:
df_with_schema.printSchema()

root
 |-- No: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- PM2.5: float (nullable = true)
 |-- PM10: float (nullable = true)
 |-- SO2: float (nullable = true)
 |-- NO2: float (nullable = true)
 |-- CO: float (nullable = true)
 |-- O3: float (nullable = true)
 |-- TEMP: float (nullable = true)
 |-- PRES: float (nullable = true)
 |-- DEWP: float (nullable = true)
 |-- RAIN: float (nullable = true)
 |-- wd: string (nullable = true)
 |-- WSPM: float (nullable = true)
 |-- station: string (nullable = true)



In [5]:
new_rename_df = df_with_schema.withColumnRenamed("DEWP","DewPointTempeature") \
    .withColumnRenamed("wd","WindDirection")\
    .withColumnRenamed("WSPM","WindSpend")\
    .withColumnRenamed("PM2.5","PM25")
new_rename_df.show()

+---+----+-----+---+----+----+----+----+----+-----+----+----+------+------------------+----+-------------+---------+------------+
| No|year|month|day|hour|PM25|PM10| SO2| NO2|   CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|     station|
+---+----+-----+---+----+----+----+----+----+-----+----+----+------+------------------+----+-------------+---------+------------+
|  1|2013|    3|  1|   0| 4.0| 4.0| 4.0| 7.0|300.0|77.0|-0.7|1023.0|             -18.8| 0.0|          NNW|      4.4|Aotizhongxin|
|  2|2013|    3|  1|   1| 8.0| 8.0| 4.0| 7.0|300.0|77.0|-1.1|1023.2|             -18.2| 0.0|            N|      4.7|Aotizhongxin|
|  3|2013|    3|  1|   2| 7.0| 7.0| 5.0|10.0|300.0|73.0|-1.1|1023.5|             -18.2| 0.0|          NNW|      5.6|Aotizhongxin|
|  4|2013|    3|  1|   3| 6.0| 6.0|11.0|11.0|300.0|72.0|-1.4|1024.5|             -19.4| 0.0|           NW|      3.1|Aotizhongxin|
|  5|2013|    3|  1|   4| 3.0| 3.0|12.0|12.0|300.0|72.0|-2.0|1025.2|             -19.5| 0.

In [6]:
new_rename_df.count()

35064

In [7]:
new_rename_df.filter( (new_rename_df["WindDirection"] == 'NA')).show()

Py4JJavaError: An error occurred while calling o41.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 1 times, most recent failure: Lost task 0.0 in stage 4.0 (TID 4, localhost, executor driver): java.text.ParseException: Unparseable number: "NA"
	at java.text.NumberFormat.parse(NumberFormat.java:385)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$$anonfun$castTo$2.apply$mcF$sp(CSVInferSchema.scala:261)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$$anonfun$castTo$2.apply(CSVInferSchema.scala:261)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$$anonfun$castTo$2.apply(CSVInferSchema.scala:261)
	at scala.util.Try.getOrElse(Try.scala:79)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$.castTo(CSVInferSchema.scala:261)
	at org.apache.spark.sql.execution.datasources.csv.CSVRelation$$anonfun$csvParser$3.apply(CSVRelation.scala:125)
	at org.apache.spark.sql.execution.datasources.csv.CSVRelation$$anonfun$csvParser$3.apply(CSVRelation.scala:94)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$1$$anonfun$apply$2.apply(CSVFileFormat.scala:167)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$1$$anonfun$apply$2.apply(CSVFileFormat.scala:166)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:333)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2386)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
	at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2788)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2385)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2392)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2128)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2127)
	at org.apache.spark.sql.Dataset.withTypedCallback(Dataset.scala:2818)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2127)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2342)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:248)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.text.ParseException: Unparseable number: "NA"
	at java.text.NumberFormat.parse(NumberFormat.java:385)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$$anonfun$castTo$2.apply$mcF$sp(CSVInferSchema.scala:261)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$$anonfun$castTo$2.apply(CSVInferSchema.scala:261)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$$anonfun$castTo$2.apply(CSVInferSchema.scala:261)
	at scala.util.Try.getOrElse(Try.scala:79)
	at org.apache.spark.sql.execution.datasources.csv.CSVTypeCast$.castTo(CSVInferSchema.scala:261)
	at org.apache.spark.sql.execution.datasources.csv.CSVRelation$$anonfun$csvParser$3.apply(CSVRelation.scala:125)
	at org.apache.spark.sql.execution.datasources.csv.CSVRelation$$anonfun$csvParser$3.apply(CSVRelation.scala:94)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$1$$anonfun$apply$2.apply(CSVFileFormat.scala:167)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat$$anonfun$buildReader$1$$anonfun$apply$2.apply(CSVFileFormat.scala:166)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:109)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:377)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:225)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:827)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
from pyspark.sql.functions import col, when

In [None]:
KK = new_rename_df
for i in new_rename_df.columns:
    Kitty = KK.withColumn(i,when((col(i)=='NA'),None).otherwise(col(i)))
    KK = Kitty

In [None]:
after_delete = KK.na.drop()

In [None]:
after_delete.count()

In [None]:
after_delete