In [11]:
# OP 
import datetime as dt   
import time
import csv
import requests
import pandas as pd, numpy as np

# SPARK 
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from operator import add

In [12]:
# config 
conf = SparkConf().setAppName("LOAD PTT MYSQL DATABASE")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

## 1) Pyspark Read csv as Spark DataFrame

In [31]:
# load the data
df_boston = sqlContext.read\
                      .format('com.databricks.spark.csv')\
                      .options(header='true', inferschema='true')\
                      .load('boston.csv')

In [37]:
type(df_boston)

pyspark.sql.dataframe.DataFrame

In [36]:
df_boston

DataFrame[CRIM: double, ZN: double, INDUS: double, CHAS: double, NOX: double, RM: double, AGE: double, DIS: double, RAD: double, TAX: double, PTRATIO: double, B: double, LSTAT: double, price: double]

In [38]:
df_boston.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: double (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: double (nullable = true)
 |-- TAX: double (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- price: double (nullable = true)



In [39]:
df_boston.columns

['CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'price']

In [41]:
df_boston.describe().show()

+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|summary|              CRIM|                ZN|             INDUS|              CHAS|                NOX|                RM|               AGE|              DIS|              RAD|               TAX|           PTRATIO|                 B|             LSTAT|             price|
+-------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+
|  count|               506|               506|               506|               506|                506|               506|               506|              506|              

In [54]:
df_boston.show()

+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|
+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+
|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2|
|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|
|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9|
|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1|
|0.21124|12.5| 7.87| 0.0|0.524|5

## 2) Manually set DataFrame schema

In [43]:
from pyspark.sql.types import (StructField, StructType,
                               IntegerType, StringType, LongType)

In [48]:
data_schema = [StructField('CRIM',StringType(), True )]

In [49]:
final_struc = StructType(fields= data_schema)

In [50]:
# reload the csv BUT WITH PRE-DEFINED SCHEMA AS ABOVE 
df_boston_updated = sqlContext.read\
                      .format('com.databricks.spark.csv')\
                      .options(header='true', inferschema='true', shema= final_struc)\
                      .load('boston.csv')

In [52]:
df_boston_updated.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: double (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: double (nullable = true)
 |-- TAX: double (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- price: double (nullable = true)



## 3) Select one column from Spark DF 

In [63]:
# pandas way 
# which is not working here 
type(df_boston['CRIM'])
# df_boston['CRIM'].show() <--- not work

pyspark.sql.column.Column

In [62]:
# pyspark way 
type(df_boston.select('CRIM'))

pyspark.sql.dataframe.DataFrame

In [71]:
# SELECT 1 coluumn
df_boston.select('CRIM').show()

+-------+
|   CRIM|
+-------+
|0.00632|
|0.02731|
|0.02729|
|0.03237|
|0.06905|
|0.02985|
|0.08829|
|0.14455|
|0.21124|
|0.17004|
|0.22489|
|0.11747|
|0.09378|
|0.62976|
|0.63796|
|0.62739|
|1.05393|
| 0.7842|
|0.80271|
| 0.7258|
+-------+
only showing top 20 rows



In [70]:
# SELECT multiple coluumns
df_boston.select(['CRIM','B']).show()

+-------+------+
|   CRIM|     B|
+-------+------+
|0.00632| 396.9|
|0.02731| 396.9|
|0.02729|392.83|
|0.03237|394.63|
|0.06905| 396.9|
|0.02985|394.12|
|0.08829| 395.6|
|0.14455| 396.9|
|0.21124|386.63|
|0.17004|386.71|
|0.22489|392.52|
|0.11747| 396.9|
|0.09378| 390.5|
|0.62976| 396.9|
|0.63796|380.02|
|0.62739|395.62|
|1.05393|386.85|
| 0.7842|386.75|
|0.80271|288.99|
| 0.7258|390.95|
+-------+------+
only showing top 20 rows



In [68]:
df_boston.head(2)

[Row(CRIM=0.00632, ZN=18.0, INDUS=2.31, CHAS=0.0, NOX=0.538, RM=6.575, AGE=65.2, DIS=4.09, RAD=1.0, TAX=296.0, PTRATIO=15.3, B=396.9, LSTAT=4.98, price=24.0),
 Row(CRIM=0.02731, ZN=0.0, INDUS=7.07, CHAS=0.0, NOX=0.469, RM=6.421, AGE=78.9, DIS=4.9671, RAD=2.0, TAX=242.0, PTRATIO=17.8, B=396.9, LSTAT=9.14, price=21.6)]

In [69]:
df_boston.head(2)[0]

Row(CRIM=0.00632, ZN=18.0, INDUS=2.31, CHAS=0.0, NOX=0.538, RM=6.575, AGE=65.2, DIS=4.09, RAD=1.0, TAX=296.0, PTRATIO=15.3, B=396.9, LSTAT=4.98, price=24.0)

### 4) Create a  new column

In [84]:
# add new column : colX
df_boston.withColumn('colX', df_boston['B']).show()

+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|  colX|
+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+
|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0| 396.9|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6| 396.9|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|392.83|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|394.63|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2| 396.9|
|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|394.12|
|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9| 395.6|
|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|

In [78]:
# add new column : colY
df_boston.withColumn('colY', df_boston['B']*2).show()

+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|     B|LSTAT|price|  colY|
+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+------+
|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0| 793.8|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6| 793.8|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|785.66|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|789.26|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2| 793.8|
|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|788.24|
|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9| 791.2|
|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|

## 5) Rename columns 

In [83]:
# rename column "B" -> "BBB"
df_boston.withColumnRenamed('B', 'BBB').show()

+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|  TAX|PTRATIO|   BBB|LSTAT|price|
+-------+----+-----+----+-----+-----+-----+------+---+-----+-------+------+-----+-----+
|0.00632|18.0| 2.31| 0.0|0.538|6.575| 65.2|  4.09|1.0|296.0|   15.3| 396.9| 4.98| 24.0|
|0.02731| 0.0| 7.07| 0.0|0.469|6.421| 78.9|4.9671|2.0|242.0|   17.8| 396.9| 9.14| 21.6|
|0.02729| 0.0| 7.07| 0.0|0.469|7.185| 61.1|4.9671|2.0|242.0|   17.8|392.83| 4.03| 34.7|
|0.03237| 0.0| 2.18| 0.0|0.458|6.998| 45.8|6.0622|3.0|222.0|   18.7|394.63| 2.94| 33.4|
|0.06905| 0.0| 2.18| 0.0|0.458|7.147| 54.2|6.0622|3.0|222.0|   18.7| 396.9| 5.33| 36.2|
|0.02985| 0.0| 2.18| 0.0|0.458| 6.43| 58.7|6.0622|3.0|222.0|   18.7|394.12| 5.21| 28.7|
|0.08829|12.5| 7.87| 0.0|0.524|6.012| 66.6|5.5605|5.0|311.0|   15.2| 395.6|12.43| 22.9|
|0.14455|12.5| 7.87| 0.0|0.524|6.172| 96.1|5.9505|5.0|311.0|   15.2| 396.9|19.15| 27.1|
|0.21124|12.5| 7.87| 0.0|0.524|5

## 6) Pyspark SQL

In [90]:
# create a temp SQL view from df 
df_boston.createOrReplaceTempView('BOSTON')

In [91]:
result=sqlContext.sql("select B from BOSTON LIMIT 10").show()

+------+
|     B|
+------+
| 396.9|
| 396.9|
|392.83|
|394.63|
| 396.9|
|394.12|
| 395.6|
| 396.9|
|386.63|
|386.71|
+------+

