In [5]:
%%writefile requirements.txt

import findspark
findspark.init()

import pyspark
print(findspark.find())

from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster("local").setAppName("Spark Practice")

sc = SparkContext(conf=conf)

from pyspark.sql import SparkSession
spark = SparkSession(sc)

Overwriting requirements.txt


In [9]:
# %load requirements.txt

import findspark
findspark.init()

import pyspark
print(findspark.find())

from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster("local").setAppName("Spark Practice")

sc = SparkContext(conf=conf)

from pyspark.sql import SparkSession
spark = SparkSession(sc)


ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Spark Practice, master=local) created by __init__ at <ipython-input-4-0bb69b5bcd56>:13 

In [10]:
spark

# 1. JSON RDD를 만들고 데이터프레임으로 변환하기

In [11]:
stringJSONRDD = sc.parallelize((""" 
  { "id": "123",
    "name": "Katie",
    "age": 19,
    "eyeColor": "brown"
  }""",
   """{
    "id": "234",
    "name": "Michael",
    "age": 22,
    "eyeColor": "green"
  }""", 
  """{
    "id": "345",
    "name": "Simone",
    "age": 23,
    "eyeColor": "blue"
  }""")
)

In [13]:
stringJSONRDD

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

## 데이터프레임으로 변환하기

In [12]:
# Create DataFrame
swimmersJSON = spark.read.json(stringJSONRDD)

## 임시파일을 만들기 

In [14]:
# Create temporary table
swimmersJSON.createOrReplaceTempView("swimmersJSON")

In [16]:
swimmersJSON

DataFrame[age: bigint, eyeColor: string, id: string, name: string]

## 데이터프레임의 일부 데이터 보기

In [15]:
# DataFrame API
swimmersJSON.show()

+---+--------+---+-------+
|age|eyeColor| id|   name|
+---+--------+---+-------+
| 19|   brown|123|  Katie|
| 22|   green|234|Michael|
| 23|    blue|345| Simone|
+---+--------+---+-------+



## 데이터프레임의 스키마 보기

In [20]:
# Print the schema
swimmersJSON.printSchema()

root
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)



## 데이터의 일부를 가져오기

In [33]:
swimmersJSON.take(3)

[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='blue', id='345', name='Simone')]

## 직접 쿼리로 데이터 조회하기

In [21]:
# SQL Query
a = spark.sql("select * from swimmersJSON")

In [22]:
a

DataFrame[age: bigint, eyeColor: string, id: string, name: string]

## 데이터 전부를 가져오기 :

      많은 양을 처리할 때는 주의해야 한다. 
      특정 양을 보는 take, show를 사용한다.

In [28]:
b = a.collect()

In [32]:
b

[Row(age=19, eyeColor='brown', id='123', name='Katie'),
 Row(age=22, eyeColor='green', id='234', name='Michael'),
 Row(age=23, eyeColor='blue', id='345', name='Simone')]

In [29]:
type(b)

list

In [31]:
b[0]['age']

19

# 2 CSV RDD를 만들고 데이터프레임으로 변환하기

In [35]:
from pyspark.sql.types import *

## RDD 만들기

In [34]:
# Generate our own CSV data 
#   This way we don't have to access the file system yet.
stringCSVRDD = sc.parallelize([(123, 'Katie', 19, 'brown'), (234, 'Michael', 22, 'green'), (345, 'Simone', 23, 'blue')])


## 스키마 만들기 

In [36]:
# The schema is encoded in a string, using StructType we define the schema using various pyspark.sql.types
schemaString = "id name age eyeColor"
schema = StructType([
    StructField("id", LongType(), True),    
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])

## 데이터프레임으로 변환하기

     - RDD와 스키마를 전달해서 생성한다.

In [37]:
# Apply the schema to the RDD and Create DataFrame
swimmers = spark.createDataFrame(stringCSVRDD, schema)

## 쿼리를 사용하기 위해 임시 테이블을 만든다

In [38]:
# Creates a temporary view using the DataFrame
swimmers.createOrReplaceTempView("swimmers")

In [39]:
# Print the schema
#   Notice that we have redefined id as Long (instead of String)
swimmers.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)



## SQL 실행하기

In [40]:
# Execute SQL Query and return the data
spark.sql("select * from swimmers").show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [41]:
# Get count of rows in SQL
spark.sql("select count(1) from swimmers").show()

+--------+
|count(1)|
+--------+
|       3|
+--------+



In [42]:
swimmers.count()

3

## 데이터프레임을 쿼리 처리 사용하기

In [45]:
swimmers.select("age")

DataFrame[age: bigint]

### 처리된 결과를 보려면 show 메소드를 실행한다

In [44]:
swimmers.select("age").show()

+---+
|age|
+---+
| 19|
| 22|
| 23|
+---+



In [46]:
swimmers.select(swimmers.age).show()

+---+
|age|
+---+
| 19|
| 22|
| 23|
+---+



### 선택항 열에 제약을 하기

        - filter 메소드에 논리값을 지정한다

In [47]:
swimmers.select(swimmers.age, swimmers.id).filter(swimmers.age == 22)

DataFrame[age: bigint, id: bigint]

In [49]:
swimmers.select(swimmers.age, swimmers.id).filter(swimmers.age == 22).show()

+---+---+
|age| id|
+---+---+
| 22|234|
+---+---+



In [52]:
# Query id and age for swimmers with age = 22 via DataFrame API
swimmers.select("id", "age").filter("age = 22").show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [53]:
# Query id and age for swimmers with age = 22 in SQL
spark.sql("select id, age from swimmers where age = 22").show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



### 필터 내에 sql 문법에 따라 표시할 수도 있다

In [51]:
swimmers.select(swimmers.name, swimmers.eyeColor).filter("eyeColor like 'b%'").show()

+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



In [54]:
# Query name and eye color for swimmers with eye color starting with the letter 'b'
spark.sql("select name, eyeColor from swimmers where eyeColor like 'b%'").show()

+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



# 2. 데이터프레임 API 

In [55]:
# Show the values 
swimmers.show()

+---+-------+---+--------+
| id|   name|age|eyeColor|
+---+-------+---+--------+
|123|  Katie| 19|   brown|
|234|Michael| 22|   green|
|345| Simone| 23|    blue|
+---+-------+---+--------+



In [56]:
# Using Databricks `display` command to view the data easier
display(swimmers)

DataFrame[id: bigint, name: string, age: bigint, eyeColor: string]

In [57]:
# Get count of rows
swimmers.count()

3

# 3. 두 개의 데이터프레임 만들고 테이블 처리하기

In [58]:
# Set File Paths
flightPerfFilePath = "./data/departuredelays.csv"
airportsFilePath = "./data/airport-codes-na.txt"

In [59]:
# Obtain Airports dataset
airports = spark.read.csv(airportsFilePath, header='true', inferSchema='true', sep='\t')


In [60]:
airports

DataFrame[City: string, State: string, Country: string, IATA: string]

In [61]:
airports.createOrReplaceTempView("airports")

In [67]:
airports.printSchema()

root
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- IATA: string (nullable = true)



In [74]:
airports.count()

526

In [73]:
airports.select('City', 'IATA').show()

+-----------+----+
|       City|IATA|
+-----------+----+
| Abbotsford| YXX|
|   Aberdeen| ABR|
|    Abilene| ABI|
|      Akron| CAK|
|    Alamosa| ALS|
|     Albany| ABY|
|     Albany| ALB|
|Albuquerque| ABQ|
| Alexandria| AEX|
|  Allentown| ABE|
|   Alliance| AIA|
|     Alpena| APN|
|    Altoona| AOO|
|   Amarillo| AMA|
|Anahim Lake| YAA|
|  Anchorage| ANC|
|   Appleton| ATW|
|     Arviat| YEK|
|  Asheville| AVL|
|      Aspen| ASE|
+-----------+----+
only showing top 20 rows



In [62]:
# Obtain Departure Delays dataset
flightPerf = spark.read.csv(flightPerfFilePath, header='true')

In [64]:
flightPerf

DataFrame[date: string, delay: string, distance: string, origin: string, destination: string]

In [63]:
flightPerf.createOrReplaceTempView("FlightPerformance")

In [70]:
flightPerf.printSchema()

root
 |-- date: string (nullable = true)
 |-- delay: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)



In [75]:
flightPerf.count()

1391578

In [72]:
flightPerf.select("origin","destination").show()

+------+-----------+
|origin|destination|
+------+-----------+
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        DTW|
|   ABE|        ATL|
|   ABE|        ORD|
|   ABE|        DTW|
+------+-----------+
only showing top 20 rows



## SQL 

select a.City, f.origin, sum(f.delay) as Delays

  from FlightPerformance f
  
    join airports a
      on a.IATA = f.origin
      
 where a.State = 'WA'
 
 group by a.City, f.origin
 
 order by sum(f.delay) desc
 

In [77]:
# Query Sum of Flight Delays by City and Origin Code (for Washington State)
spark.sql("""select a.City, f.origin, sum(f.delay) as Delays 
                 from FlightPerformance f join airports a on a.IATA = f.origin 
                where a.State = 'WA' 
                 group by a.City, f.origin 
                 order by sum(f.delay) desc""").show()

+-------+------+--------+
|   City|origin|  Delays|
+-------+------+--------+
|Seattle|   SEA|159086.0|
|Spokane|   GEG| 12404.0|
|  Pasco|   PSC|   949.0|
+-------+------+--------+



## SQL 


select a.State, sum(f.delay) as Delays

  from FlightPerformance f
  
    join airports a
      on a.IATA = f.origin
      
 where a.Country = 'USA'
 
 group by a.State 

In [79]:
# Query Sum of Flight Delays by State (for the US)
spark.sql("""select a.State, sum(f.delay) as Delays 
                       from FlightPerformance f join airports a on a.IATA = f.origin 
                       where a.Country = 'USA' 
                       group by a.State """).show()

+-----+---------+
|State|   Delays|
+-----+---------+
|   SC|  80666.0|
|   AZ| 401793.0|
|   LA| 199136.0|
|   MN| 256811.0|
|   NJ| 452791.0|
|   OR| 109333.0|
|   VA|  98016.0|
| null| 397237.0|
|   RI|  30760.0|
|   WY|  15365.0|
|   KY|  61156.0|
|   NH|  20474.0|
|   MI| 366486.0|
|   NV| 474208.0|
|   WI| 152311.0|
|   ID|  22932.0|
|   CA|1891919.0|
|   CT|  54662.0|
|   NE|  59376.0|
|   MT|  19271.0|
+-----+---------+
only showing top 20 rows

