# Spark Core API

In [23]:
data = [('Ailce',1),('Bob',2),('Charlie',3)]
type(data)

list

In [24]:
data1 = spark.createDataFrame(data, ['Name','Value']) # spark의 분산객체: 데이터프레임 형식
data1[0][1]

Column<'Name[1]'>

In [25]:
data1.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Ailce|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [26]:
data1.filter(data1.Name == 'Bob').show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [27]:
data1.filter(data1.Value > 2).show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



In [34]:
data1.createOrReplaceTempView('people')

In [35]:
spark.sql("SELECT * FROM people WHERE Name = 'Bob'").show()


+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



### RDD 객체 생성

In [71]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('pyspark example1').getOrCreate()
rdd = spark.sparkContext.parallelize([1,2,3,4,5])
rdd

ParallelCollectionRDD[1] at readRDDFromFile at PythonRDD.scala:289

In [72]:
rdd.take(5) #rdd 객체를 출력하는 함수 -n개를 지정

[1, 2, 3, 4, 5]

In [73]:
spark

### map 연산

In [9]:
squared_rdd = rdd.map(lambda x:x*x)
squared_rdd

PythonRDD[11] at RDD at PythonRDD.scala:53

In [10]:
rdd.take(3)

[1, 2, 3]

In [11]:
squared_rdd.take(3)

[1, 4, 9]

In [12]:
squared_rdd.collect()

[1, 4, 9, 16, 25]

# MLib(머신러닝 알고리즘과 도구 모음)

In [49]:
from pyspark.ml.regression import LinearRegression

In [51]:
import numpy as py

In [52]:
from pyspark.ml.feature import VectorAssembler

In [53]:
data_age = [('Ailce',25),('Bob',30),('Charlie',33)]

In [54]:
data2 = spark.createDataFrame(data_age, ['Name','Age']) # spark의 분산객체: 데이터프레임 형식x
data2.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Ailce| 25|
|    Bob| 30|
|Charlie| 33|
+-------+---+



In [55]:
assembler  = VectorAssembler(inputCols = ['Age'], outputCol = 'features')
vector_df =  assembler.transform(data2)
vector_df

DataFrame[Name: string, Age: bigint, features: vector]

In [58]:
lr = LinearRegression(featuresCol='features', labelCol='Age')
model = lr.fit(vector_df)

In [60]:
pred = model.transform(vector_df)
pred.show()

+-------+---+--------+-----------------+
|   Name|Age|features|       prediction|
+-------+---+--------+-----------------+
|  Ailce| 25|  [25.0]|24.99999999999993|
|    Bob| 30|  [30.0]|30.00000000000001|
|Charlie| 33|  [33.0]|33.00000000000006|
+-------+---+--------+-----------------+



In [61]:
spark.stop()

# Streaming(실시간 데이터 스트리밍 처리)

In [63]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

In [64]:
spark = SparkSession.builder.appName('pyspark example1').getOrCreate() # chaining

In [66]:
lines = spark.readStream.format('socket') \
    .option('host', 'localhost') \
    .option('port', 9999) \
    .load() #스트리밍 데이터


In [67]:
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

In [68]:
spark.stop

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x7fc9696e9c50>>