# [[HW 1_1 SparkSQL 숙제]]

# HDFS 실행 (MASTER_IP:50070으로 실행 확인)

In [1]:
%%bash
# Launch the HDFS daemons (namenode, datanodes, secondary namenode).
# Daemons that are already running are reported ("Stop it first") rather than restarted.
start-dfs.sh

Starting namenodes on [master]
master: namenode is running as process 11617.  Stop it first.
Starting datanodes
worker2: datanode is running as process 9439.  Stop it first.
worker1: datanode is running as process 9439.  Stop it first.
worker3: datanode is running as process 9422.  Stop it first.
Starting secondary namenodes [master]
master: secondarynamenode is running as process 11867.  Stop it first.


# Wikipedia data가 HDFS에 올라와 있는지 확인

In [2]:
%%bash
# Show what is currently stored under the wiki directory on HDFS
WIKI_DIR="hdfs://master:9000/wiki"
hdfs dfs -ls "$WIKI_DIR"

ls: `hdfs://master:9000/wiki': No such file or directory


# 만약 올라와 있지 않다면, 데이터 업로드

In [3]:
%%bash
# Upload the files from the local disk to HDFS, but only when the target directory is missing.
# When the target already exists, `hdfs dfs -put <dir> <target>` nests the source inside it
# (<target>/spark_inputs), so an unguarded re-run would duplicate the dataset.
WIKI_DIR="hdfs://master:9000/wiki"
if hdfs dfs -test -d "$WIKI_DIR"; then
    echo "$WIKI_DIR already exists; skipping upload"
else
    hdfs dfs -put "/home/ubuntu/spark_inputs/" "$WIKI_DIR"
fi

# (혹시 실수로 데이터 업로드 두번 하셨다면, 다음 커맨드를 사용해서 중복 데이터를 제거해주세요)

In [None]:
%%bash
# Intentionally disabled. Uncomment only if the upload was run twice by mistake,
# to delete the nested duplicate copy of the data.
# hdfs dfs -rm -R "hdfs://master:9000/wiki/spark_inputs"

# 스파크 분산환경 세팅 (MASTER_IP:8080으로 실행 확인)

In [4]:
%%bash
# Write the worker host list (one host per line), then launch the Spark master and workers
SPARK_SCRIPTS=/home/ubuntu/spark_scripts
printf '%s\n' worker1 worker2 worker3 > "$SPARK_SCRIPTS/workers"
"$SPARK_SCRIPTS/start_cluster.sh"

starting org.apache.spark.deploy.master.Master, logging to /home/ubuntu/spark/logs/spark-ubuntu-org.apache.spark.deploy.master.Master-1-master.out
[1] 09:37:23 [SUCCESS] worker1
starting org.apache.spark.deploy.worker.Worker, logging to /home/ubuntu/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-worker1.out
[2] 09:37:23 [SUCCESS] worker3
starting org.apache.spark.deploy.worker.Worker, logging to /home/ubuntu/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-worker3.out
[3] 09:37:23 [SUCCESS] worker2
starting org.apache.spark.deploy.worker.Worker, logging to /home/ubuntu/spark/logs/spark-ubuntu-org.apache.spark.deploy.worker.Worker-1-worker2.out


# 스파크 실행 (MASTER_IP:4040으로 실행 확인)

In [5]:
import findspark
import os

# Must run before `import pyspark`: it makes the Spark installation importable
findspark.init('/home/ubuntu/spark')

import pyspark

# Executor configuration (spark.executor.memory = 2g)
sparkconf = pyspark.SparkConf()
sparkconf.set('spark.executor.memory', '2g')

# Connect to the cluster master and create (or reuse) the session -- this deploys the executors
ss = (
    pyspark.sql.SparkSession.builder
    .appName("DS2")
    .master("spark://master:7077")
    .config(conf=sparkconf)
    .getOrCreate()
)

# 스파크 Dataframe 생성

In [6]:
# The RDD API (textFile / map) is reached through the SparkContext, not the SparkSession
sc = ss.sparkContext

# Read the data from HDFS
lines = sc.textFile("hdfs://master:9000/wiki")

# Split each line into its space-separated fields
columns = lines.map(lambda line: tuple(line.split(" ")))

# Build the DataFrame; at this point every field is still a string
df_raw = ss.createDataFrame(columns, ['project', 'title', 'count', 'size'])

# Cast the numeric columns so that comparisons, ORDER BY and aggregates operate on numbers
# instead of strings (string ordering is lexicographic, e.g. '900' > '1000').
# With Spark's default (non-ANSI) settings, values that do not parse become NULL.
# Bracket access is required because `df_raw.count` is the DataFrame method, not the column.
df = (
    df_raw
    .withColumn('count', df_raw['count'].cast('long'))
    .withColumn('size', df_raw['size'].cast('long'))
)

In [15]:
import pandas as pd


# 숙제 SQL query 실행 및 결과물 콘솔에 프린트

In [33]:
# Expose the DataFrame to Spark SQL under the name `wiki`
df.createOrReplaceTempView('wiki')

# Rows of project 'de', excluding the title 'Woodkid', with 800 <= count < 1000,
# highest count first. `count` is cast explicitly so that both the range filter and the
# ordering are numeric even when the column is stored as a string. A triple-quoted query
# avoids backslash continuations inside the literal.
no1 = ss.sql("""
    select project, title, `count`
    from wiki
    where project = 'de'
      and title <> 'Woodkid'
      and cast(`count` as bigint) >= 800
      and cast(`count` as bigint) < 1000
    order by cast(`count` as bigint) desc
""")
no1.show()

+-------+--------------------+-----+
|project|               title|count|
+-------+--------------------+-----+
|     de|Wikipedia:Auskunf...|  900|
|     de|Spezial:Beobachtu...|  882|
|     de|Spezial:Beobachtu...|  804|
+-------+--------------------+-----+



In [31]:
# (title, owner) rows for the lookup table
ownertable_raw = [
    ('Woodkid', 'Lila'),
    ('Sia', 'Jane'),
    ('Ryuichi_Sakamoto', 'Sam'),
]

# Turn the rows into an RDD, then into a DataFrame with named columns
ownerrdd = sc.parallelize(ownertable_raw)
ownerdf = ss.createDataFrame(ownerrdd, schema=['title', 'owner'])
ownerdf.show()

+----------------+-----+
|           title|owner|
+----------------+-----+
|         Woodkid| Lila|
|             Sia| Jane|
|Ryuichi_Sakamoto|  Sam|
+----------------+-----+



In [34]:
# Expose the owner lookup table to Spark SQL under the name `owner`
ownerdf.createOrReplaceTempView('owner')

# Average count per owner. The join key is spelled out (`using (title)`) instead of relying
# on NATURAL JOIN, which silently joins on every column name the two views happen to share.
# A triple-quoted query avoids backslash continuations, where token separation depended on
# the indentation of the following line.
no2 = ss.sql("""
    select owner, avg(`count`) as avg_count
    from wiki
    join owner using (title)
    group by owner
""")
no2.show()

+-----+---------+
|owner|avg_count|
+-----+---------+
| Lila|      2.3|
|  Sam|     14.0|
| Jane|     10.0|
+-----+---------+

