In [1]:
from pyspark import SparkConf, SparkContext 
import collections
from dataclasses import dataclass, field
from awsglue.context import GlueContext

# Build the Spark configuration one setting at a time: local master, app name "udemy".
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("udemy")

# Obtain the SparkContext through getOrCreate, passing the configuration above.
sc = SparkContext.getOrCreate(conf=conf)

# Wrap the context for Glue and take its SparkSession, which the DataFrame
# sections further down use as `spark`.
glueContext = GlueContext(sc)
spark = glueContext.spark_session

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.0 
Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::441656060858:role/AWSGlueServiceRole-thanhtt-datalake
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: d5ef52a9-abcd-4871-99df-71c5bd827c7b
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.0
--enable-glue-datacatalog true
Waiting for session d5ef52a9-abcd-4871-99df-71c5bd827c7b to get into ready status...
Session d5ef52a9-abcd-4871-99df-71c5bd827c7b has been created.



# RDD: countByValue, mapValues and reduceByKey functions

In [2]:
# Load the ml-100k u.data file as an RDD with one element per text line.
ratingsPath = "s3://thanhtt-0000-datalake/udemy/ml-100k/u.data"
lines = sc.textFile(ratingsPath)




In [3]:
# Keep only the third whitespace-separated field of every line; the cells below
# treat it as the rating value.
ratings = lines.map(lambda row: row.split()[2])




In [4]:
# Count how many times each distinct value occurs in `ratings`. The outcome is
# consumed through .items() in the next cell, i.e. it is used as a mapping of
# value -> count.
result = ratings.countByValue()




In [5]:
# Rebuild the counts as an OrderedDict whose keys are inserted in ascending order.
sortedResults = collections.OrderedDict()
for rating in sorted(result):
    sortedResults[rating] = result[rating]

# Print one "<rating> <count>" line per entry.
for key, value in sortedResults.items():
    print("%s %i" % (key, value))

1 6110
2 11370
3 27145
4 34174
5 21201


def parseLine(line):
    """Turn one comma-separated line into an ``(age, numFriends)`` tuple.

    The line is split on commas; the third field is returned as the integer
    age and the fourth as the integer number of friends.

    Raises:
        IndexError: if the line has fewer than four fields.
        ValueError: if either field is not a valid integer.
    """
    columns = line.split(',')
    return (int(columns[2]), int(columns[3]))

lines = sc.textFile("s3://thanhtt-0000-datalake/udemy/SparkCourse/fakefriends.csv")
rdd = lines.map(parseLine)  # (age, numFriends) pairs

# Pair every friend count with a 1, then add both parts up per age:
# age -> (total friends, number of people).
totalsByAge = rdd.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

totalsByAgeCollected = totalsByAge.collect()

# age -> average number of friends
averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])

results = averagesByAge.collect()

# The loop variable is deliberately not called `result`: that name holds the
# countByValue() output used by the rating-histogram cell, and rebinding it to
# a tuple here makes that cell fail when it is run again afterwards.
for ageAverage in results:
    print(ageAverage)

# RDD: filter function

def parseLine(line):
    """Parse one comma-separated line into ``(stationID, entryType, temperature)``.

    The first and third fields are returned unchanged as strings. The fourth
    field is read as a float and passed through the conversion formula below.

    Raises:
        IndexError: if the line has fewer than four fields.
        ValueError: if the fourth field is not a valid float.
    """
    columns = line.split(',')
    # Presumably tenths of a degree Celsius converted to Fahrenheit (the
    # printing code appends "F") -- TODO confirm the unit of the raw column.
    fahrenheit = float(columns[3]) * 0.1 * (9.0 / 5.0) + 32.0
    return (columns[0], columns[2], fahrenheit)

lines = sc.textFile("s3://thanhtt-0000-datalake/udemy/SparkCourse/1800.csv")
parsedLines = lines.map(parseLine)  # (stationID, entryType, temperature)

# Keep only the entries whose type contains "TMIN". This intermediate RDD gets
# its own name so that `minTemps` refers to exactly one thing: the per-station
# minimum computed below.
tminEntries = parsedLines.filter(lambda x: "TMIN" in x[1])

# Drop the entry type, leaving (stationID, temperature) pairs.
stationTemps = tminEntries.map(lambda x: (x[0], x[2]))

# Lowest temperature seen for each station.
minTemps = stationTemps.reduceByKey(lambda x, y: min(x, y))
results = minTemps.collect()

# The loop variable is not called `result`, so the countByValue() output bound
# to that name by the rating-histogram cell is left intact.
for stationMin in results:
    print(stationMin[0] + "\t{:.2f}F".format(stationMin[1]))


# RDD: flatMap function

import re

def normalizeWords(text):
    """Lower-case ``text`` and split it into words.

    Words are separated by runs of non-word characters (regex ``\\W+`` with the
    UNICODE flag).

    Returns:
        A list of non-empty lower-case strings. Empty or punctuation-only
        input gives an empty list.
    """
    tokens = re.compile(r'\W+', re.UNICODE).split(text.lower())
    # re.split yields '' when the text starts or ends with a separator, and
    # [''] for an empty line; drop those so '' is never counted as a word.
    return [token for token in tokens if token]

# Read the book one line per element. The RDD is named bookLines rather than
# `input` so the builtin input() is not shadowed for the rest of the session.
bookLines = sc.textFile("s3://thanhtt-0000-datalake/udemy/SparkCourse/book.txt")
words = bookLines.flatMap(normalizeWords)  # the text split into many rows, one per word

# (word, 1) pairs summed per word.
wordCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

# Flip (word, count) into (count, word) so sortByKey orders by frequency.
wordCountsSorted = wordCounts.map(lambda pair: (pair[1], pair[0])).sortByKey()

print("Result is commented because it's too long")
results = wordCountsSorted.collect()

# Uncomment to print every word with its count (the list is very long):
# for count, word in results:
#     if word:
#         print(word, count)


# **DataFrame: Spark SQL**

from pyspark.sql import Row

def mapper(line):
    """Convert one comma-separated line into a ``Row(ID, name, age, numFriends)``.

    Fields 0, 2 and 3 are converted to int; field 1 is kept as the name string.

    Raises:
        IndexError: if the line has fewer than four fields.
        ValueError: if one of the numeric fields is not a valid integer.
    """
    fields = line.split(',')
    # Keep the name as text. On Python 3, str() of an encoded bytes object
    # gives its repr ("b'Will'"), which would end up in the name column.
    return Row(ID=int(fields[0]), name=fields[1],
               age=int(fields[2]), numFriends=int(fields[3]))

lines = sc.textFile("s3://thanhtt-0000-datalake/udemy/SparkCourse/fakefriends.csv")
people = lines.map(mapper)  # RDD of Row objects

# Build a DataFrame from the Rows (the schema is inferred), cache it, and
# register it as the temporary view "people" so SQL can refer to it.
schemaPeople = spark.createDataFrame(people).cache()
schemaPeople.createOrReplaceTempView("people")

# Query the registered view with plain SQL; spark.sql gives back a DataFrame.
teenQuery = "SELECT * FROM people WHERE age >=13 and age <=19"
teenagers = spark.sql(teenQuery)

# Bring the matching rows to the driver and print them one by one.
teenRows = teenagers.collect()
for teen in teenRows:
    print(teen)

# The same kind of question through DataFrame methods instead of SQL:
# number of people per age, ordered by age.
ageCounts = schemaPeople.groupBy("age").count()
ageCounts.orderBy("age").show()


# **DataFrame: Inferring the schema and common functions to work with**

from pyspark.sql import functions as func

# Read the CSV using its header row for column names and let Spark infer the
# column types.
people = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("s3://thanhtt-0000-datalake/udemy/SparkCourse/fakefriends-header.csv")
)

print("here is our inferred schema")
people.printSchema()

print("display the name column")
people.select("name").show()

# NOTE(review): the predicate keeps rows with age < 21, so 21-year-olds are
# dropped as well -- confirm this matches the message.
print("filter out anyone over 21")
people.filter(people["age"] < 21).show()

print("group by age")
people.groupBy("age").count().show()

print("make everyone 10 year older")
people.select(people["name"], people["age"] + 10).show()

print("sorted")
friendByAge = people.select("age", "friends")
friendByAge.groupBy("age").avg("friends").sort("age").show()

# Average number of friends rounded to two decimals; shared by the two
# aggregations below.
roundedAvg = func.round(func.avg("friends"), 2)

print("formatted more nicely")
friendByAge.groupBy("age").agg(roundedAvg).sort("age").show()

print("with a custom column name")
friendByAge.groupBy("age").agg(roundedAvg.alias("friends_avg")).sort("age").show()


# **Word count with DataFrame (split unstructured text into a multi-row DataFrame)**

# Read each line of the book into a DataFrame (one row per line, column "value").
inputDF = spark.read.text("s3://thanhtt-0000-datalake/udemy/SparkCourse/book.txt")
inputDF.show()

# Split every line on runs of non-word characters and explode the resulting
# arrays into one row per token.
allWords = inputDF.select(func.explode(func.split(inputDF.value, "\\W+")).alias("word"))

# filter() returns a new DataFrame and leaves the one it is called on
# unchanged, so the result must be bound to a name; otherwise the empty tokens
# are still present in what gets shown.
words = allWords.filter(allWords.word != "")

words.show()