# RDD Review
We are reviewing some of the basic techniques for manipulating an RDD dataset.

## Word Count Revisit
### Read in a text file

In [None]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession, Row


In [None]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("rdd-app")
    .config("spark.config.option", "value")
    .getOrCreate()
)
# NOTE(review): `scfg` is not referenced by any other cell visible in this notebook.
scfg = SparkConf().setAppName('rdd-app')
# The SparkContext is the entry point for the RDD API used below.
sc = spark.sparkContext

In [None]:
import string

# Path of the Hamlet text used for the word-count example.
text_file = '/user/student/shakespeare/tragedy/hamlet.txt'
# Load the file as an RDD whose elements are the lines of the text.
text = sc.textFile(text_file)

In [None]:
# Bring every line of the RDD back to the driver so the notebook can display it.
text.collect()

### Supporting functions

In [None]:
def strip_punc(s):
    """Remove ASCII punctuation from ``s`` and split the result on single spaces.

    Splitting uses ``' '`` as an explicit separator, so consecutive spaces
    (and an empty input) produce empty-string tokens in the returned list.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = s.translate(punctuation_table)
    return cleaned.split(' ')

def search_word_in_line(word):
    """Print every line of ``text`` that contains ``word`` as a whole token.

    Each match is printed as ``<line number>. <line>``. Line numbers start
    at 1 and count every line of ``text``, whether or not it matches.
    """
    for line_no, line in enumerate(text.collect(), start=1):
        tokens = strip_punc(line)
        if word in tokens:
            print('{}. {}'.format(line_no, line))

### Split each line into tokens separated by a space (' ') after removing punctuation

In [None]:
# Step 1: strip punctuation from each line, split it on single spaces, and
# flatten the per-line token lists into one RDD of tokens.
flatmap = text.flatMap(lambda line: line.translate(str.maketrans('', '', string.punctuation)).split(' '))
# Step 2: pair every token with an initial count of 1.
# Named `word_pairs` rather than `map` so the builtin `map` is not shadowed
# for the rest of the notebook session.
word_pairs = flatmap.map(lambda word: (word, 1))
# Step 3: add up the counts of identical tokens, giving one (token, count) pair per token.
reduced = word_pairs.reduceByKey(lambda a, b: a + b)

In [None]:
# Materialise the (token, count) pairs on the driver for display.
reduced.collect()

### Making it into a single statement

In [None]:
# The same word count written as one chained pipeline:
# tokenise each line, pair each token with 1, then sum the 1s per token.
_punctuation_table = str.maketrans('', '', string.punctuation)
counts = (
    text.flatMap(lambda line: line.translate(_punctuation_table).split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

### Run the search

In [None]:
word = "purpose"
# `reduced` holds one (token, count) pair per token, so the count can be
# fetched by key. This avoids collecting every pair to the driver and
# re-parsing each tuple's string form to recover the key and the count.
occurrences = reduced.lookup(word)
if occurrences:
    print('Found \'{}\' occurs \'{}\' times'.format(word, occurrences[0]))
    # List the numbered lines of the text that contain the word.
    search_word_in_line(word)

## Manipulating airline performance data

### Creating an RDD with one row.

In [None]:
# Build a single airport record, then wrap it in a one-element RDD.
thigpen_row = Row(
    iata="00M",
    airport="Thigpen ",
    city="Bay Springs",
    state="MS",
    country="USA",
    lat=31.95376472,
    long=-89.23450472,
)
airport = sc.parallelize([thigpen_row])
print(airport.count())
# take(3) asks for up to three elements; the RDD was built from a single row.
print(airport.take(3))
print(airport.collect())

### Converting an RDD to a DataFrame (DF)

In [None]:

from pyspark.sql.types import Row
from datetime import datetime

# Convert the one-row RDD of Row objects into a DataFrame.
airport_df = airport.toDF()
# Print the DataFrame contents as a table.
airport_df.show()
# Bare expression: as the last line of a notebook cell it displays the DataFrame object itself.
airport_df

### More complex dataset

In [None]:

# One-row RDD whose Row mixes float, string, boolean, integer and list fields.
# NOTE(review): the name `complex` shadows the Python builtin of the same
# name; it is kept because a later cell refers to it.
complex = sc.parallelize(
    [
        Row(
            col_float=3.1415,
            col_string='da pi',
            col_boolean=True,
            col_integer=201,
            col_list=[1, 2, 3, 4],
        )
    ]
)
complex.collect()

### Converting to DF

In [None]:
# Convert the mixed-type RDD into a DataFrame and print it as a table.
complex_df = complex.toDF()
complex_df.show()

### More complex data type

In [None]:
# Three rows, each carrying a list, a dict, a nested Row and a timestamp.
real_complex = sc.parallelize([
    Row(
        col_list=[1, 2, 3],
        col_dict={"pi": 3.1415},
        col_row=Row(number=3, fraction=1415),
        col_time=datetime(2019, 7, 22, 5, 51, 0),
    ),
    Row(
        col_list=[3, 4, 5],
        col_dict={"sqrt2": 1.4142},
        col_row=Row(number=1, fraction=4142),
        col_time=datetime(2019, 7, 22, 5, 54, 0),
    ),
    Row(
        col_list=[6, 7, 9, 10],
        col_dict={"sqrt3": 1.73205},
        col_row=Row(number=1, fraction=73205),
        col_time=datetime(2019, 7, 22, 5, 55, 0),
    ),
])
# The raw collected rows are a little hard to read.
real_complex.collect()

In [None]:
# Convert the nested-type RDD into a DataFrame and print it as a table.
real_complex_df = real_complex.toDF()
real_complex_df.show()

**It is much easier to view the data structure now**

## Airline Performance data
Loading data from HDFS

In [None]:
# Path of the 1987 airline performance CSV file.
data_by_year = '/user/student/airline/1987.csv'
# Read the CSV into a DataFrame, treating the first line as the header.
airline_performance = spark.read.option("header", "true").csv(data_by_year)

In [None]:
# Print the first rows of the airline performance DataFrame as a table.
airline_performance.show()

### Loading airport table

In [None]:

# Path of the airports lookup CSV file.
airports_file = '/user/student/airline/airports.csv'
# Read the CSV into a DataFrame, treating the first line as the header.
airports = spark.read.option("header", "true").csv(airports_file)
# Print the first rows as a table.
airports.show()

### `airports` is a DataFrame (DF)

In [None]:
# Bare expression: the notebook displays the DataFrame object itself, not its rows.
airports

In [None]:
# Number of rows in the airports DataFrame.
airports.count()

In [None]:
# Return every row of the DataFrame to the driver as a list of Row objects.
airports.collect()

In [None]:
# Return the first five rows as a list of Row objects.
airports.take(5)

In [None]:
# Return only the first row.
airports.first()

In [None]:
# Return the first five rows as a list of Row objects.
airports.head(5)

In [None]:
# Accessing rows
# Index into the collected list to get the third row.
# Note that collect() brings all rows to the driver just to read one of them.
airports.collect()[2]

In [None]:
# Read one field of the third row by column name.
airports.collect()[2]['state']

In [None]:
# Read the fourth field of the third row by its position in the row.
airports.collect()[2][3]

In [None]:
# Drop down to the underlying RDD and turn each Row into a plain tuple of all seven fields.
airport_rdd = airports.rdd.map(lambda x: (x.iata, x.airport, x.city, x.state, x.country, x.lat, x.long))
airport_rdd.collect()

In [None]:
# More selective
airport_rdd = airports.rdd.map(lambda x: (x.iata, x.airport))
airport_rdd.collect()

In [None]:
# default with col names.
airport_rdd = airports.rdd
airport_rdd.collect()

In [None]:
# Print summary statistics for the `lat` column.
airports.describe(['lat']).show()