# 4. Create a Spark RDD from an external source file


### a. Identify an external data source to create the RDD from (e.g. txt, csv, json)

In [14]:
import findspark
# Presumably locates the local Spark installation and makes `pyspark` importable
# for the cells that follow — confirm against the findspark documentation.
findspark.init()

In [15]:
import pyspark

In [16]:
from pyspark.sql import SparkSession

In [32]:
# Start (or reuse) the Spark session for this notebook, with the app name 'Emp'.
spark = (
    SparkSession.builder
    .appName('Emp')
    .getOrCreate()
)

In [33]:
# Read the employee CSV (first line treated as the header, column types inferred)
# and take the RDD behind the resulting DataFrame.
MY = (
    spark.read
    .csv("Employee.csv", header=True, inferSchema=True)
    .rdd
)
# Last expression of the cell: display the Python type of the result.
type(MY)

pyspark.rdd.RDD

In [26]:
# Pull every row of the RDD back to the driver as a list. Fine for this tiny
# file; avoid on large datasets.
# NOTE(review): the saved output shows _c0.._c3 column names and the header as a
# data row, which does not match the header=True read above (execution counts
# are out of order) — looks stale; re-run with Restart & Run All.
MY.collect()

[Row(_c0='First_Name', _c1='Last_Name', _c2='Gender', _c3='Salary'),
 Row(_c0='James', _c1='Smith', _c2='M', _c3='30'),
 Row(_c0='Anna', _c1='Rose', _c2='F', _c3='41'),
 Row(_c0='Robert', _c1='Williams', _c2='M', _c3='62'),
 Row(_c0='Suzan', _c1='Khan', _c2='F', _c3='31'),
 Row(_c0='John', _c1='Abram', _c2='M', _c3='38'),
 Row(_c0='Sunny', _c1='Nene', _c2='F', _c3='40')]

In [27]:
# Number of rows in the RDD.
# NOTE(review): the saved value 7 appears to include the header line; with
# header=True the count is presumably 6 — confirm by re-running.
MY.count()

7

### b. View the RDD, limited to the top 5 rows

In [29]:
# Fetch only the first 5 rows to the driver instead of the whole RDD.
MY.take(5)

[Row(_c0='First_Name', _c1='Last_Name', _c2='Gender', _c3='Salary'),
 Row(_c0='James', _c1='Smith', _c2='M', _c3='30'),
 Row(_c0='Anna', _c1='Rose', _c2='F', _c3='41'),
 Row(_c0='Robert', _c1='Williams', _c2='M', _c3='62'),
 Row(_c0='Suzan', _c1='Khan', _c2='F', _c3='31')]

In [30]:
# Row count of the full RDD (take(5) above does not modify MY).
# NOTE(review): duplicate of the earlier count cell — consider removing one.
MY.count()

7

### c. Convert RDD to Data Frame

In [34]:
# Convert the RDD of Rows into a DataFrame.
# NOTE: this rebinds `MY` from an RDD to a DataFrame; the cells below
# (type(MY), MY.show(5)) rely on that rebinding.
MY = MY.toDF()
# Print the DataFrame as a text table.
MY.show()

+----------+---------+------+------+
|First_Name|Last_Name|Gender|Salary|
+----------+---------+------+------+
|     James|    Smith|     M|    30|
|      Anna|     Rose|     F|    41|
|    Robert| Williams|     M|    62|
|     Suzan|     Khan|     F|    31|
|      John|    Abram|     M|    38|
|     Sunny|     Nene|     F|    40|
+----------+---------+------+------+



In [36]:
# Confirm that MY is now a DataFrame rather than an RDD.
type(MY)

pyspark.sql.dataframe.DataFrame

### d. Show Data Frame top 5 rows

In [37]:
# Print only the first 5 rows of the DataFrame.
MY.show(5)

+----------+---------+------+------+
|First_Name|Last_Name|Gender|Salary|
+----------+---------+------+------+
|     James|    Smith|     M|    30|
|      Anna|     Rose|     F|    41|
|    Robert| Williams|     M|    62|
|     Suzan|     Khan|     F|    31|
|      John|    Abram|     M|    38|
+----------+---------+------+------+
only showing top 5 rows



### e. Continue with a text file from the point above
#### a. Take a new data source (a txt file) and read it into an RDD

In [38]:
from pyspark.sql import SparkSession

# NOTE(review): a session was already created earlier in this notebook;
# getOrCreate() presumably returns that existing session, so the "test" app
# name may not take effect — confirm against the PySpark docs.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

# SparkContext of the session; used below for the RDD API (textFile, parallelize).
sc = spark.sparkContext

In [41]:
# Read the text file into an RDD with one string element per line.
# NOTE: despite the name `df`, this is an RDD (created via the SparkContext),
# not a DataFrame.
df = sc.textFile("test.txt")

In [44]:
# Preview the first 5 lines of the text file.
df.take(5)

['First line: This is a test file',
 'This is second line',
 'third line',
 'and subsiquent lines..',
 'and more..']

### f. Count repeated words in the text file

In [45]:
# Word count: split every line into words, pair each word with 1, then sum the
# 1s per word.
# split() with no argument splits on any run of whitespace and never yields
# empty strings, so blank lines or doubled spaces no longer produce a bogus ''
# "word" (which split(" ") would count).
# Punctuation is not stripped: "line:" and "line" remain distinct keys.
counts = (
    df.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
)

In [46]:
# Bring the (word, count) pairs back to the driver and print one per line.
output = counts.collect()
for pair in output:
    # Each element is a (word, count) tuple, so it can feed the format string directly.
    print("%s: %i" % pair)

is: 2
test: 1
line: 2
third: 1
subsiquent: 1
First: 1
line:: 1
This: 2
a: 1
file: 1
second: 1
and: 2
lines..: 1
more..: 1


### g. Show the lines in the txt file that contain a specific word

In [47]:
# Ask for a search term and print every line of test.txt that contains it.
user_input = input("enter the word: ")

# Open the file through a context manager so the handle is closed as soon as
# the loop finishes (the bare open() left it open for the life of the kernel).
with open("test.txt", "r") as file:
    for line in file:
        # Substring match: "line" also matches "lines..".
        if user_input in line:
            # `line` keeps its trailing newline, so print() leaves a blank
            # line after each match.
            print(line)

enter the word: line
First line: This is a test file

This is second line

third line

and subsiquent lines..



### h. Use filter and show result.

The `filter()` function keeps only the rows of an RDD/DataFrame that satisfy the given condition.

In [None]:
# Keep only the lines that contain the substring "title".
# The predicate must return a boolean per line; the previous lambda returned the
# constant string "title", which is always truthy, so no line was filtered out.
# NOTE(review): hard-coded absolute path — consider a path relative to the notebook.
load_data = sc.textFile("E://long_sample.txt").filter(lambda x: "title" in x)

In [49]:
# Build a small RDD of integers from a Python list.
rdd_1 = sc.parallelize([78, 2, 3, 4, 6])

# Keep only the even values and return them to the driver as a list.
(
    rdd_1
    .filter(lambda value: value % 2 == 0)
    .collect()
)

[78, 2, 4, 6]