In [1]:
# Run findspark before the pyspark import in the next cell.
# NOTE(review): presumably findspark.init() locates the local Spark install
# and makes pyspark importable -- confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
# Build (or reuse) the SparkSession for this notebook: "local" master,
# application name "white-house-log".
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local")
    .appName("white-house-log")
    .getOrCreate()
)

In [3]:
# Location of the visitor-log CSV. The path is hard-coded here; nothing is
# read from sys.argv.
input_path = r"C:\Users\C940\spark3\whitehouselog.csv"

In [4]:
# Load the input file as an RDD of strings, one element per line of the file.
records = spark.sparkContext.textFile(input_path)

In [5]:
# Number of lines in the input file; the header line is still part of
# records at this point.
records.count()

970505

In [6]:
# Peek at the first line of the file, which holds the column names.
records.first()

'NAMELAST,NAMEFIRST,NAMEMID,UIN,BDGNBR,ACCESS_TYPE,TOA,POA,TOD,POD,APPT_MADE_DATE,APPT_START_DATE,APPT_END_DATE,APPT_CANCEL_DATE,Total_People,LAST_UPDATEDBY,POST,LASTENTRYDATE,TERMINAL_SUFFIX,visitee_namelast,visitee_namefirst,MEETING_LOC,MEETING_ROOM,CALLER_NAME_LAST,CALLER_NAME_FIRST,CALLER_ROOM,DESCRIPTION,Release_Date'

In [9]:
# Report how many columns the file has and list every header name with its
# zero-based index, so columns can be picked out of a split row by position.
# The header line is fetched and split once rather than running the Spark
# first() action twice.
header_fields = records.first().split(",")
print("The data has {} columns".format(len(header_fields)))
list(enumerate(header_fields))

The data has 28 columns


[(0, 'NAMELAST'),
 (1, 'NAMEFIRST'),
 (2, 'NAMEMID'),
 (3, 'UIN'),
 (4, 'BDGNBR'),
 (5, 'ACCESS_TYPE'),
 (6, 'TOA'),
 (7, 'POA'),
 (8, 'TOD'),
 (9, 'POD'),
 (10, 'APPT_MADE_DATE'),
 (11, 'APPT_START_DATE'),
 (12, 'APPT_END_DATE'),
 (13, 'APPT_CANCEL_DATE'),
 (14, 'Total_People'),
 (15, 'LAST_UPDATEDBY'),
 (16, 'POST'),
 (17, 'LASTENTRYDATE'),
 (18, 'TERMINAL_SUFFIX'),
 (19, 'visitee_namelast'),
 (20, 'visitee_namefirst'),
 (21, 'MEETING_LOC'),
 (22, 'MEETING_ROOM'),
 (23, 'CALLER_NAME_LAST'),
 (24, 'CALLER_NAME_FIRST'),
 (25, 'CALLER_ROOM'),
 (26, 'DESCRIPTION'),
 (27, 'Release_Date')]

In [10]:
# Take the first line as the header, then keep every line that is not
# identical to it: content holds the data rows only.
header = records.first()
content = records.filter(lambda line: line != header)

In [11]:
# First data row, i.e. the first line left once the header was filtered out.
content.first()

'TAJOURIBESSASSI,HANENE,,U22101,,VA,,,,,9/2/2015 0:00,10/1/2015 3:00,10/1/2015 23:59,,1,AR,WIN,9/2/2015 11:38,AR,Pelofsky,Eric,OEOB,226,ROWBERRY,ARIANA,,,1/29/2016'

## a) Top 10 most frequent visitors to the White House

In [12]:
# Keep only rows that name both parties: first drop rows whose NAMELAST
# (column 0) is empty, then rows whose visitee_namelast (column 19) is empty.
content2 = content.filter(lambda row: row.split(",")[0] != '')
content3 = content2.filter(lambda row: row.split(",")[19] != '')

In [13]:
# Rows remaining after dropping rows with an empty NAMELAST or an empty
# visitee_namelast.
content3.count()

911249

In [14]:
# Build the (key, value) pair used to count visits per visitor.
def tokenize(record):
    """Turn one visitor-log line into a ``(visitor, 1)`` pair.

    The line is parsed with the ``csv`` module so that a quoted field
    containing a comma stays a single column; a bare ``split(",")`` would
    shift every column after it.

    Args:
        record (str): One comma-separated line of the log.

    Returns:
        tuple: ``("(NAMELAST NAMEFIRST)", 1)`` with both names upper-cased.

    Raises:
        IndexError: If the line has fewer than two columns.
    """
    import csv  # local import keeps this cell self-contained

    try:
        tokens = next(csv.reader([record]), [])
    except csv.Error:
        tokens = []
    if len(tokens) < 2:
        # Quote-aware parsing did not yield the needed columns (for example
        # an unbalanced quote); use the plain comma split instead.
        tokens = record.split(",")

    last_name = tokens[0].upper()
    first_name = tokens[1].upper()
    output_key = "(" + last_name + " " + first_name + ")"
    return (output_key, 1)

In [15]:
# Turn every kept row of content3 into a (visitor, 1) pair.
pairs = content3.map(tokenize)

In [16]:
# Add up the 1s per visitor key: the result maps each visitor name to the
# number of rows it appears in.
pairs_count = pairs.reduceByKey(lambda left, right: left + right)

In [17]:
# Ten visitors with the highest counts; negating the count makes the
# ascending takeOrdered return the largest first.
x = pairs_count.takeOrdered(10, key=lambda pair: -pair[1])

In [18]:
# Print a banner and a column legend, then display the top-10 visitor list.
# The banner previously misspelled "visitors".
print("**    Top 10 visitors   **")
print("< Visitor > , < Frequency > :")
x

**    Top 10 vistors   **
< Visitor > , < Frequency > :


[('(KIDWELL LAUREN)', 222),
 ('(THOMAS BENJAMIN)', 196),
 ('(HARO STEVEN)', 183),
 ('(BERNER KATHERINE)', 177),
 ('(GRANT PATRICK)', 155),
 ('(HAAS JORDAN)', 152),
 ('(GARZA STEVEN)', 127),
 ('(MARTIN KATHRYN)', 122),
 ('(COHEN MANDY)', 122),
 ('(BROWN JENNIFER)', 117)]

## b) Top 10 most frequently visited people (visitees) at the White House

In [19]:
# Build the (key, value) pair used to count visits per visitee.
def tokenize2(record):
    """Turn one visitor-log line into a ``(visitee, 1)`` pair.

    The line is parsed with the ``csv`` module so that a quoted field
    containing a comma stays a single column; a bare ``split(",")`` would
    shift every column after it, including the visitee columns.

    Args:
        record (str): One comma-separated line of the log.

    Returns:
        tuple: ``("(VISITEE_NAMELAST VISITEE_NAMEFIRST)", 1)`` built from
        columns 19 and 20, both upper-cased.

    Raises:
        IndexError: If the line has fewer than 21 columns.
    """
    import csv  # local import keeps this cell self-contained

    try:
        tokens = next(csv.reader([record]), [])
    except csv.Error:
        tokens = []
    if len(tokens) < 21:
        # Quote-aware parsing did not yield the needed columns (for example
        # an unbalanced quote); use the plain comma split instead.
        tokens = record.split(",")

    last_name = tokens[19].upper()
    first_name = tokens[20].upper()
    output_key = "(" + last_name + " " + first_name + ")"
    return (output_key, 1)

In [20]:
# Turn every kept row of content3 into a (visitee, 1) pair.
pairs_visitee = content3.map(tokenize2)

In [21]:
# Add up the 1s per visitee key: the result maps each visitee name to the
# number of rows it appears in.
pairs_count_visitee = pairs_visitee.reduceByKey(lambda left, right: left + right)

In [22]:
# Ten visitees with the highest counts; negating the count makes the
# ascending takeOrdered return the largest first.
y = pairs_count_visitee.takeOrdered(10, key=lambda pair: -pair[1])

In [23]:
# Print a banner and a column legend, then display the top-10 visitee list
# as the cell's result.
print("**    Top 10 visitee   **")
print("< Visitee > , < Frequency > :")
y

**    Top 10 visitee   **
< Visitee > , < Frequency > :


[('(OFFICE VISITORS)', 430881),
 ('(WAVES VISITORSOFFICE)', 44129),
 ('(BRYANT RUTH)', 13970),
 ('(ONEIL OLIVIA)', 13155),
 ('(THOMPSON JARED)', 11618),
 ('(/ POTUS)', 10900),
 ('(BURTON COLLIN)', 9672),
 ('(MEGAN MATTHEW)', 7944),
 ('(MAYERSON ASHER)', 6886),
 ('(DESSOURCES KALISHA)', 5289)]

## c) Top 10 most frequent visitor-visitee combinations at the White House

In [24]:
# Build the (key, value) pair used to count visits per visitor-visitee combination.
def tokenize3(record):
    """Turn one visitor-log line into a ``(visitor - visitee, 1)`` pair.

    The line is parsed with the ``csv`` module so that a quoted field
    containing a comma stays a single column; a bare ``split(",")`` would
    shift every column after it, including the visitee columns.

    Args:
        record (str): One comma-separated line of the log.

    Returns:
        tuple: ``("(NAMELAST NAMEFIRST  -  VISITEE_LAST VISITEE_FIRST)", 1)``
        built from columns 0, 1, 19 and 20, all upper-cased.

    Raises:
        IndexError: If the line has fewer than 21 columns.
    """
    import csv  # local import keeps this cell self-contained

    try:
        tokens = next(csv.reader([record]), [])
    except csv.Error:
        tokens = []
    if len(tokens) < 21:
        # Quote-aware parsing did not yield the needed columns (for example
        # an unbalanced quote); use the plain comma split instead.
        tokens = record.split(",")

    visitor = tokens[0].upper() + " " + tokens[1].upper()
    visitee = tokens[19].upper() + " " + tokens[20].upper()
    output_key = "(" + visitor + "  -  " + visitee + ")"
    return (output_key, 1)

In [25]:
# Turn every kept row of content3 into a (visitor - visitee, 1) pair.
pairs_visitor_visitee = content3.map(tokenize3)

In [26]:
# Add up the 1s per visitor-visitee key: the result maps each combination to
# the number of rows it appears in.
pairs_count_vistor_visitee = pairs_visitor_visitee.reduceByKey(lambda left, right: left + right)

In [27]:
# Ten visitor-visitee combinations with the highest counts; negating the
# count makes the ascending takeOrdered return the largest first.
z = pairs_count_vistor_visitee.takeOrdered(10, key=lambda pair: -pair[1])

In [28]:
# Print a banner and a column legend, then display the top-10
# visitor-visitee list as the cell's result.
print("**  Top 10 visitor-visitee combinations   **")
print("    < Visitor - Visitee > , < Frequency > :" )
z

**  Top 10 visitor-visitee combinations   **
    < Visitor - Visitee > , < Frequency > :


[('(KIDWELL LAUREN  -  YUDELSON ALEX)', 103),
 ('(HAAS JORDAN  -  YUDELSON ALEX)', 90),
 ('(GRANT PATRICK  -  YUDELSON ALEX)', 89),
 ('(THOMAS BENJAMIN  -  YUDELSON ALEX)', 89),
 ('(HARO STEVEN  -  YUDELSON ALEX)', 84),
 ('(COHEN MANDY  -  LAMBREW JEANNE)', 84),
 ('(BERNER KATHERINE  -  YUDELSON ALEX)', 82),
 ('(ROCHE SHANNON  -  YUDELSON ALEX)', 70),
 ('(URIZAR JENNIFER  -  JOHNSON KATIE)', 68),
 ('(MARTIN KATHRYN  -  LAMBREW JEANNE)', 61)]

## d) Number of records dropped from the data

In [1]:
# Rows removed by the filtering: every line of the file (records still
# contains the header line) minus the rows kept in content3. The header line
# is therefore counted as one of the dropped rows.
a = records.count() - content3.count()

NameError: name 'records' is not defined

In [30]:
# Display the number of dropped rows computed in the previous cell.
a

59256

## Stop the Spark session

In [31]:
# Shut down the SparkSession now that the analysis is finished.
spark.stop()