In [None]:
# Take-Home Coding Assessment

# Task 1: Description: Write a program that reads a file and finds matches against a predefined set of words. There can be up to 10K entries in the list of predefined words.
# Requirement details:
# Input file is a plain text (ascii) file, every record separated by a new line.
# For this exercise, assume English words only
# The file size can be up to 20 MB
# The predefined words are defined in a text file, every word separated by a newline. Use a sample file of your choice for the set of predefined keywords for the exercise.
# Thanks,

In [59]:
# With basic python operations

import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed time.
startTime = time.perf_counter()

# Reading the predefined words into a set: membership checks are O(1) on average and,
# unlike a dict with dummy values, only the words themselves are stored.
# For a system holding predefined data in GB/TB, a distributed cache could serve this purpose.
with open('predefined_words.txt', 'r') as predefined_fh:
    # Iterating the file object streams it line by line instead of materialising it with readlines().
    predefinedWords = {line.strip() for line in predefined_fh}

# Reading the input data and matching each record (one word per line) against the predefined set.
# The input is streamed line by line, so only the matched words are kept in memory; for much
# larger data sets the matches could additionally be written out in batches.
matched_words = []
with open('input.txt', 'r') as input_fh:
    for line in input_fh:
        word = line.strip()
        if word in predefinedWords:
            matched_words.append(word)

matched_word_cnt = len(matched_words)
# Build the output text once instead of growing a string with += inside the loop.
matched_words_str = ''.join(word + '\n' for word in matched_words)

# Writing the output data to the file (the with-block closes it; no explicit close() is needed).
with open('output.txt', 'w') as outputFile:
    outputFile.write(matched_words_str)

endTime = time.perf_counter()
print('Exec Time:', endTime-startTime)
print('Matched words:', matched_word_cnt)

Exec Time: 0.42473292350769043
Matched words: 27044


In [58]:
pip install pyspark


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [69]:
# With Spark

from pyspark.sql import SparkSession
import time

# creating a spark session in local
# Creating a Spark session in local mode. The timer starts before the session is built,
# so the reported time includes Spark start-up.
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed time.
startTime = time.perf_counter()
predefinedFile = 'predefined_words.txt'
inputFile = "input.txt"
spark = SparkSession.builder.master('local[*]').appName("Illumio-Assessemnt").getOrCreate()

try:
    # Reading both files with Spark; each line becomes one row in a single `value` column.
    inputData = spark.read.text(inputFile)
    predefinedData = spark.read.text(predefinedFile)

    # Renaming and selecting the word data.
    # NOTE(review): unlike the plain-Python cell, values are not whitespace-stripped here.
    input_data_table = inputData.select(inputData.value.alias("inputWord"))
    predefined_data_table = predefinedData.select(predefinedData.value.alias("predefinedWord"))

    # Left-semi join: keeps each input row at most once when it has a match in the predefined
    # data, so duplicate predefined words cannot multiply the result (an inner join would).
    # Only the left-hand column (inputWord) is returned. The result is cached because it is
    # consumed twice below (write + count).
    matchedWords = input_data_table.join(
        predefined_data_table,
        input_data_table.inputWord == predefined_data_table.predefinedWord,
        'left_semi',
    ).cache()

    # Writing back to the output directory. 'overwrite' makes the cell re-runnable; the default
    # save mode raises an error once 'output-spark' already exists.
    matchedWords.write.mode('overwrite').text('output-spark')

    endTime = time.perf_counter()
    print('Exec Time:', endTime-startTime)
    print('Matched words:', matchedWords.count())
finally:
    # Always release the local Spark session, even if reading, joining or writing fails.
    spark.stop()

Exec Time: 1.1681132316589355
Matched words: 27044
