In [1]:
from pyspark import  SparkContext
import re
import requests

In [2]:
def removePunctuation_tokenize(text):
    """Strip punctuation, lower-case the text and split it into word tokens.

    Note:
        Only letters (A-Z, a-z), digits and whitespace are kept; every other
        character is deleted rather than replaced by a space, so "it's"
        becomes "its".  Leading and trailing whitespace is dropped after the
        punctuation has been removed.

    Args:
        text (str): A string, e.g. one line of a book.

    Returns:
        list: The lower-cased tokens in their original order.  The list is
        empty when nothing but whitespace is left after cleaning.
    """
    # Delete everything that is not a letter, digit or whitespace character.
    without_punctuation = re.sub(r'[^A-Za-z\s\d]', r'', text)
    normalized = without_punctuation.strip().lower()
    # split() with no argument splits on any run of whitespace.
    return normalized.split()


  

  

In [3]:
removePunctuation_tokenize("   I lobe's tijasj  kkk!!")  ## sanity check of removePunctuation_tokenize: the apostrophe and "!!" are dropped, giving the tokens i, lobes, tijasj, kkk

In [4]:
## Download the three partial books and the query text from S3 and load each of
## them into an RDD of lines.  The query consists of passages taken from the book
## RiderHaggard_SheandAllan with alterations (skipped words and typos).
r1 = requests.get('https://s3-us-west-1.amazonaws.com/datasw/GeorgeJames_WhiteRace.txt')
r2 = requests.get('https://s3-us-west-1.amazonaws.com/datasw/RiderHaggard_SheandAllan.txt')
r3 = requests.get('https://s3-us-west-1.amazonaws.com/datasw/StanleyMatthews_DoubleTrouble.txt')
q = requests.get('https://s3-us-west-1.amazonaws.com/datasw/Query.txt')

## Fail fast on an HTTP error (e.g. 403/404) instead of silently tokenizing an
## error page as if it were book text.
r1.raise_for_status()
r2.raise_for_status()
r3.raise_for_status()
q.raise_for_status()

## Materialize every response as a list of lines.
Book_GJ = list(r1.iter_lines())
Book_RH = list(r2.iter_lines())
Book_SM = list(r3.iter_lines())
query = list(q.iter_lines())

## NOTE(review): `sc` is not created anywhere in this notebook; it is presumably
## the SparkContext provided by the environment (pyspark shell / hosted
## notebook) -- confirm before running on a fresh kernel.
BookGJ_RDD = sc.parallelize(Book_GJ)
BookRH_RDD = sc.parallelize(Book_RH)
BookSM_RDD = sc.parallelize(Book_SM)
Q_RDD = sc.parallelize(query)

In [5]:
## Peek at the first five raw lines of each text.
## NOTE(review): a notebook cell displays only the value of its last expression,
## so only the Q_RDD sample is shown; the first three results are computed and
## then discarded.
BookGJ_RDD.take(5)
BookRH_RDD.take(5)
BookSM_RDD.take(5)
Q_RDD.take(5)


In [6]:
## Tokenize every line of each book RDD and of the query RDD with
## removePunctuation_tokenize.  flatMap concatenates the per-line token lists,
## so each result is a single RDD of words.
(BookGJ_RDD_transformed,
 BookRH_RDD_transformed,
 BookSM_RDD_transformed,
 Q_RDD_transformed) = (
    line_rdd.flatMap(removePunctuation_tokenize)
    for line_rdd in (BookGJ_RDD, BookRH_RDD, BookSM_RDD, Q_RDD)
)

In [7]:
## Count the tokens in each transformed RDD.
## NOTE(review): a notebook cell displays only the value of its last expression,
## so only the query token count is shown; the three book counts are computed
## and then discarded.
BookGJ_RDD_transformed.count()
BookRH_RDD_transformed.count()
BookSM_RDD_transformed.count()
Q_RDD_transformed.count()

In [8]:

## Peek at the first ten tokens of each transformed RDD.
## NOTE(review): a notebook cell displays only the value of its last expression,
## so only the query tokens are shown here.
BookGJ_RDD_transformed.take(10)
BookRH_RDD_transformed.take(10)
BookSM_RDD_transformed.take(10)
Q_RDD_transformed.take(10)

In [9]:
## Now that both the book RDDs and the query RDD are ready, we need to build n-grams for each of them in order to trace which book the query is taken from.  The query contains noise, so the idea here is to find the book source that has the maximum count of n-grams matching the query.  I used trigrams here.

## Small sample token list used to demonstrate find_trigrams.
input_list = ['all', 'this', 'happened', 'more', 'or', 'less']

def find_trigrams(input_list):
  """Return every run of three consecutive items of input_list.

  Args:
      input_list (list): An ordered, sliceable sequence of tokens.

  Returns:
      list: One (first, second, third) tuple per position, in order.  The
      list is empty when input_list has fewer than three items.
  """
  # Zipping the list with itself shifted by one and by two pairs every token
  # with its two successors.  The result is wrapped in list() because on
  # Python 3 zip() yields a one-shot iterator, which would be exhausted by the
  # first membership test run against it; on Python 2 the result is unchanged.
  return list(zip(input_list, input_list[1:], input_list[2:]))

## Show the trigrams of the sample token list.
find_trigrams(input_list)

In [10]:
BookGJ_trigram=find_trigrams([x for x in BookGJ_RDD_transformed.toLocalIterator()])
BookRH_trigram=find_trigrams([x for x in BookRH_RDD_transformed.toLocalIterator()])
BookSM_trigram=find_trigrams([x for x in BookSM_RDD_transformed.toLocalIterator()])
Q_trigram=find_trigrams([x for x in Q_RDD_transformed.toLocalIterator()])
#print BookGJ_trigram
print Q_trigram



In [11]:
### now find the overlap of the ngram between query and the book sources

def Overlap_count(query,book):
  """Count how many items of query also occur in book.

  Every item of query is counted on its own, so an n-gram that is repeated in
  the query and present in the book contributes once per repetition.

  Args:
      query (iterable): N-grams (e.g. tuples of tokens) of the query text.
      book (iterable): N-grams of the candidate source text.

  Returns:
      int: Number of items of query that are found in book.
  """
  # Materialize both inputs first so that one-shot iterators are read exactly
  # once and both code paths below see the complete data.
  query_items = list(query)
  book_items = list(book)
  try:
    # Hash-based lookup keeps each membership test O(1) instead of scanning
    # the whole book for every query n-gram.
    book_lookup = frozenset(book_items)
    return sum(1 for item in query_items if item in book_lookup)
  except TypeError:
    # Unhashable items (e.g. lists) cannot be put in or looked up in a set;
    # fall back to the linear scan, which only needs equality.
    return sum(1 for item in query_items if item in book_items)

In [12]:
print 'Query Trigrams found \n in Book_GJ is : %d \n in Book_SM is %d \n in Book_RH is %d'%(Overlap_count(Q_trigram,BookGJ_trigram),Overlap_count(Q_trigram,BookSM_trigram),Overlap_count(Q_trigram,BookRH_trigram))


## we can see it makes correct prediction, the query is taken from book RH(RiderHaggard_SheandAllan) with alterations, including words skip and words typos

In [13]:
### Now we start working on a separate task: predicting the author of a query that has not been seen before.  We read in a new query text written by the author Rider Haggard but not included in the book RH (RiderHaggard_SheandAllan) that we read in previously.
