# Topic modelling with n-Grams

In [2]:
import re 
from operator import add


def stripFinalS( word ):
    word = word.lower() # lower case
    if len(word) >0 and word[-1] == 's': # check for final letter
        return word[:-1]
    else:
        return word
    
def splitFileWords(filenameContent):
    f,c = filenameContent # split the input tuple  
    fwLst = [] # the new list for (filename,word) tuples
    wLst = re.split('\W+',c) # <<< now create a word list wLst
    for w in wLst: # iterate through the list
        fwLst.append((f,stripFinalS(w))) # <<< and append (f,w) to the 
    return fwLst #return a list of (f,w) tuples 

def hashing_vectorizer(word_count_list, N):
     v = [0] * N  # create fixed size vector of 0s
     for word_count in word_count_list:          word,count = word_count 	# unpack tuple
         h = hash(word) # get hash value
         v[h % N] = v[h % N] + count # add count
     return v # return hashed word vector

def reGrpLst(fw_c): # we get a nested tuple
    fw,c = fw_c
    f,w = fw
    return (f,[(w,c)]) # return (f,[(w,c)]) structure. 

N = 10

#dirPath = 'hdfs://saltdean/data/library/'
dirPath = '../../Data/library/'
ft_RDD = sc.wholeTextFiles(dirPath) #<<< add code to create an RDD with wholeTextFiles


## 1) Create N-grams and n-gram frequency vectors

In [3]:

# Using a nested list comprehension to create n-grams given a string.
def splitNGrams(text,n): # function for splitting a word list and creating n-grams
    nGramLst = [] # the new list for (filename,word) tuples
    wLst = re.split('\W+', text) # now create a word list wLst
    wLst = list(map(stripFinalS,wLst)) # remove final s from the word list (this is a local map, don't confuse with RDD or DF map)
    wNum = len(wLst) # get total lenght to avoid overrunning at the end.
    nGramLst = [' '.join(wLst[i:j]) for i in range(0,wNum) for j in range(i+1,min(wNum,i+n+1))]
    return nGramLst #return a list of (f,w) tuples 

# Alternative version with separate function for converting a word-list to n-grams
def lst2ngram(wLst,n):
    wNum = len(wLst) # get total lenght to avoid overrunning at the end.
    nGramLst = [] # output list
    for i in range(0,wNum): # starting points 
        for j in range(i+1,min(wNum,i+n+1)): # end points of n-grams 
              nGramLst.append(' '.join(wLst[i:j])) # append to list
    return nGramLst # done

# a wrapper around the separate function
def splitNGrams2(text,n): 
    wLst = re.split('\W+', text) #  split into words
    nGramLst = lst2ngram(wLst,n) # create the n-grams
    return nGramLst # done

# This function manages the filenames around the n-gram extraction  
def splitFileNGrams(filenameContent,n=2): 
    f,c = filenameContent # split the input tuple 
    ngLst = splitNGrams(c,n) # split the file content into n-grams
    fngLst = [] # the new list for (filename,n-gram) tuples 
    for ng in ngLst: # iterate through the list
        fngLst.append((f,ng)) # and append (f,ng) to the 
    return fngLst #return a list of (f,ng) tuples 

# just for testing
print(splitNGrams('a b c d e f g', 3)) # test the splitting function with s string (easier than with an RDD or DF)
print(splitNGrams2('a b c d e f g', 3))# test the 2nd version, should look like the first
print(splitFileNGrams(('file','a b c d e f g'), 3)) # should add the file tag before the n-grams

# now let's use the new function to create RDDs with n-gram vectors
from functools import partial
# use a partial to define the max len of the n-grams
fng_RDD = ft_RDD.flatMap(partial(splitFileNGrams,n=2)) 
fng_RDD.take(5)
fng_1_RDD = fng_RDD.map(lambda x: (x,1))  # change (f,ng) to ((f,ng),1)
fng_c_RDD = fng_1_RDD.reduceByKey(add) # add the ones
f_ngcL_RDD = fng_c_RDD.map(reGrpLst) # regroup to (f,[(ng,c)])
f_ngcL2_RDD = f_ngcL_RDD.reduceByKey(add) #<<< create [(w,c), ... ,(w,c)] lists per file
f_ngVec_RDD = f_ngcL2_RDD.map(lambda f_wc: (f_wc[0],hashing_vectorizer(f_wc[1],N)))
f_ngVec_RDD.take(3)

['a', 'a b', 'a b c', 'b', 'b c', 'b c d', 'c', 'c d', 'c d e', 'd', 'd e', 'd e f', 'e', 'e f', 'f']
['a', 'a b', 'a b c', 'b', 'b c', 'b c d', 'c', 'c d', 'c d e', 'd', 'd e', 'd e f', 'e', 'e f', 'f']
[('file', 'a'), ('file', 'a b'), ('file', 'a b c'), ('file', 'b'), ('file', 'b c'), ('file', 'b c d'), ('file', 'c'), ('file', 'c d'), ('file', 'c d e'), ('file', 'd'), ('file', 'd e'), ('file', 'd e f'), ('file', 'e'), ('file', 'e f'), ('file', 'f')]


[('file:/Users/tweyde/Documents/CityUni/teaching/2017-18/Big Data/Data/library/emma.txt',
  [40842, 36770, 26182, 32390, 31629, 31362, 29476, 42068, 23837, 35695]),
 ('file:/Users/tweyde/Documents/CityUni/teaching/2017-18/Big Data/Data/library/henry_V.txt',
  [6283, 6095, 5555, 5891, 6033, 6137, 5027, 7704, 4921, 6455]),
 ('file:/Users/tweyde/Documents/CityUni/teaching/2017-18/Big Data/Data/library/king_lear.txt',
  [7083, 6317, 5537, 6536, 6413, 5646, 5243, 7217, 5062, 5831])]

## 2) Convert to DataFrame

The next task is create a DataFrame from the RDD. This is similar to what was shown in the lecture and also to the docuementation: [http://spark.apache.org/docs/2.0.0/sql-programming-guide.html#interoperating-with-rdds](http://spark.apache.org/docs/2.0.0/sql-programming-guide.html#interoperating-with-rdds)  


In [6]:
austen = ['senseandsensibility.txt','mansfield_park.txt','emma.txt','persuasion.txt','northanger_abbey.txt','lady_susan.txt', 
    'prideandpredjudice.txt']
#austen = ['hdfs://saltdean/data/library/'+s for s in austen] 
austen = ['../../Data/library/'+s for s in austen] 
#print(austen)

av_RDD = f_ngVec_RDD.map(lambda f_wVec: ('Austen' if (f_wVec[0] in austen) else 'Shakespeare',f_wVec[1])) 

from pyspark.sql import Row
    
row_RDD = av_RDD.map(lambda av: Row(author=av[0], vector=av[1])) 

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Create a dataFrame from the RDD
library_DF = spark.createDataFrame(row_RDD)
library_DF.createOrReplaceTempView("library")
print("library_DF.printSchema()")
library_DF.printSchema()
print("library_DF.show()")
library_DF.show()
#print("library_DF.describe()")
#library_DF.describe()

# SQL can be used over DataFrames that have been registered as a table.
SQL1 = "SELECT author,vector FROM library WHERE author=='Austen'"
austen_vectors = spark.sql(SQL1)
print(SQL1)
austen_vectors.show()

SQL2 = "SELECT author,vector FROM library WHERE author!='Austen'"
other_vectors = spark.sql(SQL2)
print(SQL2)
print(other_vectors.collect())

library_DF.printSchema()
root
 |-- author: string (nullable = true)
 |-- vector: array (nullable = true)
 |    |-- element: long (containsNull = true)

library_DF.show()
+-----------+--------------------+
|     author|              vector|
+-----------+--------------------+
|Shakespeare|[5486, 7575, 2808...|
|Shakespeare|[863, 1502, 550, ...|
|Shakespeare|[1119, 1299, 551,...|
|Shakespeare|[620, 918, 389, 7...|
|Shakespeare|[390, 954, 441, 6...|
|Shakespeare|[2578, 3533, 1383...|
|Shakespeare|[836, 1320, 618, ...|
|Shakespeare|[4003, 5661, 2040...|
|Shakespeare|[1333, 1622, 576,...|
|Shakespeare|[700, 1047, 374, ...|
|Shakespeare|[964, 1071, 400, ...|
|Shakespeare|[5422, 8105, 2580...|
|Shakespeare|[580, 1080, 536, ...|
|Shakespeare|[2710, 4300, 1509...|
|Shakespeare|[1340, 1516, 551,...|
|Shakespeare|[1000, 1251, 424,...|
|Shakespeare|[999, 1251, 424, ...|
|Shakespeare|[4042, 5426, 2028...|
|Shakespeare|[912, 1076, 462, ...|
+-----------+--------------------+

SELECT author,vector FRO

## Make DataFrames from the newsgroups dataset

To use the newsgroups dataset we need to parse the messages by 

In [9]:
import re 

#dirPath1 = '/data/tempstore/newsgroups/alt.atheism'
#dirPath2 = '/data/tempstore/newsgroups/comp.graphics'

dirPath1 = '../../Data/20_newsgroups/alt.atheism'
dirPath2 = '../../Data/20_newsgroups/comp.graphics'


N  = 100 # vector size
NG =  1 # max n-gram size

# remove the headers, get the sender and the main text
def parseMessage(ft): 
    fn,text = ft # unpack the filename and text content 
    # Use a regular expression to match the text
    matchObj = re.search(r'.+^(Lines:|NNTP-Posting-Host:) (.*)', text,re.DOTALL|re.MULTILINE) 
    if(matchObj): # only if the pattern has matched 
        text = matchObj.group(2) # can we replace the text 
    else:
        return None # otherwise we don't return anything
    return (fn,text)

# we need a SparkSession to create DataFrames
spark = SparkSession.builder.getOrCreate()

# we need to create our feature vecotrs using the pyspark.ml.linalg.Vectors class,
# in order to use the CrossValidation later.
from pyspark.ml.linalg import Vectors

# Make a DataFrame with labels and n-Gram vectors 
def make_dataFrame(dirPath, argLabel):
    print("make_dataFrame started")
    ft_RDD = sc.wholeTextFiles(dirPath) # add code to create an RDD with wholeTextFiles
    ft2_RDD = ft_RDD.map(parseMessage) # get (file,text) 
    #print("ft2_RDD.take(2)", ft2_RDD.take(2))
    fng_RDD = ft2_RDD.flatMap(partial(splitFileNGrams,n=NG)) # 
    #print("fng_RDD.take(2)", fng_RDD.take(2))
    print(fng_RDD.count())
    fng2_RDD = fng_RDD.filter(lambda x: x is not None)
    print("fng2_RDD.count()", fng2_RDD.count())
    fng_1_RDD = fng2_RDD.map(lambda x: (x,1))  # change (fs,ng) to ((fs,ng),1) - fs is actually a tuple, but we ignore that here
    fng_c_RDD = fng_1_RDD.reduceByKey(add) #as above
    f_ngcL_RDD = fng_c_RDD.map(reGrpLst) #as above
    f_ngcL2_RDD = f_ngcL_RDD.reduceByKey(add) #<<< create [(w,c), ... ,(w,c)] lists per file 
    f_ngVec_RDD = f_ngcL2_RDD.map(lambda f_wc: (f_wc[0],hashing_vectorizer(f_wc[1],N)))
    # create Row objects with Vectors, as required by the algorithms in the 'ml' package.
    rows_RDD = f_ngVec_RDD.map(lambda f_ngVec: Row( features=Vectors.dense(f_ngVec[1]), weight=1.0, label=float(argLabel)))
    rows_DF = spark.createDataFrame(rows_RDD) # create a DataFrame
    return rows_DF

# for testing the parseMessages function
#ft_RDD = sc.wholeTextFiles(dirPath1) # create an RDD with wholeTextFiles    
#txts = ft_RDD.take(3) # take into a local list
#txts2 = list(map(parseMessage,txts))# and apply removeHeader (NOTE: this is different from an RDD map!)
#print(txts2)

rows1_DF = make_dataFrame(dirPath1, 0)
rows2_DF = make_dataFrame(dirPath1, 1)
rows_DF = rows1_DF.union(rows2_DF)
rows_DF.createOrReplaceTempView("newsgroups")
print(rows_DF.count())
rows_DF.printSchema()

rows_DF.show(5)

make_dataFrame started
325091
fng2_RDD.count() 325091
make_dataFrame started
325091
fng2_RDD.count() 325091
2000
root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)
 |-- weight: double (nullable = true)

+--------------------+-----+------+
|            features|label|weight|
+--------------------+-----+------+
|[32.0,67.0,19.0,4...|  0.0|   1.0|
|[127.0,105.0,28.0...|  0.0|   1.0|
|[12.0,27.0,8.0,18...|  0.0|   1.0|
|[10.0,8.0,1.0,17....|  0.0|   1.0|
|[1.0,1.0,0.0,3.0,...|  0.0|   1.0|
+--------------------+-----+------+
only showing top 5 rows




## 3) Use the spark.ml cross validator

Now we can use the CrossValidator, which comes with the ML module. We only need to set up the parameters 

In [12]:
# We need to import the cliassifers from the ML package now.
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LogisticRegression

# The CrossValidator and ParamGridBuilder enable the automatic tuning
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder

# The evaluator tests the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator


df = sc.parallelize([
     Row(label=1.0, features=Vectors.dense(1.0)),
     Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF()
lr = LogisticRegression(maxIter=5, regParam=0.01)
df.printSchema()
model = lr.fit(df)
print(model.coefficients)
print(model.intercept)
results = model.transform( df)
print(results.show(3))


#df = spark.read.format("libsvm").load('sample_libsvm_data.txt')
#print('df.count()',df.count())

# WARNING the classifiers in ML seem to have a problem with out spam dataset 
# (at least when it has a non-trivial size). 
# I am investigating, the reason is not known at the moment. 
r1 = rows1_DF.head(50)
r2 = rows2_DF.head(50)
df = sc.parallelize(r1+r2).toDF()

lr = LogisticRegression(maxIter=20, regParam=0.0)
df.printSchema()
model = lr.fit(df)
print(model.coefficients)
print(model.intercept)
results = model.transform(df) 
results = results.filter(results.label==1)
print(results.show(5))

evaluator = BinaryClassificationEvaluator()
#lr = LogisticRegression(maxIter=20, regParam=0.0)
#grid = ParamGridBuilder().addGrid(lr.maxIter, [3, 10, 30, 100]).addGrid(lr.regParam, [0, 0.03, 30, 100]).build()
nb = NaiveBayes()
grid = ParamGridBuilder().addGrid(nb.smoothing, [0.03, 0.1, 0.3, 1, 3, 10]).build()
print("starting cross-validation")
#cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
cv = CrossValidator(estimator=nb, estimatorParamMaps=grid, evaluator=evaluator)
cvModel = cv.fit(df)
print("finished cross-validation")
print("cvModel.bestModel", cvModel.bestModel)
evaluator.evaluate(cvModel.transform(df))

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)

[5.48832004997]
-2.6357106786866553
+---------+-----+--------------------+--------------------+----------+
| features|label|       rawPrediction|         probability|prediction|
+---------+-----+--------------------+--------------------+----------+
|    [1.0]|  1.0|[-2.8526093712877...|[0.05454659212542...|       1.0|
|(1,[],[])|  0.0|[2.63571067868665...|[0.93312479504475...|       0.0|
+---------+-----+--------------------+--------------------+----------+

None
root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)
 |-- weight: double (nullable = true)

(100,[],[])
0.0
+--------------------+-----+------+-------------+-----------+----------+
|            features|label|weight|rawPrediction|probability|prediction|
+--------------------+-----+------+-------------+-----------+----------+
|[32.0,67.0,19.0,4...|  1.0|   1.0|   [-0.0,0.0]|  [0.5,0.5]|       0.0|
|[127.0,105.0,28.0..

0.5000000000000001