In [12]:
# This cell doesn't depend on the previous ones
# It can run by itself!

# All imports this cell needs, gathered in one place (standard library first).
import re

import nltk
import gensim
from gensim import corpora, models, similarities
from nltk.corpus import PlaintextCorpusReader, stopwords
# Import the one name that is used instead of `import *`, so it is clear where
# PorterStemmer comes from and nothing else leaks into the notebook namespace.
from nltk.stem.porter import PorterStemmer

# Load every .txt file found under the 'Dataset' folder as one corpus.
# The file pattern is a raw string: `\.` must reach the regex engine as an escaped
# dot, and in a normal string literal `\.` is an invalid escape sequence.
speechCorpus = PlaintextCorpusReader('Dataset', r'.+\.txt')
speechFids = speechCorpus.fileids()

# Fail early with a readable message rather than building indexes over nothing.
if not speechFids:
    raise FileNotFoundError("No .txt files found in the 'Dataset' folder; nothing to index.")

# One token list per file, in the same order as speechFids. Every list built below
# keeps that order, so document number i always refers to speechFids[i].
speechDocs = [speechCorpus.words(f) for f in speechFids]

# Convert words to lower case
speechLowered = [[w.lower() for w in doc] for doc in speechDocs]

# Keep purely alphabetic tokens, plus any token containing '377a' (the one
# alphanumeric term that is deliberately retained). Compile the pattern once
# instead of handing the pattern string to re.search for every token.
alphaPattern = re.compile('^[a-z]+$|377a')
speechAlpha = [[w for w in doc if alphaPattern.search(w)] for doc in speechLowered]

# Remove stop words. stopList stays a list for any code that expects one; the set
# copy is used for the membership test so each lookup is constant time.
stopList = stopwords.words('english')
stopSet = set(stopList)
speechStopped = [[w for w in doc if w not in stopSet] for doc in speechAlpha]

# Stem the words: Porter stemming.
stemmer = PorterStemmer()
speechStemmed = [[stemmer.stem(w) for w in doc] for doc in speechStopped]

# Create a dictionary mapping each distinct stem to an integer id
speechDictionary = corpora.Dictionary(speechStemmed)

# Convert documents to bag-of-words vectors: lists of (stem id, count) pairs
speechVectors = [speechDictionary.doc2bow(doc) for doc in speechStemmed]

# Both similarity indexes need the vocabulary size as their number of features.
numFeatures = len(speechDictionary)

# Similarity index over the raw term-count vectors
speechIndex = similarities.SparseMatrixSimilarity(speechVectors, num_features=numFeatures)

# Similarity index over the TF-IDF weighted vectors. The fitted model is kept in
# speechTFIDF so that queries can be weighted with the same statistics later.
speechTFIDF = models.TfidfModel(speechVectors)
speechVectorsWithTFIDF = [speechTFIDF[vec] for vec in speechVectors]
speechIndexWithTFIDF = similarities.SparseMatrixSimilarity(speechVectorsWithTFIDF, num_features=numFeatures)

In [17]:
# A query string
query = "haze from indonesia"

# How many top-ranked files to report for each scoring scheme
TOP_N = 5


def topFiles(ranked, fileIds, topN=TOP_N):
    """Translate the best-ranked (document number, score) pairs into (filename, score).

    ranked  -- list of (document number, score) tuples, already sorted best first
    fileIds -- list of filenames; document number i corresponds to fileIds[i]
    topN    -- maximum number of results to return

    Only the first topN entries are considered, and entries whose score is not
    positive are dropped, so a query that matches nothing yields an empty list
    instead of a handful of arbitrary files with score 0.
    """
    return [(fileIds[docNo], score) for docNo, score in ranked[:topN] if score > 0]


def printResults(title, results):
    """Print a title line followed by one (filename, score) tuple per line."""
    print(title)
    if results:
        # Using the * operator, you can use the separator
        # See https://treyhunner.com/2018/10/asterisks-in-python-what-they-are-and-how-to-use-them/
        print(*results, sep = '\n')
    else:
        print("(no matching documents)")


# Tokenize on runs of word characters rather than on whitespace. The corpus words
# came from NLTK's reader, whose default tokenizer separates punctuation from
# words, so a plain split() would leave tokens such as "haze," that can never match
# a dictionary entry. (`re` is imported by the cell that builds the corpus.)
qList = re.findall(r'\w+', query)
qLower = [w.lower() for w in qList]

# Stem it. Stop words are not filtered here: doc2bow below ignores every token
# that is not in the dictionary, and stop words were removed before it was built.
qStemmed = [stemmer.stem(w) for w in qLower]

# Create a query vector using the same dictionary as the corpus
qVector = speechDictionary.doc2bow(qStemmed)

# Get its TFIDF from the same model as the corpus
qVectorTFIDF = speechTFIDF[qVector]

# Get the similarities from the two indexes (raw and TFIDF)
simRaw = speechIndex[qVector]
simTFIDF = speechIndexWithTFIDF[qVectorTFIDF]

# Pair each score with its document number and sort, highest score first
simSorted = sorted(enumerate(simRaw), key = lambda item: -item[1])
simTFIDFSorted = sorted(enumerate(simTFIDF), key = lambda item: -item[1])

# Translate the document numbers to filenames. speechFids is the file list the
# corpus vectors were built from, so document number i is speechFids[i].
fRaw = topFiles(simSorted, speechFids)
fTFIDF = topFiles(simTFIDFSorted, speechFids)

printResults("Search results using RAW scores. (Filename, score):", fRaw)
printResults("\nSearch results using TFIDF scores. (Filename, score):", fTFIDF)

Search results using RAW scores. (Filename, score):
('2015_chi.txt', 0.053266563)
('2016_eng.txt', 0.030846732)
('2015_eng.txt', 0.023017528)
('2006_eng.txt', 0.018219689)
('2010_mal.txt', 0.015515535)

Search results using TFIDF scores. (Filename, score):
('2015_chi.txt', 0.06582709)
('2015_eng.txt', 0.053857617)
('2016_eng.txt', 0.039161)
('2006_eng.txt', 0.017614689)
('2014_eng.txt', 0.013898119)
