In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from nltk.stem import PorterStemmer
import pickle
import os
import math
from num2words import num2words  
ps = PorterStemmer()

In [2]:
def getAllFoldersPath():
    filesList=[]
    path="20_newsgroups"
    filesList =  [x[0] for x in os.walk(path)]
    return filesList[1:]

In [3]:
def getNumWordFixed(fileContent):
    result = []
    for word in fileContent:
            if(word.isnumeric()):
                word = num2words(word)
                tokenizer=RegexpTokenizer(r'([A-Za-z0-9]+)')
                tokensList=tokenizer.tokenize(word)
                result+=tokensList
            else:
                result.append(word)
                
    return result

In [4]:
def removeStopWords(words):
    filteredWords = []
    stop_words=stopwords.words('english')
    for word in words:
        if word not in stop_words:
            word = ps.stem(word)
            filteredWords.append(word)
    return filteredWords

def loadFromPickle(pickleFile):
	file = open(pickleFile,'rb')
	pickleData = pickle.load(file)
	file.close()
	return pickleData

def saveInPickle(data, pickleFile):
	file = open(pickleFile,"wb")
	pickle.dump(data,file)
	file.close()

In [5]:
def getSortedFilePaths(foldersPathList):
    sortedPaths = {}
    for folderCount, path in enumerate(foldersPathList):
        sortedPaths[path] = []
        for r, d, f in os.walk(path):
            for file in f:
                docId = int(file)
                docId = docId*100+folderCount
                sortedPaths[path].append(docId)
                
        sortedPaths[path].sort()
    return sortedPaths

In [6]:
def getAllTokens(sortedPaths):
    count=0
    staticScore = {}
    docIdsSet = []
    docWordsDictionary = {}
    vocab = []
    for path in sortedPaths:
        print(path)
        for file in sortedPaths[path]:
            filePath = str(int(file/100))
            filePath = path + "/" + filePath
            fileReference = open(filePath, "r", encoding = "ISO-8859-1")
            fileContent=fileReference.read()
            fileContent = fileContent.lower()                
            tokenizer=RegexpTokenizer(r'([A-Za-z0-9]+)')
            tokensList=tokenizer.tokenize(fileContent)
            fileContent = getNumWordFixed(tokensList)
            filteredWords = removeStopWords(fileContent)
            docWordsDictionary[file] = filteredWords
            vocab+=filteredWords
    return docWordsDictionary, list(set(vocab))

In [31]:
foldersPathList = getAllFoldersPath()
foldersPathList.sort()
sortedPaths = getSortedFilePaths(foldersPathList)

In [31]:
docWords, vocab = getAllTokens(sortedPaths)

20_newsgroups/sci.med
20_newsgroups/sci.space
20_newsgroups/comp.os.ms-windows.misc
20_newsgroups/rec.autos
20_newsgroups/rec.motorcycles
20_newsgroups/misc.forsale
20_newsgroups/alt.atheism
20_newsgroups/sci.electronics
20_newsgroups/rec.sport.hockey
20_newsgroups/comp.graphics
20_newsgroups/talk.politics.misc
20_newsgroups/talk.politics.guns
20_newsgroups/sci.crypt
20_newsgroups/comp.sys.ibm.pc.hardware
20_newsgroups/comp.windows.x
20_newsgroups/talk.politics.mideast
20_newsgroups/soc.religion.christian
20_newsgroups/comp.sys.mac.hardware
20_newsgroups/talk.religion.misc
20_newsgroups/rec.sport.baseball


In [90]:
saveInPickle(docWords, "docWords")
saveInPickle(vocab, "vocab")

In [8]:
def getStaticQualityScore(foldersPathList, sortedPaths):
    qualityScores = {}
    count = 0
    max = 0
    scoreFile = open("file.txt", "r")
    scores = scoreFile.read()
    scores = scores.split("\n")
    scores = scores[:-1]
    for path in foldersPathList:
        print(path)
        for file in sortedPaths[path]:
            score = scores[count].split()
#             print(file,score)
            qualityScores[file] = float(score[1])
            if(max < float(score[1])):
                max = float(score[1])
            count+=1
        print(count)
    print(max)
    for file in qualityScores:
        qualityScores[file] = qualityScores[file]/max
    return qualityScores

In [75]:
qualityScores = getStaticQualityScore(foldersPathList, sortedPaths)

20_newsgroups/alt.atheism
1000
20_newsgroups/comp.graphics
2000
20_newsgroups/comp.os.ms-windows.misc
3000
20_newsgroups/comp.sys.ibm.pc.hardware
4000
20_newsgroups/comp.sys.mac.hardware
5000
20_newsgroups/comp.windows.x
6000
20_newsgroups/misc.forsale
7000
20_newsgroups/rec.autos
8000
20_newsgroups/rec.motorcycles
9000
20_newsgroups/rec.sport.baseball
10000
20_newsgroups/rec.sport.hockey
11000
20_newsgroups/sci.crypt
12000
20_newsgroups/sci.electronics
13000
20_newsgroups/sci.med
14000
20_newsgroups/sci.space
15000
20_newsgroups/soc.religion.christian
15997
20_newsgroups/talk.politics.guns
16997
20_newsgroups/talk.politics.mideast
17997
20_newsgroups/talk.politics.misc
18997
20_newsgroups/talk.religion.misc
19997
49.0


In [91]:
saveInPickle(qualityScores, "qualityScores")

In [79]:
qualityScores[5112000]

0.16326530612244897

In [9]:
def getTfValues(docWords):
    tf = {}
    count = 0
    for doc in docWords:
        for word in docWords[doc]:
            if word not in tf:
                tf[word] = {}
            if doc not in tf[word]:
                tf[word][doc] = 0
            tf[word][doc]+=1
    return tf

In [54]:
tfValues = getTfValues(docWords)

In [18]:
saveInPickle(tfValues, "tfValues")

NameError: name 'tfValues' is not defined

In [10]:
def processTupleList(tupleList):
    plainList = []
    for item in tupleList:
        plainList.append(item[0])
    return plainList

In [11]:
def sortByQualityScore(docList):
    docScoreList = []
    for doc in docList:
        docScoreList.append((doc, qualityScores[doc]))
    docScoreList.sort(key = lambda x: x[1], reverse = True)
    docScoreList = processTupleList(docScoreList)
    return docScoreList

In [12]:
def getChampionList(tfValues, r):
    championList = {}
    for word in tfValues:
        postList = []
        for doc in tfValues[word]:
            postList.append((doc, tfValues[word][doc]))
        postList.sort(key = lambda x: x[1], reverse = True)  
        championList[word] = {}
        championList[word]["high"] = sortByQualityScore(processTupleList(postList[:r]))
        championList[word]["low"] = processTupleList(postList[r:])
    return championList

In [87]:
globalChampList = getChampionList(tfValues, 20)

In [49]:
globalChampList50 = getChampionList(tfValues, 50)

In [53]:
globalChampList100 = getChampionList(tfValues, 100)

In [54]:
len(globalChampList100)

141519

In [93]:
saveInPickle(globalChampList, "globalChampList")

In [55]:
saveInPickle(globalChampList50, "globalChampList50")
saveInPickle(globalChampList100, "globalChampList100")

In [13]:
def getIdfVal(idfCount):
    N= len(docWords)
#     print(N, idfCount)
    return math.log(N/(1+idfCount), 10)

def tfVariant(freq):
    return 1 + math.log(freq, 10)

def getIdf(tfValues):
    idf = {}
    for word in tfValues:
        nt = len(tfValues[word])
        idf[word] = getIdfVal(nt)
    return idf

In [107]:
idf = getIdf(tfValues)

In [108]:
idf

{'calvinist': 3.39787485961345,
 'islamistyki': 3.9999348509414125,
 'csuohio': 2.9999348509414125,
 'pc': 1.057182930511599,
 'vyjx': 3.9999348509414125,
 'tqv': 3.9999348509414125,
 'bev': 3.5228135962217504,
 'disproven': 3.5228135962217504,
 'oxz12b1w164w': 3.259572161447169,
 'pcq': 3.9999348509414125,
 'rostream': 3.8238435918857316,
 'c5sdx7': 3.9999348509414125,
 'godada': 3.6019948422693746,
 '1otn': 3.9999348509414125,
 'g56': 3.8238435918857316,
 'stiegerwald': 3.9999348509414125,
 'hiroto': 3.8238435918857316,
 'd5az5': 3.9999348509414125,
 'gjt': 3.9999348509414125,
 'isbm': 3.9999348509414125,
 'holmertz': 3.9999348509414125,
 '100nm': 3.9999348509414125,
 'edlin': 3.6989048552774313,
 'misleadingli': 3.8238435918857316,
 '1080i': 3.9999348509414125,
 'abqhh': 3.5228135962217504,
 'unidentifi': 3.07051592522712,
 'rimail': 3.6989048552774313,
 'm7iygmpk': 3.9999348509414125,
 'acapulco': 3.8238435918857316,
 'mamuretulaziz': 3.6019948422693746,
 '2pl': 3.6019948422693746,

In [109]:
saveInPickle(idf, "idf")

In [14]:
def preprocesQuery(query):
    query = query.lower()
    tokenizer=RegexpTokenizer(r'([A-Za-z0-9]+)')
    tokensList=tokenizer.tokenize(query)
    fileContent = getNumWordFixed(tokensList)
    filteredWords = removeStopWords(fileContent)
    return filteredWords

In [15]:
def checkOutOfVocab(query):
    for word in query:
        if word not in vocab:
            print("query word :", word,"is out of vocab")
            break

In [98]:
query

['american', 'atheist', 'press', 'publish']

In [35]:
def getCommonDocsList(query, k,champList):
    commonhighList = champList[query[0]]["high"]
    commonLowList = []
#     print(commonhighList)
    for word in query:
        highList  = champList[word]["high"]
#         print("\n",highList)
        commonhighList = [x for x in highList if x in commonhighList]
    if(len(commonhighList)>=k):
        return commonhighList, commonLowList
    else:
        commonLowList = champList[query[0]]["low"]
        for word in query:
            lowList = champList[word]["low"]
            commonLowList = [x for x in lowList if x in commonLowList]
            
    return commonhighList,commonLowList

def getUnionDocsList(query, k,champList):
    commonhighList = champList[query[0]]["high"]
    commonLowList = []
    for word in query:
        highList  = champList[word]["high"]
        commonhighList = commonhighList + highList
        commonhighList = list(set(commonhighList))
    if(len(commonhighList)>=k):
        return list(set(commonhighList)), list(set(commonLowList))
    else:
        commonLowList = champList[query[0]]["low"]
        for word in query:
            lowList = champList[word]["low"]
#             commonLowList = [x for x in lowList if x in commonLowList]
            commonLowList = commonLowList + lowList
            
    return list(set(commonhighList)), list(set(commonLowList))

In [17]:
def generateQueryVector(query):
    queryVector = []
    queryUnique = list(set(query))     
    for word in queryUnique:
        if(word in vocab):
            tf = 1 + math.log(query.count(word), 10)
            tfIdf = tf*idf[word]    
        else:
            tfIdf = 0
        queryVector.append(tfIdf)
    return queryVector

def generateDocVector(query, doc):
    docVector = []
    for word in list(set(query)):
        sim = 0
        if(doc in tfValues[word]):
            tfIdf = tfValues[word][doc]*idf[word]
        else:
            tfIdf = 0
        docVector.append(tfIdf)    
    return docVector

def cosineSimilarity(queryVector, docVector):
    score = 0
    qMod = dMod = 0
    for i in range(len(queryVector)):
        score+=queryVector[i]*docVector[i]
        qMod+=queryVector[i]*queryVector[i]
        dMod+=docVector[i]*docVector[i]
    if(qMod==0 or dMod==0):
        score = 0
    else:
        qMod = math.sqrt(qMod)
        dMod = math.sqrt(dMod)
        score = score/(qMod*dMod)
    return score

In [18]:
def getResults(scoreDocList, k):
    docList = []
    count = 0
    scoreDocList.sort(key = lambda x: x[1], reverse = True)
    if(len(scoreDocList)>= k):
        count = k
    else:
        count = len(scoreDocList)
    flag = {}
    for item in scoreDocList:
        docList.append(item[0])
        docId = item[0]
        if docId in flag:
            pass
        flag[docId] = 1
        staticScore = qualityScores[docId]
        folder = int(docId%100)
        docId = int(docId/100)
        print(foldersPathList[folder], docId)
        count-=1
        if(count==0):
            break

In [20]:
def calculateNetScore(highList, lowList):
    queryUnique = list(set(query)) 
    queryVector = generateQueryVector(queryUnique)
    scoreDocList = []
    for doc in highList:
        docVector = generateDocVector(queryUnique, doc)
        sim = cosineSimilarity(queryVector, docVector)
        netScore = qualityScores[doc] + sim
        scoreDocList.append((doc, netScore))
        
    if(len(scoreDocList) < k):
        for doc in lowList:
            docVector = generateDocVector(queryUnique, doc)
            sim = cosineSimilarity(queryVector, docVector)
            netScore = qualityScores[doc] + sim
            scoreDocList.append((doc, netScore))

    return scoreDocList

In [21]:
#######--------------------loading data from pickle-----------------------------------------------------
docWords = loadFromPickle("docWords")
vocab = loadFromPickle("vocab")
globalChampList = loadFromPickle("globalChampList")
globalChampList50 = loadFromPickle("globalChampList50")
globalChampList100 = loadFromPickle("globalChampList100")

qualityScores = loadFromPickle("qualityScores")
idf = loadFromPickle("idf")
tfValues = loadFromPickle("tfValues")

In [22]:
###########---------------query input --------------------

query = input("enter the query string : ")
query = preprocesQuery(query)
k = int(input("enter the value of k :"))

enter the query string : this is not false
enter the value of k :10


In [26]:
query

['fals']

In [24]:
checkOutOfVocab(query)

In [37]:
#############-----------(set r as per your wish )-----------------------------------

globalChampListBig = getChampionList(tfValues, 20)

print("\n -------------------------high and low list union algorithm based retrieval")
commonHighList, commonLowList = getUnionDocsList(query, k, globalChampListBig)
scoreDocList = calculateNetScore(commonHighList, commonLowList)
getResults(scoreDocList, k)


print("\n ----------------------- high and low intersection algorithm based retrieval")
commonHighList, commonLowList = getCommonDocsList(query, k, globalChampListBig)
scoreDocList = calculateNetScore(commonHighList, commonLowList)
getResults(scoreDocList, k)





 -------------------------high and low list union algorithm based retrieval
20_newsgroups\comp.windows.x 67012
20_newsgroups\soc.religion.christian 21542
20_newsgroups\comp.windows.x 66409
20_newsgroups\talk.religion.misc 83699
20_newsgroups\talk.politics.misc 178688
20_newsgroups\comp.windows.x 68332
20_newsgroups\comp.windows.x 66322
20_newsgroups\talk.religion.misc 84066
20_newsgroups\alt.atheism 54189
20_newsgroups\alt.atheism 53607

 ----------------------- high and low intersection algorithm based retrieval
20_newsgroups\comp.windows.x 67012
20_newsgroups\soc.religion.christian 21542
20_newsgroups\comp.windows.x 66409
20_newsgroups\talk.religion.misc 83699
20_newsgroups\talk.politics.misc 178688
20_newsgroups\comp.windows.x 68332
20_newsgroups\comp.windows.x 66322
20_newsgroups\talk.religion.misc 84066
20_newsgroups\sci.crypt 15987
20_newsgroups\alt.atheism 53607


In [22]:
###############################################
###############################################----------stop here !!!!!!!!!!!!!!!!!!!!!!!!!!!!
###############################################
###############################################

In [175]:
####----r = 20 -----------------------------################
commonHighList, commonLowList = getCommonDocsList(query, k, globalChampList)
scoreDocList = calculateNetScore(commonHighList, commonLowList)
getResults(scoreDocList, k)

20_newsgroups/comp.sys.ibm.pc.hardware 60381
20_newsgroups/comp.sys.ibm.pc.hardware 60172
20_newsgroups/comp.sys.mac.hardware 52004
20_newsgroups/comp.sys.ibm.pc.hardware 60392
20_newsgroups/talk.politics.mideast 75886
20_newsgroups/talk.politics.misc 178918
20_newsgroups/talk.politics.mideast 76071
20_newsgroups/rec.autos 102764


In [171]:
####----r = 50 -----------------------------################
commonHighList, commonLowList = getCommonDocsList(query, k, globalChampList50)
scoreDocList = calculateNetScore(commonHighList, commonLowList)
getResults(scoreDocList, k)

20_newsgroups/comp.sys.ibm.pc.hardware 60381
20_newsgroups/comp.sys.ibm.pc.hardware 60392
20_newsgroups/talk.politics.mideast 75886
20_newsgroups/talk.politics.misc 178918
20_newsgroups/talk.politics.mideast 76071
20_newsgroups/rec.autos 102764


In [172]:
####----r = 100 -----------------------------################
globalChampList100 = getChampionList(tfValues, 100)
commonHighList, commonLowList = getCommonDocsList(query, k, globalChampList100)
scoreDocList = calculateNetScore(commonHighList, commonLowList)
getResults(scoreDocList, k)

20_newsgroups/comp.sys.ibm.pc.hardware 60381
20_newsgroups/comp.sys.ibm.pc.hardware 60392
20_newsgroups/rec.autos 102764


In [173]:
####----r = 250 -----------------------------################
globalChampList250 = getChampionList(tfValues, 250)
commonHighList, commonLowList = getCommonDocsList(query, k, globalChampList250)
scoreDocList = calculateNetScore(commonHighList, commonLowList)
getResults(scoreDocList, k)

20_newsgroups/comp.sys.mac.hardware 51892
20_newsgroups/sci.crypt 15177


In [43]:
query

['new',
 'sinc',
 'version',
 'eighteen',
 'april',
 'one',
 'thousand',
 'nine',
 'hundr',
 'nineti',
 'three']