In [1]:
import pandas as pd
import nltk as nl 
from nltk.stem.porter import *
from nltk.corpus import stopwords
import sklearn as sk
import numpy as np
import matplotlib as mp
import os
import string
import math

Code for Preprocessing and Data Collection

In [2]:
def getArticleDic(path):
    articleDic = {}
    #this was to get rid of ".DS_store" files
    ds_store_file_location = path+'/.DS_store'
    if os.path.isfile(ds_store_file_location):
        os.remove(ds_store_file_location)
    for folder in os.listdir(path):
        folderPath = os.path.join(path, folder)
        folDic = {}
        for fileName in os.listdir(folderPath):
            filePath = os.path.join(folderPath, fileName)
            myfile = open(filePath)
            docid = fileName[:-4]
            file = myfile.readlines()
            docDic = {}
            for line in file:
                line = line.strip()
                if line.startswith("<p>"):
                    line = line.replace("<p>", "").replace("</p>", "")
                    line = line.translate(str.maketrans('','', string.digits)).translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
                    line = line.replace("\\s+", " ")
                    for term in line.split():
                        stemmer = PorterStemmer()
                        term = stemmer.stem(term.lower())
                        stop_words = set(stopwords.words('english'))
                        if len(term) > 2 and term not in stop_words:
                            try:
                                docDic[term] += 1
                            except KeyError:
                                docDic[term] = 1
                folDic[docid] = docDic
            articleDic[folder] = folDic
    return articleDic

In [3]:
def parse_query(query):
    curr_doc = {}
    for line in query.replace('-', ' ').split(' '):
        line = line.strip()
        line = line.translate(str.maketrans('','', string.digits)).translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
        #print(line)
        #print('-----------')
        line = line.replace("\\s+", " ")
        #term = stem(term.lower()) ## for wk 4
        line = line.replace(" ", "")
        stemmer = PorterStemmer()
        line = stemmer.stem(line.lower()) #wk3
        stop_words = set(stopwords.words('english'))
        if len(line) > 2 and line not in stop_words: #wk3
            try:
                curr_doc[line] += 1
            except KeyError:
                curr_doc[line] = 1
    return curr_doc



In [4]:
#getting topics for quieries
def getTitles(path):
    file = open(path)
    IDTitle = []
    for line in file:
        line = line.strip()
        if line.startswith("<num> "):
            for part in line.split():
                if part.startswith("R"):
                    docID = part
        if line.startswith("<title>"):
                #or part in line.split():
                docTitle = line[7:].strip(" ")
                IDTitle.append([docID, docTitle])
    return IDTitle

In [5]:
articleDic = getArticleDic('DataCollection') #gets dictionary with structure folder, ID, term-dictionary
#dataDir = getDirInfo("DataCollection") # gets directory for folders and files for dictionary
#Index_Test = indexDic(articleDic, dataDir)
dataTitle = getTitles("Topics.txt")

Information Likelihood Algotrithim

In [6]:
#this is the one we use
def IRLikelihood(article, query):
    invertList = {}
    idScoreDic = {}
    docLen = {}
    for queryKey in query.keys():
        queryMatch = {}
        for id in article.keys():
            idScoreDic[id]=1
            docLen[id]=0.5
            for key, value in article[id].items():
                if key == queryKey:
                    # to account for zero accourance all words with a value are added 1
                    queryMatch[id] = value + 1
                if not(queryKey in article[id]):
                    # to account for zero occurance all words with zero value = 1
                    queryMatch[id] = 1
            invertList[queryKey] = queryMatch

    for q_term in queryDic.items(): # L may not include all query terms
            if not(q_term[0] in invertList):
                invertList[q_term[0]]={}

    for id in article.keys():
        total = 0
        for value in article[id].values():
            total = total+value
        docLen[id] += total
    
    for (d, sd) in idScoreDic.items():
        for (term, f) in invertList.items():
            if not(d in f):
                f[d]=0.000
            sd = sd*(f[d]/docLen[d])
        idScoreDic[d] = sd
        
    return idScoreDic

In [7]:
LikelihoodRetrived = {}
for id, query in dataTitle:
    LikelihoodRetrivedList = []
    print("Topic {}".format(id))
    coll = str("Dataset" + id[1:])
    queryDic = parse_query(query)
    result1 = IRLikelihood(articleDic[coll], queryDic)
    x1 = sorted(result1.items(), key=lambda x: x[1],reverse=True)
    print('DocID        Weight')
    for doc, value in x1:
        if value > 0.0:
            LikelihoodRetrivedList.append(doc)
    LikelihoodRetrived[id] = LikelihoodRetrivedList
    for (doc, value) in x1:
        if value>0.0:
            print("{}       {}".format(doc, str(value)))

Topic R101
DocID        Weight
46974       0.001721108680841909
46547       0.001721108680841909
26642       0.0008910670527957228
62325       0.0004479157918311357
6146       0.0002938475665748393
63261       0.00018017206432142696
61329       0.00017821341055914458
22513       0.00011925880651749378
82454       8.816592827701735e-05
82912       8.340109672442191e-05
82330       3.4867351519562756e-05
61780       3.1644317867172976e-05
22170       2.5425398702033395e-05
80425       2.438652644413961e-05
81463       2.1734287468553203e-05
39496       1.5475055178243617e-05
77909       1.3271356100344723e-05
27577       1.2354792578476097e-05
30647       1.2354792578476097e-05
26847       1.2354792578476097e-05
83167       1.2268395692566273e-05
80950       1.2268395692566273e-05
18586       7.130112067536421e-06
Topic R102
DocID        Weight
76675       4.994964451462371e-06
78836       4.807084159867362e-06
24550       4.51579144043022e-06
9358       3.1112025458970432e-06
3835      

TDIDF Algorithim

In [8]:
#COULD BE A PROBLEM
def avgDocLen(artDic):
    numDoc = len(artDic)
    totalDocWords = 0  
    for id in artDic.keys():
        docTotal = 0
        for value in artDic[id].values():
            docTotal = docTotal+value
        totalDocWords = totalDocWords+docTotal
    avgDocLen = totalDocWords/numDoc
    return avgDocLen
    

In [9]:
def calc_df(artdic):
    df_ = {}
    for id in artdic.keys():
        for key in artdic[id].keys():
            try:
                df_[key] += 1
            except KeyError:
                df_[key] = 1
    return df_

In [10]:
#gets the TFIDF of a document, dirList is a list of all the keys for folder and document and artDic is the dictionary for the given document, CDF is calculated document frequency
def getTFIDF(artDic, df):
    tfDict = {}
    docFreq = df
    docTotal = 0
    numDoc = len(artDic)

    for value in artDic.values():
        docTotal = docTotal+value
    for key, value in artDic.items():
        tfDict[key] = value/float(docTotal)
    #inverse data frequency
    idfDict = {}
    for word, val in docFreq.items():
        idfDict[word] = math.log10(numDoc / float(val))
    #calculate the TF-IDF
    tfidfDic = {}
    for word, val in tfDict.items():
        tfidfDic[word] = val * idfDict[word]
    return tfidfDic


In [11]:
#remove things that can just be functions within, they dont have to be passed
def getTFIDFScore(artDic, df, qDic):
    TFIDFScore = {}

    for doc in artDic.keys():
        exOutput = getTFIDF(artDic[doc], df)
        queryValue = []
        for key, value in qDic.items():
            try:
                queryValue.append((exOutput[key]*value)+1)
            except KeyError:
                # to account for zero occurance
                queryValue.append(1)
        TFIDFScore[doc] = sum(queryValue)
    return TFIDFScore

In [12]:
TFIDFRetrived = {}

for id, query in dataTitle:
    TFIDFRetrivedDocs = []
    print("Topic {}".format(id))
    coll = str("Dataset" + id[1:])
    queryDic = parse_query(query)
    calwordfreq = calc_df(articleDic[coll])
    TFIDFScore = getTFIDFScore(articleDic[coll], calwordfreq, queryDic)
    x1 = sorted(TFIDFScore.items(), key=lambda x: x[1],reverse=True)
    print('DocID        Weight')
    for doc, value in x1[:10]:
            if value > 0:
                TFIDFRetrivedDocs.append(doc)
    TFIDFRetrived[id] = TFIDFRetrivedDocs
    for doc, value in x1[:10]:
        if value>0:
            print("{}       {}".format(doc, value))
    print("   ")

Topic R101
DocID        Weight
46974 2.06412530377204
46547 2.06412530377204
61329 2.025739727384357
62325 2.022081257341984
61780 2.0147720490722234
6146 2.0139771711668075
22170 2.0093160069150175
22513 2.008258860167147
82330 2.005385948582336
39496 2.0039644731864534
   
Topic R102
DocID        Weight
76635 3.0208674419767894
73038 3.019405859542359
58476 3.0159224727334584
12769 3.0141280949975204
12767 3.0138423831951417
78836 3.013701683930777
86929 3.0132340467143512
86912 3.013205708284556
26061 3.0116564459766746
57914 3.0091715460718818
   
Topic R103
DocID        Weight
81463 3.034166272253641
14314 3.029577009172752
26385 3.025209842617273
27426 3.023196006861074
85889 3.022542941060551
9272 3.0221977179283344
26386 3.0219194745947355
26258 3.0209251454141723
80988 3.020603869589716
20159 3.0204562847739918
   
Topic R104
DocID        Weight
77310 3.012901603936413
25205 3.010953317849685
11923 3.0093514459702195
22751 3.0091106345618686
16954 3.0091101494213586
11930 3.00

BM25 Algorithim

In [13]:
#should pass the dictionary, so artdic[key][key] passed
def articleLen(artDIc):
    totalWords = 0
    for value in artDIc.values():
        totalWords = totalWords + value
    return totalWords

In [14]:
def score_BM25(artDic, q, df):
    #stopwords_f = open('common-english-words.txt', 'r')
    #stop_words = stopwords_f.read().split(',')
    #stopwords_f.close()
    query_result = dict()
    k1 = 1.2
    k2 = 100
    b = 0.75
    R = 0.0
    N = len(artDic)
    r = 0.0

    #qf = parse_query(q, stop_words)
    avdl = avgDocLen(artDic)
    for id in artDic.keys():
        for term in q.keys():
            try:
                n = df[term]
            except KeyError:
                n= 0
            try:
                f = artDic[id][term] +1
            except KeyError:
                f = 1
            K = compute_K(articleLen(artDic[id]), avdl)
            first = math.log10( ( (r + 0.5) / (R - r + 0.5) ) / ( (n - r + 0.5) / (N - n - R + r + 0.5)) )
            second = ((k1 + 1) * f) / (K + f)
            third = ((k2+1) * q[term]) / (k2 + q[term])
            score = first * second * third

            if id in query_result: #this document has already been scored once
                query_result[id] += score
            else:
                query_result[id] = score
    return query_result

"""
    for key, values in query_result.items():
        print("Document ID: {}, Doc Lenght: {} -- BM25 Score: {}".format(key, doc[2], values))
    print("\nThe following are possibly relevant documents retrieved -")
    for key, values in BowDoc.OrderDic(query_result).items():
        print("{} {}".format(key, values))
"""


'\n    for key, values in query_result.items():\n        print("Document ID: {}, Doc Lenght: {} -- BM25 Score: {}".format(key, doc[2], values))\n    print("\nThe following are possibly relevant documents retrieved -")\n    for key, values in BowDoc.OrderDic(query_result).items():\n        print("{} {}".format(key, values))\n'

In [15]:
def compute_K(dl, avdl):
	return 1.2 * ((1-0.75) + 0.75 * (float(dl)/float(avdl)) )

In [16]:
BM25Retrived = {}

for id, query in dataTitle:
    BM25RetrivedDocs = []
    print("Topic {}".format(id))
    coll = str("Dataset" + id[1:])
    queryDic = parse_query(query)
    calwordfreq = calc_df(articleDic[coll])
    score4BM225 = score_BM25(articleDic[coll], queryDic, calwordfreq)
    x1 = sorted(score4BM225.items(), key=lambda x: x[1],reverse=True)
    print('DocID        Weight')
    for doc, value in x1:
        if value > 0:
            BM25RetrivedDocs.append(doc)
    BM25Retrived[id] = BM25RetrivedDocs
    for doc, value in x1[:10]:
        if value > 0:
            print("{}       {}".format(doc, value))
    print("         ")

Topic R101
DocID        Weight
46974       1.4656270142917243
46547       1.4656270142917243
62325       1.2805026674263522
6146       1.2199907640753658
26642       1.201137956949411
63261       1.0696186969748314
61329       1.0495613649942817
22513       1.0253809336255475
82454       0.9854059619478355
82912       0.9781858911275936
         
Topic R102
DocID        Weight
73038       1.7055715242186655
26061       1.4803132777889092
57914       1.4640482418068008
65414       1.4187147543174339
76635       1.3550191387115162
58476       1.3533888979023136
12769       1.3120940427376941
12767       1.2872884428885363
76675       1.254503029943308
24550       1.2492595939925248
         
Topic R103
DocID        Weight
14314       1.7966570235668393
63966       1.5981687770409847
27106       1.5954903010573787
26642       1.5743464651603245
80988       1.5518035075349015
54533       1.5268480956672055
9272       1.5226054969279996
27537       1.5202803638626166
55692       1.516209692

Test and Evaluation

In [17]:
#getting relevant doc IDs for queires
def getFeedback(path): #put all feedbacks into an array by each line
    txts = os.listdir(path)
    wholefeedback=[]
    for txt in txts:
        filePath = os.path.join(path, txt)
        feedback = open(filePath,'r').read() #this had to be changed for difference in operating system, may have to edit for yours
        lines = feedback.split('\n')
        lines.remove('')
        for line in lines:
            wholefeedback.append(line)
    return wholefeedback

In [18]:
def getDsetRelDocs(path): #Remove all 0 relevence feedback
    relFeedback = getFeedback(path)
    removelist = []
    for feedback in relFeedback:
        if feedback.endswith('0'):
            removelist.append(feedback)
    while len(removelist) != 0 :
        for feedback in relFeedback:
            if feedback in removelist:
                relFeedback.remove(feedback)
                removelist.remove(feedback)
    return relFeedback

In [19]:
def getDsetRelDocsTuple(path): #Ruturn tuple of {dataset : [relevence doc(s) id]}
    dataSetRelDocsTuple = {}
    orderTag = []
    orderDic = {}
    for feedback in getDsetRelDocs(path):
        ids = feedback.split(' ')
        tupleBuffer = ids[1]
        try:
            dataSetRelDocsTuple[ids[0]].append(tupleBuffer)    
        except (KeyError):
            dataSetRelDocsTuple[ids[0]] = [tupleBuffer]
    for key in dataSetRelDocsTuple.keys():
        orderTag.append(key[1:])
    orderTag.sort()
    for tag in orderTag:
        orderDic["R{}".format(tag)] = dataSetRelDocsTuple["R{}".format(tag)]

    return orderDic

In [20]:
def findRelevantDocs(relDocs, retrivedDoc):
    relDocsDic = {}
    for key, relList in relDocs.items():
        totalRelDocs = 0
        for value in retrivedDoc[key]:
            if value in relList:
                totalRelDocs = totalRelDocs +1
        relDocsDic[key] = totalRelDocs
    return relDocsDic

In [21]:
def getEval(RelDocs, RetrievedDocs):
    #recallDic = {}
    #percisionDic = {}
    #F1Dic = {}
    collDic = {}
    relTFIDF = findRelevantDocs(RelDocs, RetrievedDocs)
    for key in RelDocs.keys():
        evalList = []
        if relTFIDF[key] + len(RelDocs[key]) == 0:
            recall = 0
        else:
            recall = float(relTFIDF[key])/float(len(RelDocs[key]))
        if relTFIDF[key] + len(RetrievedDocs[key]) == 0:
            percision = 0
        else:
            percision =  float(relTFIDF[key])/float(len(RetrievedDocs[key]))
        
        if percision+recall == 0:
            F1score = 0
        else:
            F1score = float(2*percision*recall)/float((percision+recall))
        
        evalList.append([recall, percision, F1score])
        collDic[key] = [recall, percision, F1score]
    return collDic




In [22]:
relevantID = getDsetRelDocsTuple('RelevanceFeedback')

In [23]:
len(BM25Retrived["R104"])

0

In [24]:
likelihoodScores = getEval(relevantID, LikelihoodRetrived)
Likelihoodpd = pd.DataFrame.from_dict(likelihoodScores, orient='index', columns=['Recall(LikeLihood)', 'Percision (Likelihood)', 'F1 (Likelihood)'])
Likelihoodpd


Unnamed: 0,Recall(LikeLihood),Percision (Likelihood),F1 (Likelihood)
R101,1.0,0.304348,0.466667
R102,1.0,0.678392,0.808383
R103,1.0,0.21875,0.358974
R104,1.0,0.618557,0.764331
R105,1.0,0.432432,0.603774
R106,1.0,0.090909,0.166667
R107,1.0,0.04918,0.09375
R108,1.0,0.056604,0.107143
R109,1.0,0.5,0.666667
R110,1.0,0.054945,0.104167


In [25]:
Likelihoodpd["F1 (Likelihood)"].mean()

0.31671805097388117

In [26]:
TFIDFScores = getEval(relevantID, TFIDFRetrived)
TFIDFpd = pd.DataFrame.from_dict(TFIDFScores, orient='index', columns=['Recall(TFIDF)', 'Percision (TFIDF)', 'F1 (TFIDF)'])
TFIDFpd

Unnamed: 0,Recall(TFIDF),Percision (TFIDF),F1 (TFIDF)
R101,0.714286,0.5,0.588235
R102,0.02963,0.4,0.055172
R103,0.142857,0.2,0.166667
R104,0.058333,0.7,0.107692
R105,0.5,0.8,0.615385
R106,0.25,0.1,0.142857
R107,0.666667,0.2,0.307692
R108,0.666667,0.2,0.307692
R109,0.35,0.7,0.466667
R110,0.8,0.4,0.533333


In [27]:
TFIDFpd["F1 (TFIDF)"].mean()

0.32521706873907946

In [28]:
BM25Scores = getEval(relevantID, BM25Retrived)
BM25pd = pd.DataFrame.from_dict(BM25Scores, orient='index', columns=['Recall(BM25)', 'Percision (BM25)', 'F1 (BM25)'])
BM25pd


Unnamed: 0,Recall(BM25),Percision (BM25),F1 (BM25)
R101,1.0,0.304348,0.466667
R102,1.0,0.678392,0.808383
R103,1.0,0.21875,0.358974
R104,0.0,0.0,0.0
R105,0.0,0.0,0.0
R106,0.5,0.095238,0.16
R107,1.0,0.04918,0.09375
R108,1.0,0.056604,0.107143
R109,0.0,0.0,0.0
R110,1.0,0.054945,0.104167


In [29]:
BM25pd["F1 (BM25)"].mean()

0.17738502960049604

In [30]:
print("Mean of Recall score for Likelihood model is {}, TFIDF is {}, BM25 is {}".format(round(Likelihoodpd["Recall(LikeLihood)"].mean(), 3), round(TFIDFpd["Recall(TFIDF)"].mean(), 3), round(BM25pd["Recall(BM25)"].mean(), 3)))
print(" ")
print("Mean of Percision score for Likelihood model is {}, TFIDF is {}, BM25 is {}".format(round(Likelihoodpd["Percision (Likelihood)"].mean(), 3), round(TFIDFpd["Percision (TFIDF)"].mean(), 3), round(BM25pd["Percision (BM25)"].mean(), 3)))
print(" ")
print("Mean of F1 score for Likelihood model is {}, TFIDF is {}, BM25 is {}".format(round(Likelihoodpd["F1 (Likelihood)"].mean(), 3), round(TFIDFpd["F1 (TFIDF)"].mean(), 3), round(BM25pd["F1 (BM25)"].mean(), 3)))


Mean of Recall score for Likelihood model is 1.0, TFIDF is 0.447, BM25 is 0.566
 
Mean of Percision score for Likelihood model is 0.207, TFIDF is 0.318, BM25 is 0.12
 
Mean of F1 score for Likelihood model is 0.317, TFIDF is 0.325, BM25 is 0.177


In [31]:
from scipy.stats import ttest_ind

LHTF_Ttest = ttest_ind(Likelihoodpd['F1 (Likelihood)'], TFIDFpd['F1 (TFIDF)'])
print("T-test for signifigant difference between F1 scores of Likelihood model and TFIDF model with signifigance set to P<0.05")
print("P value = {}".format(LHTF_Ttest[1]))
LHBM_Ttest = ttest_ind(Likelihoodpd['F1 (Likelihood)'], BM25pd['F1 (BM25)'])
print("T-test for signifigant difference between F1 scores of Likelihood model and BM25 model with signifigance set to P<0.05")
print("P value = {}".format(LHBM_Ttest[1]))
TFBM_Ttest = ttest_ind(TFIDFpd['F1 (TFIDF)'], BM25pd['F1 (BM25)'])
print("T-test for signifigant difference between F1 scores of TFIDF model and BM25 model with signifigance set to P<0.05")
print("P value = {}".format(TFBM_Ttest[1]))

T-test for signifigant difference between F1 scores of Likelihood model and TFIDF model with signifigance set to P<0.05
P value = 0.8296143443267455
T-test for signifigant difference between F1 scores of Likelihood model and BM25 model with signifigance set to P<0.05
P value = 0.0005141252529721997
T-test for signifigant difference between F1 scores of TFIDF model and BM25 model with signifigance set to P<0.05
P value = 0.00021260001592794274
