In [1]:
import sys
from importlib import reload
import logging
import csv
from nltk.stem.porter import PorterStemmer
import string
import gensim
import pickle
from gensim.summarization.bm25 import BM25


# Single Porter stemmer instance shared by preprocess().
porterStemmer = PorterStemmer()
# Configure root logging: timestamped records at INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def removeNonAscii(s):
    """Return `s` with every character whose code point is 128 or higher dropped."""
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_chars)


def preprocess(passage):
    """Turn raw text into a list of stemmed, stopword-free tokens.

    Steps, in order: lower-case, drop non-ASCII characters, delete every
    character listed in ``string.punctuation``, split on whitespace, discard
    tokens found in gensim's STOPWORDS, and Porter-stem what is left.

    Returns:
        list of stemmed tokens, in their original order (may be empty).
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    text = removeNonAscii(passage.lower()).translate(strip_punctuation)
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [porterStemmer.stem(token) for token in text.split() if token not in stopwords]



In [3]:
from collections import defaultdict
import csv


In [2]:


# Passage id (int) -> passage text for the whole collection.
# NOTE: defaultdict(str) returns (and stores) "" for an unknown id, so a bad docid
# downstream looks like an empty passage rather than raising KeyError.
collections = defaultdict(str)
# Explicit utf-8 matches how the query file is opened and removes the dependence on
# the platform's default codec; newline="" is what the csv module requires of its
# input file so that line endings are handled by the reader itself.
# assumes the cleaned collection is ASCII/UTF-8 text -- TODO confirm
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.tsv",
          encoding="utf-8", newline="") as fr:
    reader = csv.reader(fr, delimiter="\t")
    for row in reader:
        # column 0: passage id, column 1: passage text
        collections[int(row[0])] = row[1]

In [4]:
queryFile = "C:\\Users\\mapyredd\\Documents\\marco\\data\\top15.eval.tsv"
# qid -> query text (the last row seen for a qid wins).
query_text = defaultdict(str)
# qid -> passage ids listed for that query, in file order.
qrels = defaultdict(list)
# newline="" is what the csv module requires of its input file so that the reader,
# not the text layer, interprets line endings.
with open(queryFile, 'r', encoding="utf-8", newline="") as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        # column 0: query id, column 1: passage id, column 2: query text
        qid = int(row[0])  # parse once, use for both maps
        query_text[qid] = row[2]
        qrels[qid].append(int(row[1]))

In [5]:
# Sanity check: both maps are keyed by the same qid column, so the counts should match.
print(len(query_text))
print(len(qrels))

6837
6837


In [6]:
import pickle

# Load the previously pickled BM25 model.
# NOTE(review): pickle.load executes code embedded in the file -- only load pickles
# from a trusted source.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\model\\bm25.pkl", 'rb') as fr:
    bm25 = pickle.load(fr)

# Mean of the model's per-term IDF values; handed to bm25.get_score() when scoring.
average_idf = sum(float(bm25.idf[term]) for term in bm25.idf.keys()) / len(bm25.idf.keys())

In [7]:
def getBM25Feature():
    """Score every (query, listed passage) pair with the loaded BM25 model.

    Reads the module-level ``qrels``, ``query_text``, ``bm25`` and
    ``average_idf``.

    Returns:
        defaultdict(float) mapping ``(qid, docid, 1)`` to the value returned by
        ``bm25.get_score`` for the preprocessed query and ``docid``. The third
        key component is always the constant 1.
    """
    bm25_by_pair = defaultdict(float)
    for qid, candidates in qrels.items():
        terms = preprocess(query_text[qid])
        bm25_by_pair.update(
            ((qid, docid, 1), bm25.get_score(terms, docid, average_idf))
            for docid in candidates
        )
    return bm25_by_pair

In [8]:
def getDocLengthFeature():
    """Record the whitespace-token count of every listed passage.

    Reads the module-level ``qrels`` and ``collections``.

    Returns:
        defaultdict(float) mapping ``(qid, docid, 1)`` to the number of
        whitespace-separated tokens in the passage (stored as an int). The
        third key component is always the constant 1.
    """
    lengths = defaultdict(float)
    pairs = ((qid, docid) for qid, candidates in qrels.items() for docid in candidates)
    for qid, docid in pairs:
        tokens = collections[docid].split()
        lengths[(qid, docid, 1)] = len(tokens)
    return lengths

In [9]:
def getCoverageFeature():
    """Measure how many distinct query terms occur in each listed passage.

    Reads the module-level ``qrels``, ``query_text`` and ``collections``.
    Prints the number of (query, passage) pairs whose preprocessed query is
    empty.

    Returns:
        defaultdict(tuple) mapping ``(qid, docid, 1)`` to
        ``(overlap, overlap / (len(query_terms) + 1))`` where ``overlap`` is the
        count of distinct query terms that are also passage tokens. The third
        key component is always the constant 1.
    """
    empty_query_pairs = 0
    coverage = defaultdict(tuple)
    for qid, candidates in qrels.items():
        terms = preprocess(query_text[qid])
        unique_terms = set(terms)
        # +1 keeps the ratio defined when the query has no terms left
        denominator = len(terms) + 1
        for docid in candidates:
            overlap = len(unique_terms & set(collections[docid].split()))
            coverage[(qid, docid, 1)] = (overlap, float(overlap) / denominator)
            if not terms:
                empty_query_pairs += 1

    print (empty_query_pairs)

    return coverage

In [10]:
from gensim.models import TfidfModel
from gensim import corpora
from gensim.corpora import Dictionary

# Load the saved gensim Dictionary; its doc2bow() and token2id are used by the cells below.
dictionary = corpora.Dictionary.load("C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.dict")

class MyCorpus(object):
    """Streams the cleaned collection as bag-of-words vectors, one per TSV row.

    The file is re-opened on every iteration, so the corpus can be traversed
    repeatedly without keeping the collection in memory.
    """

    def __iter__(self):
        # Explicit utf-8 removes the dependence on the platform's default codec, and
        # newline="" is what the csv module requires of its input file.
        # assumes the cleaned collection is ASCII/UTF-8 text -- TODO confirm
        with open('C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.tsv',
                  encoding="utf-8", newline="") as fr:
            reader = csv.reader(fr, delimiter = "\t")
            for row in reader:
                # column 1 is the passage text: lower-case it and split on whitespace
                yield dictionary.doc2bow(row[1].lower().split())

# Fit the TF-IDF model by streaming MyCorpus; its idfs table and transform are used by the feature functions.
tfidfmodel = TfidfModel(MyCorpus())

2018-07-22 18:28:44,481 : INFO : loading Dictionary object from C:\Users\mapyredd\Documents\marco\data\collection_cleaned.dict
2018-07-22 18:28:45,508 : INFO : loaded C:\Users\mapyredd\Documents\marco\data\collection_cleaned.dict
2018-07-22 18:28:45,528 : INFO : collecting document frequencies
2018-07-22 18:28:45,538 : INFO : PROGRESS: processing document #0
2018-07-22 18:28:45,809 : INFO : PROGRESS: processing document #10000
2018-07-22 18:28:46,096 : INFO : PROGRESS: processing document #20000
2018-07-22 18:28:46,375 : INFO : PROGRESS: processing document #30000
2018-07-22 18:28:46,680 : INFO : PROGRESS: processing document #40000
2018-07-22 18:28:46,964 : INFO : PROGRESS: processing document #50000
2018-07-22 18:28:47,249 : INFO : PROGRESS: processing document #60000
2018-07-22 18:28:47,544 : INFO : PROGRESS: processing document #70000
2018-07-22 18:28:47,851 : INFO : PROGRESS: processing document #80000
2018-07-22 18:28:48,184 : INFO : PROGRESS: processing document #90000
2018-07-2

2018-07-22 18:29:35,754 : INFO : PROGRESS: processing document #1120000
2018-07-22 18:29:36,280 : INFO : PROGRESS: processing document #1130000
2018-07-22 18:29:36,646 : INFO : PROGRESS: processing document #1140000
2018-07-22 18:29:37,006 : INFO : PROGRESS: processing document #1150000
2018-07-22 18:29:37,365 : INFO : PROGRESS: processing document #1160000
2018-07-22 18:29:37,731 : INFO : PROGRESS: processing document #1170000
2018-07-22 18:29:38,097 : INFO : PROGRESS: processing document #1180000
2018-07-22 18:29:38,526 : INFO : PROGRESS: processing document #1190000
2018-07-22 18:29:38,932 : INFO : PROGRESS: processing document #1200000
2018-07-22 18:29:39,322 : INFO : PROGRESS: processing document #1210000
2018-07-22 18:29:39,721 : INFO : PROGRESS: processing document #1220000
2018-07-22 18:29:40,100 : INFO : PROGRESS: processing document #1230000
2018-07-22 18:29:40,616 : INFO : PROGRESS: processing document #1240000
2018-07-22 18:29:40,999 : INFO : PROGRESS: processing document #

2018-07-22 18:30:23,611 : INFO : PROGRESS: processing document #2260000
2018-07-22 18:30:24,029 : INFO : PROGRESS: processing document #2270000
2018-07-22 18:30:24,474 : INFO : PROGRESS: processing document #2280000
2018-07-22 18:30:24,959 : INFO : PROGRESS: processing document #2290000
2018-07-22 18:30:25,374 : INFO : PROGRESS: processing document #2300000
2018-07-22 18:30:25,787 : INFO : PROGRESS: processing document #2310000
2018-07-22 18:30:26,190 : INFO : PROGRESS: processing document #2320000
2018-07-22 18:30:26,597 : INFO : PROGRESS: processing document #2330000
2018-07-22 18:30:27,034 : INFO : PROGRESS: processing document #2340000
2018-07-22 18:30:27,416 : INFO : PROGRESS: processing document #2350000
2018-07-22 18:30:27,810 : INFO : PROGRESS: processing document #2360000
2018-07-22 18:30:28,208 : INFO : PROGRESS: processing document #2370000
2018-07-22 18:30:28,739 : INFO : PROGRESS: processing document #2380000
2018-07-22 18:30:29,265 : INFO : PROGRESS: processing document #

2018-07-22 18:31:36,202 : INFO : PROGRESS: processing document #3400000
2018-07-22 18:31:37,139 : INFO : PROGRESS: processing document #3410000
2018-07-22 18:31:37,937 : INFO : PROGRESS: processing document #3420000
2018-07-22 18:31:38,810 : INFO : PROGRESS: processing document #3430000
2018-07-22 18:31:39,705 : INFO : PROGRESS: processing document #3440000
2018-07-22 18:31:40,322 : INFO : PROGRESS: processing document #3450000
2018-07-22 18:31:40,801 : INFO : PROGRESS: processing document #3460000
2018-07-22 18:31:41,231 : INFO : PROGRESS: processing document #3470000
2018-07-22 18:31:41,699 : INFO : PROGRESS: processing document #3480000
2018-07-22 18:31:42,108 : INFO : PROGRESS: processing document #3490000
2018-07-22 18:31:42,484 : INFO : PROGRESS: processing document #3500000
2018-07-22 18:31:42,867 : INFO : PROGRESS: processing document #3510000
2018-07-22 18:31:43,300 : INFO : PROGRESS: processing document #3520000
2018-07-22 18:31:43,801 : INFO : PROGRESS: processing document #

2018-07-22 18:32:31,480 : INFO : PROGRESS: processing document #4540000
2018-07-22 18:32:32,936 : INFO : PROGRESS: processing document #4550000
2018-07-22 18:32:34,944 : INFO : PROGRESS: processing document #4560000
2018-07-22 18:32:36,355 : INFO : PROGRESS: processing document #4570000
2018-07-22 18:32:37,426 : INFO : PROGRESS: processing document #4580000
2018-07-22 18:32:38,072 : INFO : PROGRESS: processing document #4590000
2018-07-22 18:32:39,310 : INFO : PROGRESS: processing document #4600000
2018-07-22 18:32:39,816 : INFO : PROGRESS: processing document #4610000
2018-07-22 18:32:40,255 : INFO : PROGRESS: processing document #4620000
2018-07-22 18:32:40,686 : INFO : PROGRESS: processing document #4630000
2018-07-22 18:32:41,111 : INFO : PROGRESS: processing document #4640000
2018-07-22 18:32:41,987 : INFO : PROGRESS: processing document #4650000
2018-07-22 18:32:43,217 : INFO : PROGRESS: processing document #4660000
2018-07-22 18:32:44,231 : INFO : PROGRESS: processing document #

2018-07-22 18:33:32,971 : INFO : PROGRESS: processing document #5680000
2018-07-22 18:33:33,428 : INFO : PROGRESS: processing document #5690000
2018-07-22 18:33:33,907 : INFO : PROGRESS: processing document #5700000
2018-07-22 18:33:34,321 : INFO : PROGRESS: processing document #5710000
2018-07-22 18:33:34,738 : INFO : PROGRESS: processing document #5720000
2018-07-22 18:33:35,137 : INFO : PROGRESS: processing document #5730000
2018-07-22 18:33:35,663 : INFO : PROGRESS: processing document #5740000
2018-07-22 18:33:36,166 : INFO : PROGRESS: processing document #5750000
2018-07-22 18:33:36,628 : INFO : PROGRESS: processing document #5760000
2018-07-22 18:33:37,166 : INFO : PROGRESS: processing document #5770000
2018-07-22 18:33:37,749 : INFO : PROGRESS: processing document #5780000
2018-07-22 18:33:38,450 : INFO : PROGRESS: processing document #5790000
2018-07-22 18:33:39,239 : INFO : PROGRESS: processing document #5800000
2018-07-22 18:33:39,774 : INFO : PROGRESS: processing document #

2018-07-22 18:34:32,165 : INFO : PROGRESS: processing document #6820000
2018-07-22 18:34:32,548 : INFO : PROGRESS: processing document #6830000
2018-07-22 18:34:32,923 : INFO : PROGRESS: processing document #6840000
2018-07-22 18:34:33,289 : INFO : PROGRESS: processing document #6850000
2018-07-22 18:34:33,663 : INFO : PROGRESS: processing document #6860000
2018-07-22 18:34:34,031 : INFO : PROGRESS: processing document #6870000
2018-07-22 18:34:34,402 : INFO : PROGRESS: processing document #6880000
2018-07-22 18:34:34,770 : INFO : PROGRESS: processing document #6890000
2018-07-22 18:34:35,151 : INFO : PROGRESS: processing document #6900000
2018-07-22 18:34:35,500 : INFO : PROGRESS: processing document #6910000
2018-07-22 18:34:35,862 : INFO : PROGRESS: processing document #6920000
2018-07-22 18:34:36,246 : INFO : PROGRESS: processing document #6930000
2018-07-22 18:34:36,612 : INFO : PROGRESS: processing document #6940000
2018-07-22 18:34:36,981 : INFO : PROGRESS: processing document #

2018-07-22 18:35:18,855 : INFO : PROGRESS: processing document #7960000
2018-07-22 18:35:19,221 : INFO : PROGRESS: processing document #7970000
2018-07-22 18:35:19,573 : INFO : PROGRESS: processing document #7980000
2018-07-22 18:35:19,924 : INFO : PROGRESS: processing document #7990000
2018-07-22 18:35:20,289 : INFO : PROGRESS: processing document #8000000
2018-07-22 18:35:20,639 : INFO : PROGRESS: processing document #8010000
2018-07-22 18:35:21,005 : INFO : PROGRESS: processing document #8020000
2018-07-22 18:35:21,340 : INFO : PROGRESS: processing document #8030000
2018-07-22 18:35:21,688 : INFO : PROGRESS: processing document #8040000
2018-07-22 18:35:22,071 : INFO : PROGRESS: processing document #8050000
2018-07-22 18:35:22,576 : INFO : PROGRESS: processing document #8060000
2018-07-22 18:35:22,952 : INFO : PROGRESS: processing document #8070000
2018-07-22 18:35:23,303 : INFO : PROGRESS: processing document #8080000
2018-07-22 18:35:23,641 : INFO : PROGRESS: processing document #

In [11]:
def getIdfFeature():
    """Aggregate the IDF of query terms that occur in each listed passage.

    Reads the module-level ``qrels``, ``query_text``, ``collections``,
    ``dictionary`` and ``tfidfmodel``.

    For every query term the contribution is ``tfidfmodel.idfs[term_id]`` when
    the term is a passage token and is known to ``dictionary``; otherwise 0.
    A query with no terms after preprocessing contributes a single 0.

    Returns:
        defaultdict(tuple) mapping ``(qid, docid, 1)`` to
        ``(sum_idf, max_idf, min_idf)``. The third key component is always the
        constant 1.
    """
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        query_list = preprocess(query_text[qid])

        for docid in docid_list:
            # Split the passage once per document (not once per query term) and use
            # a set so each membership test is O(1).
            doc_terms = set(collections[docid].split())

            score_list = []
            for query_term in query_list:
                term_id = dictionary.token2id.get(query_term) if query_term in doc_terms else None
                score_list.append(tfidfmodel.idfs[term_id] if term_id is not None else 0)
            if not score_list:
                # keeps max()/min() defined for an empty query
                score_list.append(0)

            scores[(qid, docid, 1)] = (sum(score_list), max(score_list), min(score_list))

    return scores

In [12]:
def getTfFeature():
    """Aggregate raw term frequencies of the query terms in each listed passage.

    Reads the module-level ``qrels``, ``query_text`` and ``collections``.

    For every query term the contribution is the number of times it appears
    among the passage's whitespace-separated tokens (0 when absent). A query
    with no terms after preprocessing contributes a single 0.

    Returns:
        defaultdict(tuple) mapping ``(qid, docid, 1)`` to
        ``(sum_tf, max_tf, min_tf)`` -- no mean is included. The third key
        component is always the constant 1.
    """
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        query_list = preprocess(query_text[qid])
        for docid in docid_list:
            # Split the passage once per document rather than once per query term.
            doctext = collections[docid].split()
            # list.count() already yields 0 for an absent term, so no separate
            # membership test is needed; `or [0]` keeps max()/min() defined.
            score_list = [doctext.count(query_term) for query_term in query_list] or [0]

            scores[(qid, docid, 1)] = (sum(score_list), max(score_list), min(score_list))

    return scores

In [13]:
def getTfIdfFeature():
    """Aggregate TF-IDF weights of the query terms in each listed passage.

    Reads the module-level ``qrels``, ``query_text``, ``collections``,
    ``dictionary`` and ``tfidfmodel``.

    The passage is lower-cased, split on whitespace and transformed with
    ``tfidfmodel``; each query term contributes its weight in that vector, or
    0. when the term is unknown to ``dictionary`` or absent from the vector.
    A query with no terms after preprocessing contributes a single 0.

    Returns:
        defaultdict(tuple) mapping ``(qid, docid, 1)`` to
        ``(sum_tfidf, max_tfidf, min_tfidf, mean_tfidf)``. The third key
        component is always the constant 1.
    """
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        query_list = preprocess(query_text[qid])
        # Token ids depend only on the query, so resolve them once per query.
        query_term_ids = [dictionary.token2id.get(term) for term in query_list]

        for docid in docid_list:
            doctext = collections[docid].lower().split()

            # Index the passage's TF-IDF vector by token id so each query term is a
            # dict lookup instead of a scan; setdefault keeps the first weight seen
            # for an id, like the original first-match lookup.
            weight_by_id = {}
            for term_id, weight in tfidfmodel[dictionary.doc2bow(doctext)]:
                weight_by_id.setdefault(term_id, weight)

            score_list = [
                weight_by_id.get(term_id, 0.) if term_id is not None else 0.
                for term_id in query_term_ids
            ] or [0.]  # keeps max()/min() defined for an empty query

            sum_tfidf = sum(score_list)
            max_tfidf = max(score_list)
            min_tfidf = min(score_list)
            # An empty passage used to raise ZeroDivisionError, which a bare except
            # turned into mean 0. followed by a pdb prompt; handle it explicitly.
            # NOTE(review): the mean is taken over passage tokens, not query terms --
            # confirm that normalisation is intended.
            mean_tfidf = sum_tfidf / len(doctext) if doctext else 0.

            scores[(qid, docid, 1)] = (sum_tfidf, max_tfidf, min_tfidf, mean_tfidf)

    return scores

In [6]:
def getQueryLenFeature():
    """Record the preprocessed query length for every listed (query, passage) pair.

    Reads the module-level ``qrels`` and ``query_text``.

    Returns:
        defaultdict(tuple) mapping ``(qid, docid, 1)`` to the number of tokens
        ``preprocess`` yields for the query (an int, even though the default
        factory is ``tuple``). The third key component is always the constant 1.
    """
    lengths = defaultdict(tuple)
    for qid, candidates in qrels.items():
        n_terms = len(preprocess(query_text[qid]))
        lengths.update(((qid, docid, 1), n_terms) for docid in candidates)
    return lengths

In [8]:
import pickle

# Combining all the features
    
bm25_scores = getBM25Feature()
# Write through a context manager so the file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle to the garbage collector.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\bm25_features_eval_15.p", "wb") as fw:
    pickle.dump(bm25_scores, fw)

# doclen_scores = getDocLengthFeature()
# pickle.dump( doclen_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\doclen_features_eval_15.p", "wb" ) )

# coverage_scores = getCoverageFeature()
# pickle.dump( coverage_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\coverage_features_eval_15.p", "wb" ) )

# idf_scores = getIdfFeature()
# pickle.dump( idf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\idf_features_eval_15.p", "wb" ) )

# tf_scores = getTfFeature()
# pickle.dump( tf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tf_features_eval_15.p", "wb" ) )

# tfidf_scores = getTfIdfFeature()
# pickle.dump( tfidf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tfidf_features_eval_15.p", "wb" ) )

# querylen_scores = getQueryLenFeature()
# pickle.dump( querylen_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\querylen_features_eval_15.p", "wb" ) )