In [1]:
import sys
from importlib import reload
import logging
import csv
from nltk.stem.porter import PorterStemmer
import string
import gensim
import pickle
from gensim.summarization.bm25 import BM25


porterStemmer = PorterStemmer()
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def removeNonAscii(s):
    return "".join(filter(lambda x:ord(x) < 128, s))


def preprocess(passage):
    clean_passage = passage.lower()
    clean_passage = removeNonAscii(clean_passage)
    clean_passage = clean_passage.translate(str.maketrans('', '', string.punctuation)) 
    
    clean_passage_list = clean_passage.split()
    clean_passage_list_stem = []
    for word in clean_passage_list:
        if word not in gensim.parsing.preprocessing.STOPWORDS:
            clean_passage_list_stem.append(porterStemmer.stem(word))
    return clean_passage_list_stem



In [2]:
from collections import defaultdict
import csv

In [3]:
collections = defaultdict(str)
with open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\collection_cleaned.tsv") as fr:
    reader = csv.reader(fr, delimiter="\t")
    for row in reader:
        collections[int(row[0])] = row[1]

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.tsv'

In [4]:
queryFile = "C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\top1000.eval.tsv"
query_text = defaultdict(str)
qrels = defaultdict(list)
with open(queryFile, 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        query_text[int(row[0])] = row[2]
        qrels[int(row[0])].append(int(row[1]))

In [5]:
print(len(query_text))
print(len(qrels))

6837
6837


In [6]:
import pickle

with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\model\\bm25.pkl", 'rb') as fr:
    bm25 = pickle.load(fr)
    
average_idf = sum(map(lambda k: float(bm25.idf[k]), bm25.idf.keys())) / len(bm25.idf.keys()) 

In [7]:
def getBM25Feature():
    # key: (qid, docid, rel)
    # value: score 
    scores = defaultdict(float)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)
        for docid in docid_list:
            score = bm25.get_score(query_list, docid, average_idf)
            scores[(qid, docid, 1)] = score
            
    return scores

In [8]:
def getDocLengthFeature():
    # key: (qid, docid, rel)
    # value: doc length
    scores = defaultdict(float)
    for qid, docid_list in qrels.items():
        for docid in docid_list:
            score = len(collections[docid].split())
            scores[(qid, docid, 1)] = score

    return scores

In [9]:
def getCoverageFeature():
    # key: (qid, docid, rel)
    # value: score -> contains query term coverage and ratio
    zero_length_queries = 0
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)
        for docid in docid_list:
            score = len(set(query_list).intersection(set(collections[docid].split())))
            scores[(qid, docid, 1)] = (score, float(score)/(len(query_list) + 1))
            if len(query_list) == 0:
                zero_length_queries += 1
    
    print (zero_length_queries)
    
    return scores

In [None]:
def getAndFeature():
    # key: (qid, docid, rel)
    # value: score -> contains query term converage and ratio
    scores = defaultdict(float)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)
        for docid in docid_list:
            terms = collections[docid].split()
            andScore = set(query_list).issubset(terms)
            scores[(qid, docid, 1)] = 1 if andScore else 0
            
    return scores 

In [None]:
def getAnd2Feature():
    # key: (qid, docid, rel)
    # value: score -> contains query term converage and ratio
    scores = defaultdict(float)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)
        for docid in docid_list:
            terms = collections[docid].split()
            total_word_count = 0
            all_verbs_found = True
            for query in query_list:
                word_count = sum(term == query for term in terms)
                if word_count == 0:
                    all_verbs_found = False
                    break
                else:
                    total_word_count = total_word_count + word_count
                    
            scores[(qid, docid, 1)] = total_word_count if all_verbs_found else 0
        
    return scores

In [10]:
from gensim.models import TfidfModel
from gensim import corpora
from gensim.corpora import Dictionary

dictionary = corpora.Dictionary.load("C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.dict")

class MyCorpus(object):
    def __iter__(self):
        with open('C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.tsv') as fr:
            reader = csv.reader(fr, delimiter = "\t")
            for row in reader:
                # assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(row[1].lower().split())

tfidfmodel = TfidfModel(MyCorpus())

2018-07-23 18:06:45,944 : INFO : loading Dictionary object from C:\Users\mapyredd\Documents\marco\data\collection_cleaned.dict
2018-07-23 18:06:47,950 : INFO : loaded C:\Users\mapyredd\Documents\marco\data\collection_cleaned.dict
2018-07-23 18:06:47,966 : INFO : collecting document frequencies
2018-07-23 18:06:47,995 : INFO : PROGRESS: processing document #0
2018-07-23 18:06:48,351 : INFO : PROGRESS: processing document #10000
2018-07-23 18:06:48,698 : INFO : PROGRESS: processing document #20000
2018-07-23 18:06:49,298 : INFO : PROGRESS: processing document #30000
2018-07-23 18:06:49,613 : INFO : PROGRESS: processing document #40000
2018-07-23 18:06:50,051 : INFO : PROGRESS: processing document #50000
2018-07-23 18:06:50,530 : INFO : PROGRESS: processing document #60000
2018-07-23 18:06:51,305 : INFO : PROGRESS: processing document #70000
2018-07-23 18:06:52,016 : INFO : PROGRESS: processing document #80000
2018-07-23 18:06:52,928 : INFO : PROGRESS: processing document #90000
2018-07-2

2018-07-23 18:07:41,922 : INFO : PROGRESS: processing document #1120000
2018-07-23 18:07:42,286 : INFO : PROGRESS: processing document #1130000
2018-07-23 18:07:42,654 : INFO : PROGRESS: processing document #1140000
2018-07-23 18:07:43,023 : INFO : PROGRESS: processing document #1150000
2018-07-23 18:07:43,402 : INFO : PROGRESS: processing document #1160000
2018-07-23 18:07:43,775 : INFO : PROGRESS: processing document #1170000
2018-07-23 18:07:44,140 : INFO : PROGRESS: processing document #1180000
2018-07-23 18:07:44,540 : INFO : PROGRESS: processing document #1190000
2018-07-23 18:07:44,904 : INFO : PROGRESS: processing document #1200000
2018-07-23 18:07:45,356 : INFO : PROGRESS: processing document #1210000
2018-07-23 18:07:45,725 : INFO : PROGRESS: processing document #1220000
2018-07-23 18:07:46,104 : INFO : PROGRESS: processing document #1230000
2018-07-23 18:07:46,542 : INFO : PROGRESS: processing document #1240000
2018-07-23 18:07:47,177 : INFO : PROGRESS: processing document #

2018-07-23 18:08:31,795 : INFO : PROGRESS: processing document #2260000
2018-07-23 18:08:32,179 : INFO : PROGRESS: processing document #2270000
2018-07-23 18:08:32,564 : INFO : PROGRESS: processing document #2280000
2018-07-23 18:08:32,949 : INFO : PROGRESS: processing document #2290000
2018-07-23 18:08:33,347 : INFO : PROGRESS: processing document #2300000
2018-07-23 18:08:33,712 : INFO : PROGRESS: processing document #2310000
2018-07-23 18:08:34,101 : INFO : PROGRESS: processing document #2320000
2018-07-23 18:08:34,498 : INFO : PROGRESS: processing document #2330000
2018-07-23 18:08:34,866 : INFO : PROGRESS: processing document #2340000
2018-07-23 18:08:35,282 : INFO : PROGRESS: processing document #2350000
2018-07-23 18:08:35,652 : INFO : PROGRESS: processing document #2360000
2018-07-23 18:08:36,015 : INFO : PROGRESS: processing document #2370000
2018-07-23 18:08:36,469 : INFO : PROGRESS: processing document #2380000
2018-07-23 18:08:36,853 : INFO : PROGRESS: processing document #

2018-07-23 18:09:23,092 : INFO : PROGRESS: processing document #3400000
2018-07-23 18:09:23,476 : INFO : PROGRESS: processing document #3410000
2018-07-23 18:09:23,846 : INFO : PROGRESS: processing document #3420000
2018-07-23 18:09:24,240 : INFO : PROGRESS: processing document #3430000
2018-07-23 18:09:24,609 : INFO : PROGRESS: processing document #3440000
2018-07-23 18:09:24,990 : INFO : PROGRESS: processing document #3450000
2018-07-23 18:09:25,369 : INFO : PROGRESS: processing document #3460000
2018-07-23 18:09:25,737 : INFO : PROGRESS: processing document #3470000
2018-07-23 18:09:26,116 : INFO : PROGRESS: processing document #3480000
2018-07-23 18:09:26,514 : INFO : PROGRESS: processing document #3490000
2018-07-23 18:09:26,882 : INFO : PROGRESS: processing document #3500000
2018-07-23 18:09:27,267 : INFO : PROGRESS: processing document #3510000
2018-07-23 18:09:27,647 : INFO : PROGRESS: processing document #3520000
2018-07-23 18:09:28,013 : INFO : PROGRESS: processing document #

2018-07-23 18:10:13,616 : INFO : PROGRESS: processing document #4540000
2018-07-23 18:10:14,018 : INFO : PROGRESS: processing document #4550000
2018-07-23 18:10:14,449 : INFO : PROGRESS: processing document #4560000
2018-07-23 18:10:14,834 : INFO : PROGRESS: processing document #4570000
2018-07-23 18:10:15,219 : INFO : PROGRESS: processing document #4580000
2018-07-23 18:10:15,604 : INFO : PROGRESS: processing document #4590000
2018-07-23 18:10:15,973 : INFO : PROGRESS: processing document #4600000
2018-07-23 18:10:16,389 : INFO : PROGRESS: processing document #4610000
2018-07-23 18:10:16,837 : INFO : PROGRESS: processing document #4620000
2018-07-23 18:10:17,226 : INFO : PROGRESS: processing document #4630000
2018-07-23 18:10:17,612 : INFO : PROGRESS: processing document #4640000
2018-07-23 18:10:17,975 : INFO : PROGRESS: processing document #4650000
2018-07-23 18:10:18,359 : INFO : PROGRESS: processing document #4660000
2018-07-23 18:10:18,729 : INFO : PROGRESS: processing document #

2018-07-23 18:11:02,684 : INFO : PROGRESS: processing document #5680000
2018-07-23 18:11:03,554 : INFO : PROGRESS: processing document #5690000
2018-07-23 18:11:04,152 : INFO : PROGRESS: processing document #5700000
2018-07-23 18:11:04,800 : INFO : PROGRESS: processing document #5710000
2018-07-23 18:11:05,461 : INFO : PROGRESS: processing document #5720000
2018-07-23 18:11:06,108 : INFO : PROGRESS: processing document #5730000
2018-07-23 18:11:06,709 : INFO : PROGRESS: processing document #5740000
2018-07-23 18:11:07,372 : INFO : PROGRESS: processing document #5750000
2018-07-23 18:11:07,984 : INFO : PROGRESS: processing document #5760000
2018-07-23 18:11:08,500 : INFO : PROGRESS: processing document #5770000
2018-07-23 18:11:08,917 : INFO : PROGRESS: processing document #5780000
2018-07-23 18:11:09,318 : INFO : PROGRESS: processing document #5790000
2018-07-23 18:11:09,856 : INFO : PROGRESS: processing document #5800000
2018-07-23 18:11:10,256 : INFO : PROGRESS: processing document #

2018-07-23 18:11:53,015 : INFO : PROGRESS: processing document #6820000
2018-07-23 18:11:53,416 : INFO : PROGRESS: processing document #6830000
2018-07-23 18:11:53,797 : INFO : PROGRESS: processing document #6840000
2018-07-23 18:11:54,162 : INFO : PROGRESS: processing document #6850000
2018-07-23 18:11:54,546 : INFO : PROGRESS: processing document #6860000
2018-07-23 18:11:54,931 : INFO : PROGRESS: processing document #6870000
2018-07-23 18:11:55,312 : INFO : PROGRESS: processing document #6880000
2018-07-23 18:11:55,694 : INFO : PROGRESS: processing document #6890000
2018-07-23 18:11:56,064 : INFO : PROGRESS: processing document #6900000
2018-07-23 18:11:56,448 : INFO : PROGRESS: processing document #6910000
2018-07-23 18:11:56,817 : INFO : PROGRESS: processing document #6920000
2018-07-23 18:11:57,217 : INFO : PROGRESS: processing document #6930000
2018-07-23 18:11:57,595 : INFO : PROGRESS: processing document #6940000
2018-07-23 18:11:57,979 : INFO : PROGRESS: processing document #

2018-07-23 18:12:42,820 : INFO : PROGRESS: processing document #7960000
2018-07-23 18:12:43,189 : INFO : PROGRESS: processing document #7970000
2018-07-23 18:12:43,587 : INFO : PROGRESS: processing document #7980000
2018-07-23 18:12:43,965 : INFO : PROGRESS: processing document #7990000
2018-07-23 18:12:44,337 : INFO : PROGRESS: processing document #8000000
2018-07-23 18:12:44,706 : INFO : PROGRESS: processing document #8010000
2018-07-23 18:12:45,087 : INFO : PROGRESS: processing document #8020000
2018-07-23 18:12:45,437 : INFO : PROGRESS: processing document #8030000
2018-07-23 18:12:45,807 : INFO : PROGRESS: processing document #8040000
2018-07-23 18:12:46,153 : INFO : PROGRESS: processing document #8050000
2018-07-23 18:12:46,538 : INFO : PROGRESS: processing document #8060000
2018-07-23 18:12:46,890 : INFO : PROGRESS: processing document #8070000
2018-07-23 18:12:47,237 : INFO : PROGRESS: processing document #8080000
2018-07-23 18:12:47,592 : INFO : PROGRESS: processing document #

In [11]:
def getIdfFeature():
    # key: (qid, docid, rel)
    # value: idf -> contains sum, max, min
    
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)

        for docid in docid_list:
            score_list = []
            if len(query_list) == 0:
                score_list.append(0)
            for query_term in query_list:
                doctext = collections[docid].split()
                if query_term in doctext:
                    if dictionary.token2id.get(query_term) != None:
                        term_id = dictionary.token2id[query_term]
                        score_list.append(tfidfmodel.idfs[term_id])
                    else:
                        score_list.append(0)
                else:
                    score_list.append(0)
                    
            sum_idf = sum(score_list)
            max_idf = max(score_list)
            min_idf = min(score_list)

            scores[(qid, docid, 1)] = (sum_idf, max_idf, min_idf)
            
    return scores

In [12]:
def getTfFeature():
    # key: (qid, docid, rel)
    # value: tf -> contains sum, max, min, mean
    
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)          
        for docid in docid_list:
            score_list = []
            if len(query_list) == 0:
                score_list.append(0)
            for query_term in query_list:
                doctext = collections[docid].split()
                if query_term in doctext:
                    score_list.append(doctext.count(query_term))
                else:
                    score_list.append(0)

            sum_tf = sum(score_list)
            max_tf = max(score_list)
            min_tf = min(score_list)
            scores[(qid, docid, 1)] = (sum_tf, max_tf, min_tf)

    return scores

In [13]:
def getTfIdfFeature():
    # key: (qid, docid, rel)
    # value: tf -> contains sum, max, min, mean
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)
        for docid in docid_list:
            score_list = []
            if len(query_list) == 0:
                score_list.append(0.)              
            doctext = collections[docid].lower().split()
            doc_vector = tfidfmodel[dictionary.doc2bow(doctext)]            
            for query_term in query_list:
                if dictionary.token2id.get(query_term) != None:
                    s = [t[1] for t in doc_vector if t[0] == dictionary.token2id.get(query_term)]
                    if len(s) > 0:
                        score_list.append(s[0])
                    else:
                        score_list.append(0.)
                else:
                    score_list.append(0.)
            
            try:
                sum_tfidf = sum(score_list)
                max_tfidf = max(score_list)
                min_tfidf = min(score_list)
                mean_tfidf = sum_tfidf/len(doctext)
            except:
                mean_tfidf = 0.
                import pdb;pdb.set_trace()
                
            scores[(qid, docid, 1)] = (sum_tfidf, max_tfidf, min_tfidf,mean_tfidf)


    return scores

In [14]:
def getQueryLenFeature():
    # key: (qid, docid, rel)
    # value: query len
    scores = defaultdict(tuple)
    for qid, docid_list in qrels.items():
        qtext = query_text[qid]
        query_list = preprocess(qtext)
        for docid in docid_list:
            scores[(qid, docid, 1)] = len(query_list)

    return scores

In [15]:
import pickle

# Combining all the features
    
bm25_scores = getBM25Feature()
pickle.dump( bm25_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\bm25_features_eval_15.p", "wb" ) )

doclen_scores = getDocLengthFeature()
pickle.dump( doclen_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\doclen_features_eval_15.p", "wb" ) )

coverage_scores = getCoverageFeature()
pickle.dump( coverage_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\coverage_features_eval_15.p", "wb" ) )

idf_scores = getIdfFeature()
pickle.dump( idf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\idf_features_eval_15.p", "wb" ) )

tf_scores = getTfFeature()
pickle.dump( tf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tf_features_eval_15.p", "wb" ) )

tfidf_scores = getTfIdfFeature()
pickle.dump( tfidf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tfidf_features_eval_15.p", "wb" ) )

# querylen_scores = getQueryLenFeature()
# pickle.dump( querylen_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\querylen_features_eval.p", "wb" ) )

and_scores = getAndFeature()
pickle.dump(and_scores, open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\and_features_eval.p", "wb"))

and2_scores = getAnd2Feature()
pickle.dump(and2_scores, open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\and2_features_eval.p", "wb"))

15
