In [14]:
import sys
from importlib import reload
import logging
import csv
from nltk.stem.porter import PorterStemmer
import string
import gensim
import pickle
from gensim.summarization.bm25 import BM25


# Single shared Porter stemmer; preprocess() calls porterStemmer.stem() on every kept word.
porterStemmer = PorterStemmer()
# Configure root logging at INFO level with "<time> : <level> : <message>" records.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def removeNonAscii(s):
    """Return ``s`` with every character whose code point is 128 or higher removed."""
    ascii_only = [ch for ch in s if ord(ch) < 128]
    return "".join(ascii_only)


def preprocess(passage):
    """Turn raw text into a list of stemmed, stopword-free tokens.

    Steps, in order: lowercase the text, drop non-ASCII characters, delete
    every character in ``string.punctuation``, split on whitespace, discard
    tokens found in gensim's ``STOPWORDS`` set, and Porter-stem the rest.

    Returns:
        list of stems in their original order (duplicates are kept).
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalized = removeNonAscii(passage.lower()).translate(strip_punctuation)
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    # The stopword test runs on the unstemmed token, before stemming.
    return [
        porterStemmer.stem(token)
        for token in normalized.split()
        if token not in stopwords
    ]




In [15]:
from collections import defaultdict
import csv


In [2]:
# Passage lookup: integer passage id (column 0) -> passage text (column 1).
# Ids that are not in the file read back as "" because of defaultdict(str).
collections = defaultdict(str)
with open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\collection_cleaned.tsv") as fr:
    reader = csv.reader(fr, delimiter="\t")
    collections.update((int(fields[0]), fields[1]) for fields in reader)

In [3]:
from collections import defaultdict
import csv

# Query lookup: integer query id (column 0) -> raw query text (column 1).
# Ids that are not in the file read back as "" because of defaultdict(str).
query_text = defaultdict(str)
with open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\queries.dev.tsv", encoding="UTF8") as fr:
    reader = csv.reader(fr, delimiter="\t")
    query_text.update((int(fields[0]), fields[1]) for fields in reader)

In [4]:
# Judgment tables keyed by integer query id (column 0); each value is the list
# of integer passage ids read from column 2, in file order.
#   qrels    <- qrels.dev.tsv     (the feature functions emit these with label 1)
#   qrels_nr <- qrels_nr.dev.tsv  (the feature functions emit these with label 0)
qrels = defaultdict(list)
qrels_nr = defaultdict(list)
for judgments, tsv_path in (
    (qrels, "C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\qrels.dev.tsv"),
    (qrels_nr, "C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\qrels_nr.dev.tsv"),
):
    with open(tsv_path) as fr:
        reader = csv.reader(fr, delimiter="\t")
        for fields in reader:
            judgments[int(fields[0])].append(int(fields[2]))

In [12]:
import pickle

# NOTE(review): this path sits under the "mapyredd" user profile, while the
# data-loading cells read from "sundaras"; the recorded run of this cell
# failed with FileNotFoundError. Confirm the model location before re-running.
# NOTE(review): pickle.load can execute code stored in the file; only load a
# model file from a trusted source.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\model\\bm25.pkl", 'rb') as fr:
    bm25 = pickle.load(fr)

# Mean of the model's per-term IDF values; getBM25Feature passes it to bm25.get_score().
average_idf = sum(float(bm25.idf[term]) for term in bm25.idf.keys()) / len(bm25.idf.keys())

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\mapyredd\\Documents\\marco\\data\\model\\bm25.pkl'

In [6]:
def getBM25Feature():
    """Score every judged (query, passage) pair with the loaded BM25 model.

    Walks the module-level ``qrels`` (label 1) and then ``qrels_nr`` (label 0)
    tables. Each query's text is preprocessed once, then
    ``bm25.get_score(query_tokens, docid, average_idf)`` is called for every
    passage id listed under that query.

    Returns:
        defaultdict(float) mapping ``(qid, docid, rel)`` to the BM25 score.
    """
    # NOTE(review): docid is passed straight through as get_score's second
    # argument; presumably the model indexes passages by passage id — confirm.
    scores = defaultdict(float)
    for rel, judgments in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judgments.items():
            query_tokens = preprocess(query_text[qid])
            for docid in docid_list:
                scores[(qid, docid, rel)] = bm25.get_score(query_tokens, docid, average_idf)
    return scores

In [7]:
def getDocLengthFeature():
    """Compute passage length, in whitespace-separated tokens, for every judged pair.

    Walks the module-level ``qrels`` (label 1) and then ``qrels_nr`` (label 0)
    tables and looks each passage up in ``collections``.

    Returns:
        defaultdict(float) mapping ``(qid, docid, rel)`` to the token count.
    """
    scores = defaultdict(float)
    for rel, judgments in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judgments.items():
            for docid in docid_list:
                scores[(qid, docid, rel)] = len(collections[docid].split())
    return scores

In [8]:
def getCoverageFeature():
    """Compute query-term coverage of each judged passage.

    For every pair the value is ``(overlap, ratio)``:
      * ``overlap`` is the number of distinct preprocessed query tokens that
        also occur among the passage's whitespace-separated tokens;
      * ``ratio`` is ``overlap / (len(query_tokens) + 1)``. The denominator
        counts repeated query tokens, and the ``+ 1`` keeps it non-zero for
        queries that preprocess to nothing.

    Side effect: after the ``qrels`` pass (label 1) and before the
    ``qrels_nr`` pass (label 0), prints how many ``qrels`` pairs had an empty
    preprocessed query. The count is per (query, passage) pair, not per query.

    Returns:
        defaultdict(tuple) mapping ``(qid, docid, rel)`` to ``(overlap, ratio)``.
    """
    scores = defaultdict(tuple)

    def fill(judgments, rel):
        """Record coverage for one judgment table; return its empty-query pair count."""
        empty_query_pairs = 0
        for qid, docid_list in judgments.items():
            query_list = preprocess(query_text[qid])
            query_terms = set(query_list)
            for docid in docid_list:
                passage_terms = set(collections[docid].split())
                overlap = len(query_terms.intersection(passage_terms))
                scores[(qid, docid, rel)] = (overlap, float(overlap) / (len(query_list) + 1))
                if not query_list:
                    empty_query_pairs += 1
        return empty_query_pairs

    zero_length_queries = fill(qrels, 1)
    print (zero_length_queries)
    fill(qrels_nr, 0)
    return scores

In [5]:
def getAndFeature():
    """Compute a binary "all query terms present" feature for every judged pair.

    The value is 1 when every distinct preprocessed query token appears among
    the passage's whitespace-separated tokens, otherwise 0. A query that
    preprocesses to no tokens yields 1 for every passage, because an empty
    set is a subset of anything.

    Walks the module-level ``qrels`` (label 1) and then ``qrels_nr`` (label 0).

    Returns:
        defaultdict(float) mapping ``(qid, docid, rel)`` to 0 or 1.
    """
    scores = defaultdict(float)
    for rel, judgments in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judgments.items():
            required_terms = set(preprocess(query_text[qid]))
            for docid in docid_list:
                passage_terms = collections[docid].split()
                scores[(qid, docid, rel)] = int(required_terms.issubset(passage_terms))
    return scores
            

In [10]:
def getAnd2Feature():
    """Compute a count-weighted "all query terms present" feature for every judged pair.

    If any preprocessed query token is missing from the passage, the value is
    0. Otherwise it is the sum, over the query tokens (repeats included), of
    how many times each token occurs in the passage. A query that preprocesses
    to no tokens yields 0.

    Walks the module-level ``qrels`` (label 1) and then ``qrels_nr`` (label 0).

    Returns:
        defaultdict(float) mapping ``(qid, docid, rel)`` to the count.
    """
    scores = defaultdict(float)

    def occurrences_if_all_present(query_list, terms):
        """Total passage occurrences of the query tokens, or 0 once one is absent."""
        total = 0
        for query in query_list:
            hits = terms.count(query)
            if hits == 0:
                return 0
            total += hits
        return total

    for rel, judgments in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judgments.items():
            query_list = preprocess(query_text[qid])
            for docid in docid_list:
                terms = collections[docid].split()
                scores[(qid, docid, rel)] = occurrences_if_all_present(query_list, terms)
    return scores

In [30]:
import sys
import csv
import logging
import pandas as pd
import gensim
from gensim import corpora

# Same logging configuration (format string and INFO level) as the cell that defines preprocess().
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Raise the csv module's maximum field size to 10,000,000; presumably needed
# for very long passage fields in the collection file — TODO confirm.
csv.field_size_limit(10000000)


class MyTextCorpus(object):
    """Re-iterable, streaming view over a tab-separated corpus file.

    Every iteration re-opens ``inputFile`` and yields, for each row, the
    second column split on whitespace (a list of tokens). No other
    preprocessing is applied, and only one row is held at a time.
    """

    def __init__(self, inputFile):
        # Path of the TSV file; it is opened lazily by __iter__.
        self.inputFile = inputFile

    def __iter__(self):
        with open(self.inputFile, 'r') as handle:
            for fields in csv.reader(handle, delimiter='\t'):
                tokens = fields[1].split()
                yield tokens


# NOTE(review): these paths sit under the "mapyredd" user profile, while the
# data-loading cells read from "sundaras" — confirm which location is current
# before re-running.
inputFile = "C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.tsv"
outputFile = "C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.dict"
corpus_text_memory_friendly = MyTextCorpus(inputFile)

# Build the token -> id dictionary by streaming the corpus one document at a
# time. The corpus object is passed directly rather than wrapped in a list, so
# the whole collection is never held in memory at once. No stop-word or number
# filtering happens here; tokens are used exactly as MyTextCorpus yields them.
# NOTE(review): the recorded log shows the dictionary held at 2,000,000 tokens
# with tokens discarded on later passes — presumably gensim's default
# `prune_at` cap; confirm that losing those tokens is acceptable.
dictionary = corpora.Dictionary(corpus_text_memory_friendly)
dictionary.save(outputFile)

2018-07-21 21:44:03,664 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-07-21 21:44:05,293 : INFO : adding document #10000 to Dictionary(26896 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:06,217 : INFO : adding document #20000 to Dictionary(41470 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:07,746 : INFO : adding document #30000 to Dictionary(54299 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:09,040 : INFO : adding document #40000 to Dictionary(65605 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:10,363 : INFO : adding document #50000 to Dictionary(76310 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:11,462 : INFO : adding document #60000 to Dictionary(86163 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:13,759 : INFO : adding document #70000 to Dictiona

2018-07-21 21:44:55,658 : INFO : adding document #580000 to Dictionary(450015 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:56,499 : INFO : adding document #590000 to Dictionary(455943 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:57,846 : INFO : adding document #600000 to Dictionary(461800 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:58,882 : INFO : adding document #610000 to Dictionary(467317 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:44:59,847 : INFO : adding document #620000 to Dictionary(472992 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:45:02,889 : INFO : adding document #630000 to Dictionary(478747 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:45:04,409 : INFO : adding document #640000 to Dictionary(484486 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)

2018-07-21 21:45:54,554 : INFO : adding document #1160000 to Dictionary(757372 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:45:55,349 : INFO : adding document #1170000 to Dictionary(762525 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:45:55,969 : INFO : adding document #1180000 to Dictionary(767639 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:45:56,478 : INFO : adding document #1190000 to Dictionary(772451 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:45:56,988 : INFO : adding document #1200000 to Dictionary(777549 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:45:59,632 : INFO : adding document #1210000 to Dictionary(782563 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:46:00,176 : INFO : adding document #1220000 to Dictionary(787305 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commu

2018-07-21 21:46:44,751 : INFO : adding document #1730000 to Dictionary(1023766 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:46:45,861 : INFO : adding document #1740000 to Dictionary(1028258 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:46:46,980 : INFO : adding document #1750000 to Dictionary(1032339 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:46:48,000 : INFO : adding document #1760000 to Dictionary(1036972 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:46:49,139 : INFO : adding document #1770000 to Dictionary(1041506 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:46:50,203 : INFO : adding document #1780000 to Dictionary(1045803 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:46:51,954 : INFO : adding document #1790000 to Dictionary(1050149 unique tokens: ['achiev', 'amid', 'atom', 'cloud',

2018-07-21 21:47:40,886 : INFO : adding document #2300000 to Dictionary(1265489 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:47:41,562 : INFO : adding document #2310000 to Dictionary(1269245 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:47:42,187 : INFO : adding document #2320000 to Dictionary(1273429 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:47:42,926 : INFO : adding document #2330000 to Dictionary(1277516 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:47:43,607 : INFO : adding document #2340000 to Dictionary(1281615 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:47:44,506 : INFO : adding document #2350000 to Dictionary(1285647 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:47:45,184 : INFO : adding document #2360000 to Dictionary(1290049 unique tokens: ['achiev', 'amid', 'atom', 'cloud',

2018-07-21 21:48:26,265 : INFO : adding document #2870000 to Dictionary(1493066 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:26,706 : INFO : adding document #2880000 to Dictionary(1496990 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:27,267 : INFO : adding document #2890000 to Dictionary(1500959 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:27,832 : INFO : adding document #2900000 to Dictionary(1505005 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:28,368 : INFO : adding document #2910000 to Dictionary(1509176 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:28,978 : INFO : adding document #2920000 to Dictionary(1513108 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:29,531 : INFO : adding document #2930000 to Dictionary(1517093 unique tokens: ['achiev', 'amid', 'atom', 'cloud',

2018-07-21 21:48:58,920 : INFO : adding document #3440000 to Dictionary(1707662 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:59,274 : INFO : adding document #3450000 to Dictionary(1711060 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:48:59,712 : INFO : adding document #3460000 to Dictionary(1714728 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:00,187 : INFO : adding document #3470000 to Dictionary(1718469 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:00,851 : INFO : adding document #3480000 to Dictionary(1722181 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:01,599 : INFO : adding document #3490000 to Dictionary(1725934 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:02,369 : INFO : adding document #3500000 to Dictionary(1729522 unique tokens: ['achiev', 'amid', 'atom', 'cloud',

2018-07-21 21:49:24,482 : INFO : adding document #4010000 to Dictionary(1912314 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:24,834 : INFO : adding document #4020000 to Dictionary(1915685 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:25,172 : INFO : adding document #4030000 to Dictionary(1919141 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:25,531 : INFO : adding document #4040000 to Dictionary(1922740 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:25,890 : INFO : adding document #4050000 to Dictionary(1926212 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:26,228 : INFO : adding document #4060000 to Dictionary(1929449 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:49:26,574 : INFO : adding document #4070000 to Dictionary(1932763 unique tokens: ['achiev', 'amid', 'atom', 'cloud',

2018-07-21 21:50:05,359 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4330000 (=100.0%) documents
2018-07-21 21:50:09,262 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:50:09,329 : INFO : adding document #4330000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:50:12,846 : INFO : discarding 3391 tokens: [('solidswaterair', 1), ('conglomeratesthat', 1), ('chargeswhat', 1), ('kapenguria', 1), ('ngei', 1), ('196478', 1), ('muhoho', 1), ('karum', 1), ('michuki', 1), ('njenga', 1)]...
2018-07-21 21:50:12,847 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4340000 (=100.0%) documents
2018-07-21 21:50:15,936 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:50:15,971 : INFO : adding document #4340000 to Dictionary(2000000 

2018-07-21 21:51:07,922 : INFO : adding document #4450000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:51:09,951 : INFO : discarding 3364 tokens: [('hanscomb', 1), ('monitoringyour', 1), ('yogayeah', 1), ('bitesst', 1), ('poisoncenterfpicnorg', 1), ('farto', 1), ('quincux', 1), ('productat', 1), ('hexagonalequilater', 1), ('superhighdens', 1)]...
2018-07-21 21:51:09,952 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4460000 (=100.0%) documents
2018-07-21 21:51:12,471 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:51:12,524 : INFO : adding document #4460000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:51:15,280 : INFO : discarding 3663 tokens: [('120529', 1), ('149288', 1), ('36731', 1), ('76665', 1), ('andtruli', 1), ('ireir', 1), ('likealway', 1), ('abraza', 1), (

2018-07-21 21:52:09,870 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4580000 (=100.0%) documents
2018-07-21 21:52:12,132 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:52:12,164 : INFO : adding document #4580000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:52:14,255 : INFO : discarding 3628 tokens: [('jawmunji', 1), ('krystianzo', 1), ('nyasia', 1), ('deadthank', 1), ('headassor', 1), ('isshh', 1), ('stressorshom', 1), ('suicidefor', 1), ('nacetyl5aminosalicyl', 1), ('binarybase2bit', 1)]...
2018-07-21 21:52:14,256 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4590000 (=100.0%) documents
2018-07-21 21:52:16,637 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:52:16,671 : INFO : adding document #4590000 to Di

2018-07-21 21:53:07,317 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:53:07,348 : INFO : adding document #4700000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:53:09,404 : INFO : discarding 3378 tokens: [('rate11', 1), ('resolution10', 1), ('httpwwwongovnet', 1), ('32hen', 1), ('responsehey', 1), ('pathogencquir', 1), ('nonselfthey', 1), ('vulnerabilitiesknown', 1), ('iaik', 1), ('vu584653', 1)]...
2018-07-21 21:53:09,405 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4710000 (=100.0%) documents
2018-07-21 21:53:11,757 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:53:11,790 : INFO : adding document #4710000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:53:13,876 : INFO : discarding 36

2018-07-21 21:54:08,809 : INFO : discarding 3693 tokens: [('3739born', 1), ('imageful', 1), ('malebirth', 1), ('meadowsgend', 1), ('usoccup', 1), ('youtuberrol', 1), ('pinyata', 1), ('843k', 1), ('brigsbybear', 1), ('19781025', 1)]...
2018-07-21 21:54:08,810 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4830000 (=100.0%) documents
2018-07-21 21:54:11,420 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:54:11,457 : INFO : adding document #4830000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:54:13,952 : INFO : discarding 3631 tokens: [('communeand', 1), ('c330', 1), ('geographyencyclopeadia', 1), ('magallanesmagallan', 1), ('magellanmagellan', 1), ('httpnrtccnewsboysshowpr', 1), ('hollywoodlevel', 1), ('httpwwwtlncomshowsnewsboyshtml', 1), ('tlncom', 1), ('behzingahttpwwwyoutubecombehzinga', 1)]...
2018-07-21 21:54:13,952 

2018-07-21 21:55:08,999 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4950000 (=100.0%) documents
2018-07-21 21:55:11,559 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:55:11,597 : INFO : adding document #4950000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:55:13,975 : INFO : discarding 3577 tokens: [('fibromyalgiarel', 1), ('helpshurt', 1), ('chronicil', 1), ('chronicpain', 1), ('facebookcomwincoofficialpag', 1), ('wincofoodscomcontact', 1), ('nonrap', 1), ('040313', 1), ('kurp', 1), ('clerwood', 1)]...
2018-07-21 21:55:13,976 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 4960000 (=100.0%) documents
2018-07-21 21:55:16,266 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:55:16,300 : INFO : adding document #4

2018-07-21 21:56:08,895 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:56:08,928 : INFO : adding document #5070000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:56:11,262 : INFO : discarding 3554 tokens: [('ownersax', 1), ('incomeemploye', 1), ('organizationsani', 1), ('npoani', 1), ('notforprofitin', 1), ('termsstat', 1), ('thoughinc', 1), ('careeronprofit', 1), ('twofamiliar', 1), ('exemptdespit', 1)]...
2018-07-21 21:56:11,263 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5080000 (=100.0%) documents
2018-07-21 21:56:14,056 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:56:14,089 : INFO : adding document #5080000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:56:16,268 : INFO : discar

2018-07-21 21:57:09,661 : INFO : discarding 3767 tokens: [('rainwateraus', 1), ('dustchem', 1), ('eyesburn', 1), ('eyesurn', 1), ('allergensebmd', 1), ('tearsut', 1), ('discharge5', 1), ('itchingn', 1), ('eyewateri', 1), ('productionateri', 1)]...
2018-07-21 21:57:09,661 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5200000 (=100.0%) documents
2018-07-21 21:57:11,951 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:57:11,984 : INFO : adding document #5200000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:57:14,069 : INFO : discarding 3514 tokens: [('weavesar', 1), ('needlesth', 1), ('legsetc', 1), ('iii11274b3241', 1), ('foodbabypeopl', 1), ('4e79', 1), ('6e52', 1), ('missingmass', 1), ('disneyincub', 1), ('dragonchain', 1)]...
2018-07-21 21:57:14,070 : INFO : keeping 2000000 tokens which were in no less than 0 and no more

2018-07-21 21:58:11,888 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5320000 (=100.0%) documents
2018-07-21 21:58:14,606 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:58:14,640 : INFO : adding document #5320000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:58:16,826 : INFO : discarding 3290 tokens: [('130104', 1), ('424175217', 1), ('42717', 1), ('57755', 1), ('iftar2015', 1), ('smccu', 1), ('httpswwwfacebookcomnyciftar', 1), ('sehroiftar', 1), ('candidateswel', 1), ('williamhav', 1)]...
2018-07-21 21:58:16,827 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5330000 (=100.0%) documents
2018-07-21 21:58:19,249 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:58:19,299 : INFO : adding document #5330000 to Dictio

2018-07-21 21:59:14,782 : INFO : adding document #5440000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:59:17,348 : INFO : discarding 3681 tokens: [('weisk', 1), ('fertilizerrea', 1), ('rationsurea', 1), ('ureaoncentr', 1), ('31232', 1), ('40025', 1), ('oufit', 1), ('wwwyoutubecomwallykov', 1), ('facebookcomboo', 1), ('wwwboothedognet', 1)]...
2018-07-21 21:59:17,349 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5450000 (=100.0%) documents
2018-07-21 21:59:19,878 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:59:19,911 : INFO : adding document #5450000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 21:59:22,013 : INFO : discarding 3558 tokens: [('5pde5', 1), ('southmalawi', 1), ('zimbabweozambiqu', 1), ('californiaozambiqu', 1), ('eastmozambiqu', 1), ('southimbabw', 1),

2018-07-21 22:00:14,207 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5570000 (=100.0%) documents
2018-07-21 22:00:16,519 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:00:16,551 : INFO : adding document #5570000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:00:18,716 : INFO : discarding 3428 tokens: [('tenat', 1), ('renthen', 1), ('eitherft', 1), ('throughfter', 1), ('inspectionschedul', 1), ('originalhen', 1), ('walkthroughregardless', 1), ('staysith', 1), ('populationleas', 1), ('sorryani', 1)]...
2018-07-21 22:00:18,717 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5580000 (=100.0%) documents
2018-07-21 22:00:21,159 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:00:21,190 : INFO : adding document #558000

2018-07-21 22:01:21,505 : INFO : adding document #5690000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:01:23,593 : INFO : discarding 3402 tokens: [('managementseag', 1), ('managerdisk', 1), ('rebootnot', 1), ('contributesdear', 1), ('penaltyhowev', 1), ('beermalt', 1), ('spiritsliquor', 1), ('abcncgov', 1), ('abcncgovpermit', 1), ('againyet', 1)]...
2018-07-21 22:01:23,593 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5700000 (=100.0%) documents
2018-07-21 22:01:25,882 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:01:25,914 : INFO : adding document #5700000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:01:28,007 : INFO : discarding 3504 tokens: [('wasp43b', 1), ('hospitalbut', 1), ('privateohip', 1), ('roombathroom', 1), ('roominginwhich', 1), ('insuranceackenzi',

2018-07-21 22:02:19,079 : INFO : discarding 3545 tokens: [('6432350', 1), ('rentoul', 1), ('rangeunct', 1), ('etcecur', 1), ('rlactam', 1), ('weightyanair', 1), ('marryy', 1), ('prisonshil', 1), ('rntp', 1), ('crosssail', 1)]...
2018-07-21 22:02:19,080 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5820000 (=100.0%) documents
2018-07-21 22:02:21,506 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:02:21,539 : INFO : adding document #5820000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:02:23,633 : INFO : discarding 3763 tokens: [('2547248', 1), ('and122', 1), ('maand', 1), ('mned', 1), ('mnoth', 1), ('rejuveniq', 1), ('secgovcomplainttipscomplaintshtml', 1), ('biofunct', 1), ('fcha', 1), ('klesker', 1)]...
2018-07-21 22:02:23,634 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5830000 (=100.0%

2018-07-21 22:03:13,502 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5940000 (=100.0%) documents
2018-07-21 22:03:16,413 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:03:16,446 : INFO : adding document #5940000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:03:18,856 : INFO : discarding 3564 tokens: [('biogasioga', 1), ('bacteriaioga', 1), ('generatordigest', 1), ('mixturesinc', 1), ('biomassioga', 1), ('oxygenioga', 1), ('sourcesioga', 1), ('frac2221', 1), ('fracpi180', 1), ('fractheta360', 1)]...
2018-07-21 22:03:18,857 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 5950000 (=100.0%) documents
2018-07-21 22:03:21,227 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:03:21,317 : INFO : adding document #5950000

2018-07-21 22:04:13,295 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:04:13,328 : INFO : adding document #6060000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:04:15,409 : INFO : discarding 3832 tokens: [('yuqui', 1), ('allazaward', 1), ('azurenet', 1), ('sdkversionschema', 1), ('phmsausdot', 1), ('cilun', 1), ('idoloriginwelshth', 1), ('questsoulurg', 1), ('idollynessa', 1), ('lynessynessa', 1)]...
2018-07-21 22:04:15,410 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6070000 (=100.0%) documents
2018-07-21 22:04:17,694 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:04:17,727 : INFO : adding document #6070000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:04:20,206 : INFO : discarding 3

2018-07-21 22:05:15,240 : INFO : discarding 5663 tokens: [('handlerslodgingh', 1), ('8207501', 1), ('localizedescript', 1), ('herodiasescript', 1), ('largergreat', 1), ('taildentif', 1), ('blusish', 1), ('legsreat', 1), ('seasontatu', 1), ('ciconiiformesmerican', 1)]...
2018-07-21 22:05:15,241 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6190000 (=100.0%) documents
2018-07-21 22:05:17,575 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:05:17,609 : INFO : adding document #6190000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:05:19,805 : INFO : discarding 5797 tokens: [('appliances5', 1), ('circulationthroughout', 1), ('saltyh', 1), ('formcompar', 1), ('mwfor', 1), ('3dbm', 1), ('dbmbecaus', 1), ('pdbm', 1), ('signaldbm', 1), ('powerdbm', 1)]...
2018-07-21 22:05:19,806 : INFO : keeping 2000000 tokens which were in no less

2018-07-21 22:06:18,327 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6310000 (=100.0%) documents
2018-07-21 22:06:20,620 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:06:20,655 : INFO : adding document #6310000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:06:22,897 : INFO : discarding 5558 tokens: [('two1ou', 1), ('songehila', 1), ('praiseworthyth', 1), ('languagetehila', 1), ('tahilla', 1), ('tehilna', 1), ('tehiltaehilla', 1), ('tesillo', 1), ('farehila', 1), ('1yadahyawdawto', 1)]...
2018-07-21 22:06:22,898 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6320000 (=100.0%) documents
2018-07-21 22:06:25,211 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:06:25,244 : INFO : adding document #6320000 to Dictio

2018-07-21 22:07:16,582 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:07:16,616 : INFO : adding document #6430000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:07:18,688 : INFO : discarding 3413 tokens: [('12225ewa6', 1), ('chequecredit', 1), ('antiobam', 1), ('hansonvia', 1), ('aironc', 1), ('heardthrough', 1), ('sphygmomamomet', 1), ('slowsincreas', 1), ('httpenwikipediaorgwikicalifornialegislatur', 1), ('irishkyan', 1)]...
2018-07-21 22:07:18,689 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6440000 (=100.0%) documents
2018-07-21 22:07:20,978 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:07:21,011 : INFO : adding document #6440000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:07:2

2018-07-21 22:08:14,200 : INFO : discarding 3520 tokens: [('drawswithdraw', 1), ('18003814519', 1), ('9621440', 1), ('supportservicesokmedicalboardorg', 1), ('4829020', 1), ('20g100g', 1), ('gothamxi', 1), ('000089', 1), ('0001095', 1), ('nsm2', 1)]...
2018-07-21 22:08:14,201 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6560000 (=100.0%) documents
2018-07-21 22:08:16,500 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:08:16,533 : INFO : adding document #6560000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:08:18,670 : INFO : discarding 3522 tokens: [('scanssom', 1), ('formingook', 1), ('stonesther', 1), ('guisher', 1), ('hyunjoo', 1), ('jiwoo', 1), ('kwangho', 1), ('pilsuk', 1), ('seungwoo', 1), ('feltrinelli', 1)]...
2018-07-21 22:08:18,670 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6

2018-07-21 22:09:16,772 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6680000 (=100.0%) documents
2018-07-21 22:09:19,098 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:09:19,149 : INFO : adding document #6680000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:09:21,291 : INFO : discarding 3880 tokens: [('germinatingak', 1), ('smokesubscrib', 1), ('weekendt', 1), ('meattri', 1), ('tasteak', 1), ('organismsvalonia', 1), ('aidestyp', 1), ('minorsar', 1), ('aappam', 1), ('hereorsum', 1)]...
2018-07-21 22:09:21,292 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6690000 (=100.0%) documents
2018-07-21 22:09:23,748 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:09:23,782 : INFO : adding document #6690000 to Dictionary

2018-07-21 22:10:18,082 : INFO : adding document #6800000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:10:20,397 : INFO : discarding 3748 tokens: [('odrwv', 1), ('sponaugl', 1), ('percedpleas', 1), ('caddcam', 1), ('pyrgophoru', 1), ('somatogyru', 1), ('tryonia', 1), ('echinoidssea', 1), ('turritella', 1), ('tylostoma', 1)]...
2018-07-21 22:10:20,397 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6810000 (=100.0%) documents
2018-07-21 22:10:23,095 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:10:23,129 : INFO : adding document #6810000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:10:25,266 : INFO : discarding 3797 tokens: [('doubleblock', 1), ('fixationfus', 1), ('73580', 1), ('93342ank', 1), ('pay50468', 1), ('presidentpnc', 1), ('chicagobooth', 1), ('operationse

2018-07-21 22:11:18,449 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6930000 (=100.0%) documents
2018-07-21 22:11:20,777 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:11:20,810 : INFO : adding document #6930000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:11:23,025 : INFO : discarding 3409 tokens: [('knab', 1), ('ivorycolour', 1), ('vergano', 1), ('httpwwwbuzzlecomarticlesaerobicandanaerobicrespirationhtml', 1), ('3018e', 1), ('5958n', 1), ('249in', 1), ('6327mm', 1), ('rankki', 1), ('measuresit', 1)]...
2018-07-21 22:11:23,026 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 6940000 (=100.0%) documents
2018-07-21 22:11:25,396 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:11:25,430 : INFO : adding document 

2018-07-21 22:12:17,316 : INFO : adding document #7050000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:12:19,445 : INFO : discarding 3842 tokens: [('harpsichordur', 1), ('1130300', 1), ('aync', 1), ('vmcentric', 1), ('sume4', 1), ('createrecord', 1), ('au1971', 1), ('au58743', 1), ('leastau29696', 1), ('29431sourc', 1)]...
2018-07-21 22:12:19,445 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7060000 (=100.0%) documents
2018-07-21 22:12:21,765 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:12:21,800 : INFO : adding document #7060000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:12:24,024 : INFO : discarding 3380 tokens: [('portionsi', 1), ('softbak', 1), ('hx62', 1), ('measuressm', 1), ('preventiv', 1), ('httpswwwiidcindianaedupagesautismawarenessmonthafactsandtipsf

2018-07-21 22:13:14,184 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:13:14,220 : INFO : adding document #7170000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:13:16,318 : INFO : discarding 3516 tokens: [('canold', 1), ('compensation2', 1), ('practicehes', 1), ('straighthen', 1), ('cowskin', 1), ('264322', 1), ('364401425', 1), ('401425', 1), ('241325', 1), ('276345', 1)]...
2018-07-21 22:13:16,318 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7180000 (=100.0%) documents
2018-07-21 22:13:18,716 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:13:18,750 : INFO : adding document #7180000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:13:20,882 : INFO : discarding 3312 tokens: [('dustinguis

2018-07-21 22:14:14,720 : INFO : discarding 3000 tokens: [('llc55', 1), ('wwwsosalabamagov', 1), ('10a1911', 1), ('10a21403', 1), ('mcjohn', 1), ('breuler', 1), ('difusco', 1), ('tvzion', 1), ('s14e24', 1), ('s14e26', 1)]...
2018-07-21 22:14:14,721 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7300000 (=100.0%) documents
2018-07-21 22:14:17,041 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:14:17,076 : INFO : adding document #7300000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:14:19,240 : INFO : discarding 3001 tokens: [('brenintw', 1), ('cosmiccruis', 1), ('flights10917837', 1), ('pilotboi', 1), ('rjdxer', 1), ('woodentom', 1), ('kidnappinget', 1), ('2007oj', 1), ('wrongsimpson', 1), ('foreclosureno', 1)]...
2018-07-21 22:14:19,241 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7310000 

2018-07-21 22:15:09,143 : INFO : adding document #7410000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:15:11,780 : INFO : discarding 3332 tokens: [('handsethen', 1), ('5ore', 1), ('travelshad', 1), ('oafn', 1), ('vegetablescarbohydr', 1), ('ragazzasomeon', 1), ('tiful', 1), ('beena', 1), ('belba', 1), ('geoluhread', 1)]...
2018-07-21 22:15:11,781 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7420000 (=100.0%) documents
2018-07-21 22:15:14,936 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:15:14,978 : INFO : adding document #7420000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:15:18,098 : INFO : discarding 3321 tokens: [('011068', 1), ('6hp19', 1), ('squiddi', 1), ('iballisticsquidi', 1), ('peskey', 1), ('serviceverifi', 1), ('uspshelp', 1), ('112019625', 1), ('sor

2018-07-21 22:16:09,562 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7540000 (=100.0%) documents
2018-07-21 22:16:11,919 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:16:11,955 : INFO : adding document #7540000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:16:14,072 : INFO : discarding 3310 tokens: [('crunchesa', 1), ('hormoneshich', 1), ('caloriesadipos', 1), ('middlehealthi', 1), ('startsut', 1), ('kitchenos', 1), ('gymhich', 1), ('throwperform', 1), ('absthey', 1), ('workhich', 1)]...
2018-07-21 22:16:14,073 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7550000 (=100.0%) documents
2018-07-21 22:16:16,404 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:16:16,439 : INFO : adding document #7550000 to Dictio

2018-07-21 22:17:08,367 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:17:08,402 : INFO : adding document #7660000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:17:10,737 : INFO : discarding 3380 tokens: [('hetamin', 1), ('medshat', 1), ('sugarcarbos', 1), ('aldosteronese', 1), ('spironolactoneaminouracil', 1), ('mesodermertain', 1), ('rootepitheli', 1), ('radicleth', 1), ('reproductionherefor', 1), ('systemibr', 1)]...
2018-07-21 22:17:10,738 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7670000 (=100.0%) documents
2018-07-21 22:17:13,137 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:17:13,172 : INFO : adding document #7670000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:17:15,318 :

2018-07-21 22:18:08,079 : INFO : discarding 3317 tokens: [('bidplanroom', 1), ('emailemailcarrabbascom', 1), ('clubwear', 1), ('eunhe', 1), ('klpga', 1), ('untilwilson', 1), ('womensprofession', 1), ('worldgolfcomgolf', 1), ('wpga', 1), ('youthgolf', 1)]...
2018-07-21 22:18:08,080 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7790000 (=100.0%) documents
2018-07-21 22:18:10,430 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:18:10,469 : INFO : adding document #7790000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:18:12,595 : INFO : discarding 3306 tokens: [('66015', 1), ('67813', 1), ('69972', 1), ('acario', 1), ('zacharyah', 1), ('zackri', 1), ('6741it', 1), ('islandmor', 1), ('refugeshank', 1), ('379it', 1)]...
2018-07-21 22:18:12,596 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7800000 

2018-07-21 22:19:04,962 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7910000 (=100.0%) documents
2018-07-21 22:19:07,342 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:19:07,377 : INFO : adding document #7910000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:19:09,588 : INFO : discarding 3363 tokens: [('pollutionsmog', 1), ('oceanhug', 1), ('walletsbecaus', 1), ('shelfsaf', 1), ('feesirst', 1), ('privateit', 1), ('gamblet', 1), ('clienthes', 1), ('disputedsom', 1), ('hearingft', 1)]...
2018-07-21 22:19:09,589 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 7920000 (=100.0%) documents
2018-07-21 22:19:11,930 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:19:11,966 : INFO : adding document #7920000 to Dictionary

2018-07-21 22:20:05,318 : INFO : adding document #8030000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:20:07,493 : INFO : discarding 2812 tokens: [('capitafar', 1), ('countriesnorway', 1), ('3353a', 1), ('3540a', 1), ('4079e', 1), ('capitac', 1), ('scorpiof', 1), ('specimenarnet', 1), ('typeemston', 1), ('monthompar', 1)]...
2018-07-21 22:20:07,494 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8040000 (=100.0%) documents
2018-07-21 22:20:09,848 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:20:09,884 : INFO : adding document #8040000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:20:12,050 : INFO : discarding 2736 tokens: [('45euro', 1), ('pinkutk', 1), ('embosom', 1), ('dohann', 1), ('dnoua', 1), ('douaner', 1), ('douanier', 1), ('petour', 1), ('msgclassic', 1), ('

2018-07-21 22:21:04,342 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8160000 (=100.0%) documents
2018-07-21 22:21:06,695 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:21:06,731 : INFO : adding document #8160000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:21:08,889 : INFO : discarding 3003 tokens: [('813759', 1), ('liaringlass', 1), ('selfwalmart', 1), ('handsonhistori', 1), ('buildyourjacketcom', 1), ('reibrand', 1), ('871m', 1), ('vishvaraj', 1), ('ovivo', 1), ('455001', 1)]...
2018-07-21 22:21:08,890 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8170000 (=100.0%) documents
2018-07-21 22:21:11,254 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:21:11,289 : INFO : adding document #8170000 to Dictionary(20

2018-07-21 22:22:03,449 : INFO : adding document #8280000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:22:05,631 : INFO : discarding 3286 tokens: [('degalz', 1), ('6515015217976', 1), ('restricct', 1), ('p1d', 1), ('2050the', 1), ('10112669', 1), ('4222574475', 1), ('3411b1', 1), ('httpwwwdolgovdoltopicbenefitsleavefmlahtm', 1), ('minutesladl', 1)]...
2018-07-21 22:22:05,632 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8290000 (=100.0%) documents
2018-07-21 22:22:07,992 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:22:08,027 : INFO : adding document #8290000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:22:10,171 : INFO : discarding 3268 tokens: [('refinin', 1), ('jrfree', 1), ('wwwgalileebiblecamporg', 1), ('christfocus', 1), ('aaaah', 1), ('itemalbummount', 1),

2018-07-21 22:23:02,583 : INFO : discarding 3448 tokens: [('makemytripcomswitzerland', 1), ('davosklost', 1), ('4819190', 1), ('boltic', 1), ('sinivi', 1), ('oyema', 1), ('iochav', 1), ('httpwwwfuntriviacomaskftquestion31647htmlth', 1), ('httpblogstechnetcombnetworkingarchive20081209dhcpshowsthependingupdateiconevenafterthearecordisaddedtodnsaspx', 1), ('disrip', 1)]...
2018-07-21 22:23:02,584 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8410000 (=100.0%) documents
2018-07-21 22:23:04,984 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:23:05,030 : INFO : adding document #8410000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:23:07,215 : INFO : discarding 3340 tokens: [('paintcheck', 1), ('aspectswhen', 1), ('complicatedsom', 1), ('designhen', 1), ('ideasak', 1), ('professionalsnow', 1), ('budgetefurbish', 1), ('yourselfl

2018-07-21 22:23:58,420 : INFO : discarding 3305 tokens: [('diphoshph', 1), ('lubusz', 1), ('pietruszkaepa', 1), ('xhtmlhtml', 1), ('cuerpocuatro', 1), ('lengthhalf', 1), ('recorri', 1), ('legislationcirt', 1), ('memberstrust', 1), ('processaragraph', 1)]...
2018-07-21 22:23:58,421 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8530000 (=100.0%) documents
2018-07-21 22:24:02,110 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:24:02,173 : INFO : adding document #8530000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:24:05,417 : INFO : discarding 3068 tokens: [('mamaaugust', 1), ('httpwwwacronymfindercommilitaryandgovernmentscarhtml', 1), ('producepack', 1), ('aubarachnoid', 1), ('035040', 1), ('mlmin1', 1), ('systemsubarachnoid', 1), ('subaracchnoid', 1), ('10k12k14k18k', 1), ('7121988', 1)]...
2018-07-21 22:24:05,418 : INF

2018-07-21 22:24:55,915 : INFO : discarding 3312 tokens: [('electrodetrictli', 1), ('perpetratora', 1), ('2a2middl', 1), ('warshipul', 1), ('opticsreadi', 1), ('anticoagulantsacetaminophen', 1), ('threelength', 1), ('elucidateverb', 1), ('jnan', 1), ('nosqlfrom', 1)]...
2018-07-21 22:24:55,915 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8650000 (=100.0%) documents
2018-07-21 22:24:58,295 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:24:58,331 : INFO : adding document #8650000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:25:00,779 : INFO : discarding 3223 tokens: [('uscist', 1), ('visasoth', 1), ('abroadalidityexpir', 1), ('validity2', 1), ('14413906', 1), ('clearbuy', 1), ('daysblast', 1), ('protectionarg', 1), ('shopvacuum', 1), ('cleanlinesso', 1)]...
2018-07-21 22:25:00,780 : INFO : keeping 2000000 tokens which w

2018-07-21 22:25:56,398 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8770000 (=100.0%) documents
2018-07-21 22:25:58,789 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:25:58,824 : INFO : adding document #8770000 to Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:26:01,525 : INFO : discarding 3039 tokens: [('fuyuanshui', 1), ('guoyou', 1), ('huadu', 1), ('huoxiu', 1), ('flippedlick', 1), ('autorotation6hen', 1), ('orientationnstruct', 1), ('backhelp', 1), ('questionheck', 1), ('automaticallyheck', 1)]...
2018-07-21 22:26:01,526 : INFO : keeping 2000000 tokens which were in no less than 0 and no more than 8780000 (=100.0%) documents
2018-07-21 22:26:04,396 : INFO : resulting dictionary: Dictionary(2000000 unique tokens: ['achiev', 'amid', 'atom', 'cloud', 'commun']...)
2018-07-21 22:26:04,435 : INFO : adding document #8780000

In [9]:
from gensim.models import TfidfModel
from gensim import corpora
from gensim.corpora import Dictionary

# Load the previously saved gensim Dictionary from disk. It is used below both to
# turn token lists into bag-of-words vectors (doc2bow) and to look up term ids
# (token2id) when computing IDF features.
# NOTE(review): absolute, user-specific Windows path -- consider a configurable data dir.
dictionary = corpora.Dictionary.load("C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.dict")

class MyCorpus(object):
    """Re-iterable, streaming bag-of-words view of the cleaned passage collection.

    Every iteration re-opens the TSV file and yields one
    ``dictionary.doc2bow(...)`` result per row, so the collection never has
    to be held in memory. The passage text is taken from the second column
    and is assumed to hold one document per line with whitespace-separated
    tokens.
    """

    def __iter__(self):
        source_path = 'C:\\Users\\mapyredd\\Documents\\marco\\data\\collection_cleaned.tsv'
        with open(source_path) as handle:
            for record in csv.reader(handle, delimiter="\t"):
                # Column 0 is not used here; column 1 carries the passage text.
                tokens = record[1].lower().split()
                yield dictionary.doc2bow(tokens)

# Fit the TF-IDF model by streaming the whole corpus once; per the log output below this
# pass collects document frequencies. The resulting `tfidfmodel.idfs` mapping
# (term id -> idf) is what getIdfFeature() reads.
tfidfmodel = TfidfModel(MyCorpus())

2018-07-22 15:54:56,511 : INFO : loading Dictionary object from C:\Users\mapyredd\Documents\marco\data\collection_cleaned.dict
2018-07-22 15:54:59,314 : INFO : loaded C:\Users\mapyredd\Documents\marco\data\collection_cleaned.dict
2018-07-22 15:54:59,329 : INFO : collecting document frequencies
2018-07-22 15:54:59,341 : INFO : PROGRESS: processing document #0
2018-07-22 15:55:00,370 : INFO : PROGRESS: processing document #10000
2018-07-22 15:55:01,817 : INFO : PROGRESS: processing document #20000
2018-07-22 15:55:02,965 : INFO : PROGRESS: processing document #30000
2018-07-22 15:55:04,041 : INFO : PROGRESS: processing document #40000
2018-07-22 15:55:05,070 : INFO : PROGRESS: processing document #50000
2018-07-22 15:55:05,988 : INFO : PROGRESS: processing document #60000
2018-07-22 15:55:07,246 : INFO : PROGRESS: processing document #70000
2018-07-22 15:55:08,517 : INFO : PROGRESS: processing document #80000
2018-07-22 15:55:09,143 : INFO : PROGRESS: processing document #90000
2018-07-2

2018-07-22 15:57:09,147 : INFO : PROGRESS: processing document #1120000
2018-07-22 15:57:09,531 : INFO : PROGRESS: processing document #1130000
2018-07-22 15:57:09,931 : INFO : PROGRESS: processing document #1140000
2018-07-22 15:57:10,349 : INFO : PROGRESS: processing document #1150000
2018-07-22 15:57:11,081 : INFO : PROGRESS: processing document #1160000
2018-07-22 15:57:11,464 : INFO : PROGRESS: processing document #1170000
2018-07-22 15:57:12,056 : INFO : PROGRESS: processing document #1180000
2018-07-22 15:57:13,271 : INFO : PROGRESS: processing document #1190000
2018-07-22 15:57:15,857 : INFO : PROGRESS: processing document #1200000
2018-07-22 15:57:16,284 : INFO : PROGRESS: processing document #1210000
2018-07-22 15:57:16,665 : INFO : PROGRESS: processing document #1220000
2018-07-22 15:57:17,866 : INFO : PROGRESS: processing document #1230000
2018-07-22 15:57:18,737 : INFO : PROGRESS: processing document #1240000
2018-07-22 15:57:19,569 : INFO : PROGRESS: processing document #

2018-07-22 15:58:49,359 : INFO : PROGRESS: processing document #2260000
2018-07-22 15:58:49,696 : INFO : PROGRESS: processing document #2270000
2018-07-22 15:58:50,054 : INFO : PROGRESS: processing document #2280000
2018-07-22 15:58:50,419 : INFO : PROGRESS: processing document #2290000
2018-07-22 15:58:50,853 : INFO : PROGRESS: processing document #2300000
2018-07-22 15:58:51,214 : INFO : PROGRESS: processing document #2310000
2018-07-22 15:58:51,575 : INFO : PROGRESS: processing document #2320000
2018-07-22 15:58:51,966 : INFO : PROGRESS: processing document #2330000
2018-07-22 15:58:52,374 : INFO : PROGRESS: processing document #2340000
2018-07-22 15:58:52,783 : INFO : PROGRESS: processing document #2350000
2018-07-22 15:58:53,236 : INFO : PROGRESS: processing document #2360000
2018-07-22 15:58:53,610 : INFO : PROGRESS: processing document #2370000
2018-07-22 15:58:54,014 : INFO : PROGRESS: processing document #2380000
2018-07-22 15:58:54,556 : INFO : PROGRESS: processing document #

2018-07-22 15:59:37,061 : INFO : PROGRESS: processing document #3400000
2018-07-22 15:59:37,444 : INFO : PROGRESS: processing document #3410000
2018-07-22 15:59:37,921 : INFO : PROGRESS: processing document #3420000
2018-07-22 15:59:38,420 : INFO : PROGRESS: processing document #3430000
2018-07-22 15:59:38,793 : INFO : PROGRESS: processing document #3440000
2018-07-22 15:59:39,155 : INFO : PROGRESS: processing document #3450000
2018-07-22 15:59:39,538 : INFO : PROGRESS: processing document #3460000
2018-07-22 15:59:39,908 : INFO : PROGRESS: processing document #3470000
2018-07-22 15:59:40,328 : INFO : PROGRESS: processing document #3480000
2018-07-22 15:59:40,707 : INFO : PROGRESS: processing document #3490000
2018-07-22 15:59:41,076 : INFO : PROGRESS: processing document #3500000
2018-07-22 15:59:41,505 : INFO : PROGRESS: processing document #3510000
2018-07-22 15:59:41,873 : INFO : PROGRESS: processing document #3520000
2018-07-22 15:59:42,241 : INFO : PROGRESS: processing document #

2018-07-22 16:00:28,727 : INFO : PROGRESS: processing document #4540000
2018-07-22 16:00:29,155 : INFO : PROGRESS: processing document #4550000
2018-07-22 16:00:29,553 : INFO : PROGRESS: processing document #4560000
2018-07-22 16:00:29,909 : INFO : PROGRESS: processing document #4570000
2018-07-22 16:00:30,322 : INFO : PROGRESS: processing document #4580000
2018-07-22 16:00:30,751 : INFO : PROGRESS: processing document #4590000
2018-07-22 16:00:31,125 : INFO : PROGRESS: processing document #4600000
2018-07-22 16:00:31,538 : INFO : PROGRESS: processing document #4610000
2018-07-22 16:00:31,951 : INFO : PROGRESS: processing document #4620000
2018-07-22 16:00:32,346 : INFO : PROGRESS: processing document #4630000
2018-07-22 16:00:32,706 : INFO : PROGRESS: processing document #4640000
2018-07-22 16:00:33,122 : INFO : PROGRESS: processing document #4650000
2018-07-22 16:00:33,510 : INFO : PROGRESS: processing document #4660000
2018-07-22 16:00:33,906 : INFO : PROGRESS: processing document #

2018-07-22 16:01:23,537 : INFO : PROGRESS: processing document #5680000
2018-07-22 16:01:23,922 : INFO : PROGRESS: processing document #5690000
2018-07-22 16:01:24,336 : INFO : PROGRESS: processing document #5700000
2018-07-22 16:01:24,815 : INFO : PROGRESS: processing document #5710000
2018-07-22 16:01:25,194 : INFO : PROGRESS: processing document #5720000
2018-07-22 16:01:25,583 : INFO : PROGRESS: processing document #5730000
2018-07-22 16:01:25,955 : INFO : PROGRESS: processing document #5740000
2018-07-22 16:01:26,334 : INFO : PROGRESS: processing document #5750000
2018-07-22 16:01:26,736 : INFO : PROGRESS: processing document #5760000
2018-07-22 16:01:27,211 : INFO : PROGRESS: processing document #5770000
2018-07-22 16:01:27,620 : INFO : PROGRESS: processing document #5780000
2018-07-22 16:01:28,050 : INFO : PROGRESS: processing document #5790000
2018-07-22 16:01:28,485 : INFO : PROGRESS: processing document #5800000
2018-07-22 16:01:28,867 : INFO : PROGRESS: processing document #

2018-07-22 16:02:22,935 : INFO : PROGRESS: processing document #6820000
2018-07-22 16:02:23,631 : INFO : PROGRESS: processing document #6830000
2018-07-22 16:02:24,336 : INFO : PROGRESS: processing document #6840000
2018-07-22 16:02:25,048 : INFO : PROGRESS: processing document #6850000
2018-07-22 16:02:25,771 : INFO : PROGRESS: processing document #6860000
2018-07-22 16:02:26,428 : INFO : PROGRESS: processing document #6870000
2018-07-22 16:02:27,194 : INFO : PROGRESS: processing document #6880000
2018-07-22 16:02:27,737 : INFO : PROGRESS: processing document #6890000
2018-07-22 16:02:28,242 : INFO : PROGRESS: processing document #6900000
2018-07-22 16:02:28,796 : INFO : PROGRESS: processing document #6910000
2018-07-22 16:02:29,303 : INFO : PROGRESS: processing document #6920000
2018-07-22 16:02:29,784 : INFO : PROGRESS: processing document #6930000
2018-07-22 16:02:30,257 : INFO : PROGRESS: processing document #6940000
2018-07-22 16:02:30,733 : INFO : PROGRESS: processing document #

2018-07-22 16:03:17,635 : INFO : PROGRESS: processing document #7960000
2018-07-22 16:03:17,975 : INFO : PROGRESS: processing document #7970000
2018-07-22 16:03:18,340 : INFO : PROGRESS: processing document #7980000
2018-07-22 16:03:18,692 : INFO : PROGRESS: processing document #7990000
2018-07-22 16:03:19,090 : INFO : PROGRESS: processing document #8000000
2018-07-22 16:03:19,448 : INFO : PROGRESS: processing document #8010000
2018-07-22 16:03:19,807 : INFO : PROGRESS: processing document #8020000
2018-07-22 16:03:20,146 : INFO : PROGRESS: processing document #8030000
2018-07-22 16:03:20,496 : INFO : PROGRESS: processing document #8040000
2018-07-22 16:03:20,841 : INFO : PROGRESS: processing document #8050000
2018-07-22 16:03:21,179 : INFO : PROGRESS: processing document #8060000
2018-07-22 16:03:21,527 : INFO : PROGRESS: processing document #8070000
2018-07-22 16:03:21,862 : INFO : PROGRESS: processing document #8080000
2018-07-22 16:03:22,197 : INFO : PROGRESS: processing document #

In [10]:
def _idf_summary(query_terms, docid):
    """Return ``(sum, max, min)`` of per-query-term IDF values for one passage.

    A query term contributes ``tfidfmodel.idfs[term_id]`` when it occurs in the
    whitespace-split passage text AND is present in ``dictionary.token2id``;
    otherwise it contributes 0. An empty query yields ``(0, 0, 0)``.
    """
    if not query_terms:
        # Keep max()/min() well-defined for queries that preprocess to nothing.
        term_idfs = [0]
    else:
        # Tokenise the passage once per (query, passage) pair instead of once per
        # query term, and use a set so each membership test is O(1).
        # NOTE(review): the passage is not lower-cased here, whereas
        # getTfIdfFeature() lower-cases it -- confirm the collection is already
        # normalised.
        doc_tokens = set(collections[docid].split())
        term_idfs = []
        for term in query_terms:
            term_id = dictionary.token2id.get(term) if term in doc_tokens else None
            term_idfs.append(0 if term_id is None else tfidfmodel.idfs[term_id])
    return (sum(term_idfs), max(term_idfs), min(term_idfs))


def getIdfFeature():
    """Compute IDF-based features for every judged (query, passage) pair.

    Reads the module-level ``qrels`` (label 1) and ``qrels_nr`` (label 0)
    mappings of ``qid -> [docid, ...]``, preprocesses each query text with
    ``preprocess`` and scores it against each listed passage.

    Returns:
        defaultdict(tuple): key ``(qid, docid, rel)`` where ``rel`` is 1 for
        pairs from ``qrels`` and 0 for pairs from ``qrels_nr``; value
        ``(sum_idf, max_idf, min_idf)`` over the query terms.
    """
    scores = defaultdict(tuple)
    # Pairs from qrels are processed first, then qrels_nr, so insertion order
    # matches the original two-loop implementation.
    for rel_label, judgments in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judgments.items():
            query_list = preprocess(query_text[qid])
            for docid in docid_list:
                scores[(qid, docid, rel_label)] = _idf_summary(query_list, docid)
    return scores

In [11]:
def _tf_summary(query_terms, doc_tokens):
    """Return ``(sum, max, min)`` of the per-query-term counts in ``doc_tokens``.

    An empty query yields ``(0, 0, 0)``: a single 0 stands in for the missing
    counts so that ``max()``/``min()`` stay defined.
    """
    # list.count() already returns 0 for an absent term, so no separate
    # membership test is needed.
    counts = [doc_tokens.count(term) for term in query_terms] or [0]
    return sum(counts), max(counts), min(counts)


def getTfFeature():
    """Compute raw term-frequency features for every judged (query, passage) pair.

    Reads the module-level ``qrels`` and ``qrels_nr`` mappings (qid -> list of
    docids), ``query_text`` (qid -> query string) and ``collections``
    (docid -> passage string). Queries go through ``preprocess``; passages are
    only whitespace-split.
    # NOTE(review): this presumably relies on the passages already being
    # normalised the same way as the queries -- confirm.

    Returns:
        defaultdict keyed by ``(qid, docid, rel)`` where ``rel`` is 1 for pairs
        taken from ``qrels`` and 0 for pairs taken from ``qrels_nr``. Each
        value is the tuple ``(sum_tf, max_tf, min_tf)`` over the query terms.
        No mean is returned.
    """
    scores = defaultdict(tuple)
    for rel, judgments in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judgments.items():
            query_list = preprocess(query_text[qid])
            for docid in docid_list:
                # Tokenise the passage once per pair rather than once per
                # query term. Skip the lookup for empty queries so a
                # defaultdict-backed ``collections`` is not touched needlessly.
                doctext = collections[docid].split() if query_list else []
                scores[(qid, docid, rel)] = _tf_summary(query_list, doctext)
    return scores

In [12]:
def _tfidf_summary(query_terms, doc_tokens):
    """Return ``(sum, max, min, mean)`` of the query terms' TF-IDF weights.

    The weights come from ``tfidfmodel[dictionary.doc2bow(doc_tokens)]``. A
    query term that is missing from ``dictionary`` or from the passage
    contributes 0.0. The mean is the sum divided by the number of passage
    tokens (not the number of query terms), and is 0.0 for an empty passage.
    """
    # Index the passage vector by term id once instead of scanning it for
    # every query term. setdefault keeps the first weight seen for an id.
    weight_by_id = {}
    for term_id, weight in tfidfmodel[dictionary.doc2bow(doc_tokens)]:
        weight_by_id.setdefault(term_id, weight)

    weights = []
    for term in query_terms:
        term_id = dictionary.token2id.get(term)
        if term_id is None:
            weights.append(0.0)
        else:
            weights.append(weight_by_id.get(term_id, 0.0))
    if not weights:
        # Empty query: keep max()/min() defined.
        weights.append(0.0)

    total = sum(weights)
    # Guard the division: an empty passage used to raise ZeroDivisionError.
    mean = total / len(doc_tokens) if doc_tokens else 0.0
    return total, max(weights), min(weights), mean


def getTfIdfFeature():
    """Compute TF-IDF features for every judged (query, passage) pair.

    Reads the module-level ``qrels`` and ``qrels_nr`` mappings (qid -> list of
    docids), ``query_text``, ``collections``, ``dictionary`` and
    ``tfidfmodel``. Queries go through ``preprocess``; passages are
    lower-cased and whitespace-split.

    Returns:
        defaultdict keyed by ``(qid, docid, rel)`` where ``rel`` is 1 for pairs
        taken from ``qrels`` and 0 for pairs taken from ``qrels_nr``. Each
        value is ``(sum_tfidf, max_tfidf, min_tfidf, mean_tfidf)``.
    # NOTE(review): mean_tfidf is normalised by passage length, as in the
    # original code -- confirm that this is the intended denominator.
    """
    scores = defaultdict(tuple)
    for rel, judgments in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judgments.items():
            query_list = preprocess(query_text[qid])
            for docid in docid_list:
                doctext = collections[docid].lower().split()
                scores[(qid, docid, rel)] = _tfidf_summary(query_list, doctext)
    return scores

In [11]:
def getQueryLengthFeature():
    """Record the preprocessed query length for every judged (query, passage) pair.

    Reads the module-level ``qrels`` and ``qrels_nr`` mappings (qid -> list of
    docids) and ``query_text``; each query is tokenised with ``preprocess``.

    Returns:
        defaultdict keyed by ``(qid, docid, rel)`` where ``rel`` is 1 for pairs
        taken from ``qrels`` and 0 for pairs taken from ``qrels_nr``. Each
        value is the number of tokens ``preprocess`` produced for the query.
    """
    scores = defaultdict(tuple)
    for rel_label, judged in ((1, qrels), (0, qrels_nr)):
        for qid, docid_list in judged.items():
            n_terms = len(preprocess(query_text[qid]))
            # Every passage judged for this query shares the same length value.
            scores.update(((qid, docid, rel_label), n_terms) for docid in docid_list)
    return scores

In [11]:
import pickle

# Combining all the features
#
# Each feature extractor is run and its result pickled to disk. The
# commented-out blocks are kept as a record of the calls for the other
# feature files.
# NOTE(review): presumably these were run earlier to produce the pickles that
# the cells below load -- confirm before deleting them.

# bm25_scores = getBM25Feature()
# pickle.dump( bm25_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\bm25_features_dev.p", "wb" ) )

# doclen_scores = getDocLengthFeature()
# pickle.dump( doclen_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\doclen_features_dev.p", "wb" ) )

# coverage_scores = getCoverageFeature()
# pickle.dump( coverage_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\coverage_features_dev.p", "wb" ) )

# idf_scores = getIdfFeature()
# pickle.dump( idf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\idf_features_dev.p", "wb" ) )

# tf_scores = getTfFeature()
# pickle.dump( tf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tf_features_dev.p", "wb" ) )

# tfidf_scores = getTfIdfFeature()
# pickle.dump( tfidf_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tfidf_features_dev.p", "wb" ) )

# querylen_scores = getQueryLengthFeature()
# pickle.dump( querylen_scores, open( "C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\querylen_features_dev.p", "wb" ))

# Use context managers so each pickle file is flushed and closed as soon as
# it has been written, instead of leaving the handle open.
and_scores = getAndFeature()
with open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\and_features_dev.p", "wb") as fw:
    pickle.dump(and_scores, fw)

and2_scores = getAnd2Feature()
with open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\and2_features_dev.p", "wb") as fw:
    pickle.dump(and2_scores, fw)

In [None]:
import pickle
# Load the pickled BM25 features; the context manager closes the file handle
# once loading is done.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\bm25_features_dev.p", "rb") as fr:
    bm25 = pickle.load(fr)
len(bm25)

In [4]:
# Load the pickled document-length features; the context manager closes the
# file handle once loading is done.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\doclen_features_dev.p", "rb") as fr:
    doclen = pickle.load(fr)
len(doclen)

111397

In [5]:
# Load the pickled coverage features; the context manager closes the file
# handle once loading is done.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\coverage_features_dev.p", "rb") as fr:
    coverage = pickle.load(fr)
len(coverage)

111397

In [6]:
# Load the pickled IDF features; the context manager closes the file handle
# once loading is done.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\idf_features_dev.p", "rb") as fr:
    idf = pickle.load(fr)
len(idf)

111397

In [7]:
# Load the pickled term-frequency features; the context manager closes the
# file handle once loading is done.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tf_features_dev.p", "rb") as fr:
    tf = pickle.load(fr)
len(tf)

111397

In [8]:
# Load the pickled TF-IDF features; the context manager closes the file
# handle once loading is done.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\tfidf_features_dev.p", "rb") as fr:
    tfidf = pickle.load(fr)
len(tfidf)

111397

In [20]:
# Load the pickled query-length features; the context manager closes the
# file handle once loading is done.
with open("C:\\Users\\mapyredd\\Documents\\marco\\data\\LTR\\querylen_features_dev.p", "rb") as fr:
    querylen = pickle.load(fr)
len(querylen)

111397

In [8]:
# Load the pickled AND features. They are bound to their own name rather
# than `querylen`, so the query-length features are not overwritten by a
# different feature set. The context manager closes the file handle.
with open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\and_features_dev.p", "rb") as fr:
    and_scores = pickle.load(fr)
len(and_scores)

111397

In [12]:
# Load the pickled AND2 features. They are bound to their own name rather
# than `querylen`, so the query-length features are not overwritten by a
# different feature set. The context manager closes the file handle.
with open("C:\\Users\\sundaras\\AnacondaProjects\\OneML\\data\\and2_features_dev.p", "rb") as fr:
    and2_scores = pickle.load(fr)
len(and2_scores)

111397