In [30]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
import time

import base64
import xmltodict

In [31]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [32]:
documents_by_id = {}
es.indices.delete(index='myandex')
es.indices.create(index='myandex')

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'myandex'}

In [36]:
def processFile(i):
    prefix = '../byweb_for_course/byweb.'
    suffix = '.xml'
    filename = prefix + str(i) + suffix
    with open(filename, 'rb') as f:
        decoded = f.read().decode('cp1251')
        xmldict = xmltodict.parse(decoded)
        for doc in tqdm(xmldict['romip:dataset']['document']):
            try:
                docID = doc['docID']
                documents_by_id[docID] = {}
                url = base64.b64decode(doc['docURL']).decode('cp1251')
                content = base64.b64decode(doc['content']['#text']).decode('cp1251')
                documents_by_id[docID]['url'] = url
                documents_by_id[docID]['content'] = content
            except Exception as e:
                print(e)

In [37]:
for i in range(1): # FIX
    processFile(i)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [38]:
settings_final = {
    'mappings': {
        'properties': {
            'url': {
                'type': 'text'
            },
            'content': {
                'type': 'text',
                "analyzer": "my_custom_analyzer"
            }
        }
    },
    "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type":      "custom", 
          "tokenizer": "standard",
          "char_filter": [
            "html_strip",
            "yont"
          ],
          "filter": [
            "lowercase",
            #"asciifolding",
            "russian_snow",
            "english_snow"
          ]
        }
      },
        'char_filter': {
                'yont': {
                    'type': 'mapping',
                    'mappings': [
                        'ё => е',
                        'Ё => Е'
                    ]
                }
            },
    'filter': {
            'stop_words': {
                'type': 'stop',
                'stopwords': [
                ]
            },
            'russian_snow': {
                'type': 'snowball',
                'language': 'russian'
            },
            'english_snow': {
                'type': 'snowball',
                'language': 'english'
            }
     }
    }
  }
}

In [39]:
def recreate_index():
    es.indices.delete(index='myandex')
    es.indices.create(index='myandex', body=settings_final)

In [40]:
recreate_index()

In [41]:
def check_analyzer(analyzer, text):
    body = analyzer
    body['text'] = text
    
    tokens = es.indices.analyze(index='myandex', body=body)['tokens']
    tokens = [token_info['token'] for token_info in tokens]
    return tokens

In [42]:
analyzer = {
    'analyzer': 'my_custom_analyzer'
}

check_analyzer(analyzer, '<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"> bla bla русский countable текст Ёшкин кот')

['bla', 'bla', 'русск', 'countabl', 'текст', 'ешкин', 'кот']

In [43]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

In [44]:
def es_actions_generator():
    for doc_id, doc in tqdm(documents_by_id.items()):
        yield create_es_action('myandex', doc_id, doc)

In [45]:
start = time.time()
for ok, result in parallel_bulk(es, es_actions_generator(), queue_size=4, thread_count=4, chunk_size=1000):
    if not ok:
        print(result)
end = time.time()
print(f"Time on index creation: {time.strftime('%H:%M:%S.%l', time.gmtime(end - start))}")
print(f"In seconds: {end - start}")

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


Time on index creation: 00:00:01.12
In seconds: 1.4075696468353271


In [46]:
def search(query, *args):
    pretty_print_result(es.search(index='myandex', body=query, size=100), args)
    # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20

def raw_search(query):
    search_result = es.search(index='myandex', body=query, size=100)['hits']
    return [(hit['_id'], hit['_score']) for hit in search_result['hits']]
    
def pretty_print_result(search_result, fields=[]):
    # fields is a list of fields names which we want to be printed
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits'][:6]:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(doc_id):
    return es.get(index='myandex', id=doc_id)['_source']

In [47]:
def get_query(query):
    return {
    'query': {
        'bool': {
            'should': {
                'match': {
                    'content': query
                }
            }
        }
    }
    }

q = get_query('<meta http-equiv="Content-Type" content="text/html; charset=windows-1251">')
search(q)
raw_search(q)

Total documents: 0


[]

In [48]:
def load_queries_and_relevance():
    relevance = defaultdict(dict)
    filename = '../relevant_table_2009.xml'
    with open(filename, 'rb') as f:
        xmldict = xmltodict.parse(f.read())
        for task in tqdm(xmldict['taskDocumentMatrix']['task']):
            task_rel = {}
            has_vital = False
            for doc in task['document']:
                if doc['@relevance'] == 'vital':
                    has_vital = True
                task_rel[doc['@id']] = doc['@relevance']
            if has_vital:
                relevance[task['@id']] = task_rel
    filename = '../web2008_adhoc.xml'
    with open(filename, 'rb') as f:
        xmldict = xmltodict.parse(f.read())
        for task in tqdm(xmldict['task-set']['task']):
            if task['@id'] in relevance:
                relevance[task['@id']]['querytext'] = task['querytext']
    return relevance

In [49]:
relevance = load_queries_and_relevance()

HBox(children=(IntProgress(value=0, max=547), HTML(value='')))




HBox(children=(IntProgress(value=0, max=29231), HTML(value='')))




In [50]:
import numpy as np


def get_number_of_correct_out_of_k(results, task_relevance, k):
    return sum([1 if res[0] in task_relevance and task_relevance[res[0]] == 'vital' else 0 for res in results[:k]])

def measure_performance():    
    Q = len(relevance)
    mrr = 0
    ndcg_mean = 0
    for task in relevance.keys():
        task_relevance = relevance[task]
        results = raw_search(get_query(task_relevance['querytext']))
        idx = get_rank_indicies(results)
        results = results[idx]
        reciprocal_rank = 0
        ndcg = 0
        for i, res in enumerate(results):
            rel_i = 0
            if res[0] in task_relevance and task_relevance[res[0]] == 'vital':
                reciprocal_rank = 1 / (i + 1)
                rel_i = 1
            ndcg += rel_i / (np.log2(i + 2))
        normalizer = 0
        for i in range(get_number_of_correct_out_of_k(results, task_relevance, 100)):
            normalizer += 1 / (np.log2(i + 2))
        ndcg_mean += ndcg / normalizer if normalizer != 0 else 0
        mrr += reciprocal_rank
    print(f"ndcg@100: {mrr / Q}")
    print(f"mrr@100: {ndcg_mean / Q}")

In [51]:
def get_rank_indicies(docs):
    #TODO
    return

In [79]:
#!/usr/bin/python

import sys
import os

from collections import defaultdict
import xmltodict
import json
from tqdm import tqdm


def get_doc_id_to_url():
    docs_id_url_file_name = "./pagerank.txt"
    id_to_url = defaultdict(str)

    with open(docs_id_url_file_name) as docs_id_url_file:
        for line in tqdm(docs_id_url_file):
            tokens = line.split(" ")
            id_to_url[tokens[0]] = tokens[1]

    return id_to_url


def get_all_docs_for_train():
    docs_dir = "../../lemmatized_titles_pr_len"
    print("Getting doc id to url...")
    id_to_url = get_doc_id_to_url()
    all_docs = defaultdict(dict)

    print("Walking in docs dir...")
    for _, _, files in os.walk(docs_dir):
        for doc_filename in tqdm(files):
            try:
                with open(docs_dir + "/" + doc_filename, encoding='utf-8') as doc_file:
                    doc_id = doc_filename[4:]
                    doc_url = id_to_url[doc_id]
                    doc = json.load(doc_file)

                    doc_dict = defaultdict(str)
                    doc_dict["id"] = doc_id
                    doc_dict["url"] = doc_url
                    doc_dict["title"] = doc["title"]
                    doc_dict["pagerank"] = doc["pagerank"]
                    doc_dict["urllen"] = doc["urllen"]
                    doc_dict["doclen"] = doc["doclen"]
                    doc_dict["content"] = doc["content"]

                    all_docs[doc_url] = doc_dict
            except Exception as e:
                print(e)

    return all_docs


def get_all_docs_for_test():
    docs_dir = "../../lemmatized_titles_pr_len"
    id_to_url = get_doc_id_to_url()
    all_docs = defaultdict(dict)

    for _, _, files in os.walk(docs_dir):
        for doc_filename in tqdm(files):
            try:
                with open(docs_dir + "/" + doc_filename, encoding='utf-8') as doc_file:
                    doc_id = doc_filename[4:]
                    doc_url = id_to_url[doc_id]
                    doc = json.load(doc_file)

                    doc_dict = defaultdict(str)
                    doc_dict["id"] = doc_id
                    doc_dict["url"] = doc_url
                    doc_dict["title"] = doc["title"]
                    doc_dict["pagerank"] = doc["pagerank"]
                    doc_dict["urllen"] = doc["urllen"]
                    doc_dict["doclen"] = doc["doclen"]
                    doc_dict["content"] = doc["content"]

                    all_docs[doc_id] = doc_dict
            except Exception as e:
                print(e)

    return all_docs


def get_all_queries():
    queries_filename = "./web2008_adhoc.xml"
    all_queries = defaultdict(tuple)

    with open(queries_filename, encoding='cp1251') as queries_file:
        xml_dict = xmltodict.parse(queries_file.read())
        for task in tqdm(xml_dict['task-set']['task']):
            all_queries[task['@id']] = (task['@id'][3:], task['querytext'])

    return all_queries


def get_train_query_doc_pairs():
    relevant_table_filename = "./or_relevant-minus_table.xml"

    print("Getting all docs...")
    all_docs = get_all_docs_for_train()
    print("Getting all queries...")
    all_queries = get_all_queries()
    query_doc_pairs = []

    print("Calculating features...")
    with open(relevant_table_filename) as table_file:
        xml_dict = xmltodict.parse(table_file.read())

        for query_dict in tqdm(xml_dict['taskDocumentMatrix']['task']):
            try:
                query_id = query_dict['@id']
                query = all_queries[query_id]

                for doc_dict in query_dict['document']:
                    doc_url = doc_dict['@id']
                    relevance_str = doc_dict['@relevance']
                    relevance = 1 if relevance_str == "vital" else 0
                    doc = all_docs[doc_url]
                    query_doc_pairs.append((query, doc, relevance))
            except Exception as e:
                print(e)

    return query_doc_pairs


def get_test_query_doc_pairs(doc_ids):
    relevant_table_filename = "./relevant_table_2009.xml"

    all_docs = get_all_docs_for_test()
    all_queries = get_all_queries()
    query_doc_pairs = []

    with open(relevant_table_filename) as table_file:
        xml_dict = xmltodict.parse(table_file.read())

        for query_dict in tqdm(xml_dict['taskDocumentMatrix']['task']):
            try:
                query_id = query_dict['@id']
                print(query_id)
                query = all_queries[query_id]

                for doc_dict in query_dict['document']:
                    doc_id = doc_dict['@id']
                    if doc_id not in doc_ids[query_id]:
                        continue
                    relevance_str = doc_dict['@relevance']
                    relevance = 1 if relevance_str == "vital" else 0
                    doc = all_docs[doc_id]
                    query_doc_pairs.append((query, doc, relevance))
            except Exception as e:
                print(e)

    return query_doc_pairs


def build_features(doc_ids):
    query_doc_pairs = []

    print("Getting query doc pairs...")
    dataset_type = "test"
    if dataset_type == "train":
        query_doc_pairs = get_train_query_doc_pairs()
    elif dataset_type == "test":
        query_doc_pairs = get_test_query_doc_pairs(doc_ids)

    out_filename = dataset_type + "_generated_features_elastic.txt"

    with open(out_filename, "w") as out_file:
        for query, doc, relevance in tqdm(query_doc_pairs):
            try:
                features = []
                # calculate new_feature_value
                # features.append(new_feature_value)
                features.append(len(query[1]))
                features.append(doc["urllen"])
                features.append(doc["doclen"])
                features.append(doc["pagerank"])

                out_file.write(str(relevance) + " ")
                out_file.write("quid:" + query[0] + " ")
                for i, feature in enumerate(features, start=1):
                    out_file.write(str(i) + ":" + str(feature) + " ")
                out_file.write("\n")
            except Exception as e:
                print("Exception: ")
                print(e)

In [82]:
def measure_performance_2():    
    Q = len(relevance)
    mrr = 0
    ndcg_mean = 0
    doc_ids = {}
    for task in relevance.keys():
        task_relevance = relevance[task]
        doc_ids[task] = [pair[0] for pair in raw_search(get_query(task_relevance['querytext']))][:100]
    build_features(doc_ids)

In [83]:
measure_performance_2()

75977it [00:00, 759767.10it/s]

Getting query doc pairs...


199202it [00:00, 747846.67it/s]
100%|██████████| 29231/29231 [00:00<00:00, 817695.98it/s]
100%|██████████| 547/547 [00:00<00:00, 9120.73it/s]
0it [00:00, ?it/s]

arw57797
arw53809
arw57272
arw53808
arw57796
arw53806
arw51174
arw53801
arw50554
arw49893
arw53534
arw49692
arw58427
arw56522
arw53743
arw52490
arw59004
arw58530
arw53207
arw50642
arw51183
arw50640
arw53202
arw51903
arw50548
arw52682
arw49983
arw57994
arw59134
arw53738
arw50925
arw58439
arw52920
arw52544
'arw52544'
arw53549
'arw53549'
arw53730
arw52924
arw51197
arw52672
arw56405
'arw56405'
arw56407
arw56202
arw52478
arw56409
arw58540
'arw58540'
arw50530
arw54686
arw53317
arw52379
arw52570
arw57298
arw58403
arw53926
arw53823
arw52912
arw52578
arw52373
'arw52373'
arw54689
arw54481
arw52678
arw53726
arw52472
'arw52472'
arw50122
'arw50122'
arw52470
arw50529
'arw50529'
arw52662
arw53913
arw49782
arw53029
arw57161
arw54690
arw53811
'arw53811'
arw52561
arw53610
arw53320
'arw53320'
arw54499
arw54498
arw54495
arw53718
arw50276
arw52568
arw56218
arw50280
arw51835
'arw51835'
arw53844
arw52592
arw50695
arw50892
arw53700
arw50384
arw50690
'arw50690'
arw49936
arw58766
arw53701
'arw53701'
arw50972
ar




In [85]:
raw_search(get_query("я"))

[('396447', 2.398802)]

In [100]:
from sklearn.feature_selection import mutual_info_classif
def print_feature_importance():
    train = './train_generated_features.txt'
    with open(train) as train_features:
        lines = train_features.readlines()
        X = np.zeros((len(lines), 4))
        y = np.zeros(len(lines))
        for i, line in enumerate(lines):
            parts = line.split("\n")[0].split(" ")
            X[i][0] = float(parts[2].split(":")[1])
            X[i][1] = float(parts[3].split(":")[1])
            X[i][2] = float(parts[4].split(":")[1])
            X[i][3] = float(parts[5].split(":")[1])
            y[i] = parts[0]
    print(mutual_info_classif(X, y))

In [101]:
print_feature_importance()

[ 0.0336692   0.00565359  0.02178088  0.04768679]
