In [1]:
import os
import operator
import codecs

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords as sw

from Normalizer import Normalizer

from Parser import ParsedDocumentFormat, filter_text, normalize_text

In [2]:
# Single shared Normalizer instance: passed to normalize_text() when the
# queries are loaded and used for the stop-word list inside TfIdfRanker.
normalizer = Normalizer()

In [3]:
# Root directory holding the parsed documents (relative to the notebook's
# working directory).
DATA_PATH = "../data2/"
# Names of the entries directly under DATA_PATH, in whatever order
# os.listdir returns them.
folders = os.listdir(DATA_PATH)

In [4]:
# Map integer document id -> path of its parsed file.
# Document files live one level below DATA_PATH and are named after their id.
docs_parsed_paths = dict()

for folder_name in folders:
    folder_path = os.path.join(DATA_PATH, folder_name)
    if not os.path.isdir(folder_path):
        # A plain file sitting next to the data folders cannot be listed.
        continue

    for file_name in os.listdir(folder_path):
        try:
            doc_id = int(file_name)
        except ValueError:
            # Not a document file (hidden/system files, editor checkpoints, ...).
            continue

        docs_parsed_paths[doc_id] = os.path.join(folder_path, file_name)

In [5]:
# Candidate (QueryId, DocumentId) pairs that have to be ranked.
sample = pd.read_csv("../sample.csv")
sample.head()

Unnamed: 0,QueryId,DocumentId
0,1,222
1,1,244
2,1,842
3,1,851
4,1,1226


In [6]:
# Group the candidate pairs by query so that get_group(query_id) yields the
# candidate documents of one query.
# NOTE(review): `sample` is rebound from a DataFrame to its groupby object, so
# re-running this cell on its own will presumably fail -- re-run the read_csv
# cell first.
sample = sample.groupby("QueryId")

In [7]:
# Parse the tab-separated "<id>\t<text>" query file into
# queries[id] = list of filtered + normalised query tokens.
with codecs.open("../queries.numerate.fixed.txt", mode="r", encoding="utf-8") as queries_file:
    queries = {}

    for line in queries_file:
        fields = line.strip().split('\t')
        if len(fields) != 2:
            # Lines that do not split into exactly two fields are ignored.
            continue

        query_id = int(fields[0])
        query_text = normalize_text(normalizer, filter_text(fields[1]))
        queries[query_id] = query_text.strip().split()

In [8]:
def get_group_docs_parsed(group_docs):
    """Load the parsed documents of one query group, column by column.

    :param group_docs: object exposing ``DocumentId.values`` (here: one group
                       of the grouped ``sample`` frame)
    :return: dict with the keys "index", "title", "description", "keywords"
             and "text"; each value is a list holding that field for every
             document that could be loaded, all lists in the same order.

    Document ids without an entry in ``docs_parsed_paths`` are skipped.
    """
    fields = ("index", "title", "description", "keywords", "text")
    collected = dict((field, []) for field in fields)

    for doc_id in group_docs.DocumentId.values:
        if doc_id in docs_parsed_paths:
            parsed = ParsedDocumentFormat.load(docs_parsed_paths[doc_id])

            for field in fields:
                collected[field].append(parsed[field])

    return collected

In [9]:
class TfIdfRanker:
    """Scores the documents of one query group by summed TF-IDF weights.

    A TF-IDF model is re-fitted on every ``rank`` call, using only the
    documents of that group and only the document field named by ``key``.
    """

    def __init__(self, key, norm='l2'):
        """
        :param key:  name of the parsed-document field to fit on; must be a key
                     of the dict returned by ``get_group_docs_parsed``
        :param norm: row normalisation, passed through to TfidfVectorizer
        """
        # Stop words are mapped through normalizer.get_normal_form
        # (presumably so they match the normalised document tokens -- confirm).
        # Materialised as a list: under Python 3 `map` is a one-shot iterator
        # that would be exhausted after the first fit.
        stopwords = list(map(normalizer.get_normal_form, sw.words('russian')))

        self.vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=int(10e6), norm=norm)
        self.key = key

    def rank(self, query, docs):
        """Score every loadable document of ``docs`` against ``query``.

        :param query: sequence of query tokens; a token that occurs several
                      times contributes several times to the score
        :param docs:  group of candidate documents, forwarded to
                      ``get_group_docs_parsed``
        :return: list of ``(index, score)`` pairs in the order the documents
                 were loaded (NOT sorted by score), where ``index`` is the
                 "index" field of the parsed document. The score is the sum of
                 the TF-IDF weights of the query tokens in the ranker's field;
                 tokens outside the fitted vocabulary contribute 0.
        """
        group_docs_parsed = get_group_docs_parsed(docs)
        doc_ids = group_docs_parsed["index"]

        n_docs = len(doc_ids)
        if n_docs == 0:
            # Nothing to fit on; fit_transform would raise on an empty corpus.
            return []

        # One column per query position, filled below and summed at the end.
        scores = np.zeros(shape=(n_docs, len(query)), dtype=float)

        try:
            matrix = self.vectorizer.fit_transform(group_docs_parsed[self.key])
        except ValueError as exc:
            # scikit-learn raises this when no term survives tokenisation and
            # stop-word removal (e.g. the field is empty for every document).
            # No query token can match then, so every document scores 0.
            if "empty vocabulary" not in str(exc):
                raise
            return list(zip(doc_ids, scores.sum(axis=1)))

        # vocabulary_ maps each term to its column index in `matrix`.
        vocabulary = self.vectorizer.vocabulary_

        for word_i, word in enumerate(query):
            column = vocabulary.get(word)
            if column is not None:
                scores[:, word_i] = matrix[:, column].toarray().reshape(-1)

        return list(zip(doc_ids, scores.sum(axis=1)))

In [10]:
keys_set = ["title", "description", "keywords", "text"]

scores_by_query = dict()

for query_i, query in queries.iteritems():
    scores = dict()
    
    for key in keys_set:        
        ranker = TfIdfRanker(key=key, norm='l2')
        
        docs = sample.get_group(query_i)
        score = ranker.rank(query, docs)
        
        for pair in score:
            if pair[0] not in scores:
                scores[pair[0]] = []
            scores[pair[0]].append(pair[1])
    
    for doc_i, value in scores.items():
        scores[doc_i] = np.asarray(value)
    
    scores_by_query[query_i] = scores
        
    print u"\r{} of {} processed...".format(query_i, len(queries)),

339 of 339 processed...


In [17]:
# Per-field weights, positionally aligned with keys_set; multiplied with each
# document's per-field score vector when the final score is computed.
weights = np.asarray([1.0, 0.5, 0.5, 2.0], dtype=float)
# Show which weight goes with which field.
zip(keys_set, weights)

[('title', 1.0), ('description', 0.5), ('keywords', 0.5), ('text', 2.0)]

In [18]:
def comparator(this, that):
    """cmp-style ordering for (query_id, doc_id, score) tuples.

    Tuples are ordered by ascending query id (element 0); within the same
    query, by descending score (element 2). The document id is not compared.

    :return: the integer difference of the query ids when they differ,
             otherwise -1, 0 or 1.
    """
    query_delta = int(this[0] - that[0])
    if query_delta != 0:
        return query_delta

    # Positive when `that` has the higher score, i.e. `this` must sort later.
    score_delta = that[2] - this[2]
    if score_delta > 0:
        return 1
    if score_delta < 0:
        return -1
    return 0

In [19]:
# Number of best-scoring documents kept per query.
TOP_K = 10

# Flat list of (QueryId, DocumentId, score) rows: queries in ascending id
# order, and within a query the TOP_K documents by descending weighted score.
scores = []

for query_i in sorted(scores_by_query):
    ranked = sorted(
        ((query_i, doc_i, (values * weights).sum())
         for doc_i, values in scores_by_query[query_i].items()),
        # Stable sort on the negated score: highest first, ties keep their
        # original relative order.
        key=lambda row: -row[2],
    )
    scores.extend(ranked[:TOP_K])

In [20]:
# Peek at the first 30 (QueryId, DocumentId, score) rows.
scores[:30]

[(1, 20648, 3.4611142456980808),
 (1, 25164, 3.4295632755321606),
 (1, 1226, 3.4151675283374394),
 (1, 19604, 3.1302664656452683),
 (1, 11741, 3.1302664656452683),
 (1, 25371, 3.0254164703318556),
 (1, 4845, 2.901741603431546),
 (1, 25333, 2.8324324906201976),
 (1, 13383, 2.6463599454617732),
 (1, 10149, 2.5072547484048897),
 (2, 2119, 3.0651560901323482),
 (2, 26413, 1.7996953409982912),
 (2, 25053, 1.7834559258002294),
 (2, 3054, 1.6983526745729882),
 (2, 10661, 1.664023270708586),
 (2, 2108, 1.5870023404807927),
 (2, 17932, 1.3192257349092178),
 (2, 2687, 1.2708758338993302),
 (2, 25103, 1.1732057205823534),
 (2, 22724, 1.1513232664116273),
 (3, 12698, 2.5346664619548216),
 (3, 25358, 2.3709796459047814),
 (3, 450, 2.2058724116280581),
 (3, 449, 1.9257640383426646),
 (3, 23211, 1.1073697057490439),
 (3, 17655, 0.90511143656678761),
 (3, 22870, 0.7908204546575297),
 (3, 6979, 0.6523081458572586),
 (3, 11534, 0.64830709237987716),
 (3, 15589, 0.63131897139775672)]

In [21]:
# Build the submission table; the score column is only needed for ranking and
# is left out of the result.
answer = pd.DataFrame(data=scores, columns=["QueryId", "DocumentId", "score"])
answer = answer.loc[:, ["QueryId", "DocumentId"]]
answer.head()

Unnamed: 0,QueryId,DocumentId
0,1,20648
1,1,25164
2,1,1226
3,1,19604
4,1,11741


In [22]:
# Write the submission file without the DataFrame index.
answer.to_csv("../submission_7.csv", sep=',', index=False)