In [213]:
import json
import requests

class JsonDict(dict):
    """A dict whose items can also be read and assigned with attribute syntax."""

    def __getattr__(self, attr):
        """Return the item stored under ``attr``; raise AttributeError if the key is absent."""
        try:
            value = self[attr]
        except KeyError:
            # Report a missing key the way a missing attribute is normally reported.
            raise AttributeError(r"'JsonDict' object has no attribute '%s'" % attr)
        return value

    def __setattr__(self, attr, value):
        """Store ``value`` as the dict item ``attr`` instead of as an instance attribute."""
        self.__setitem__(attr, value)

def parse_json(json_str):
    """Decode a JSON document, turning every JSON object into a JsonDict.

    Args:
        json_str: JSON text to decode.

    Returns:
        The decoded value. Each JSON object, at any nesting depth, becomes a
        JsonDict, so its members can be read as attributes.

    Raises:
        ValueError: if ``json_str`` is not valid JSON (raised by json.loads).
    """
    # json.loads already produces text keys, so the decoded dict can be handed
    # straight to JsonDict. The former per-key unicode() copy was redundant,
    # and that builtin does not exist on Python 3.
    return json.loads(json_str, object_hook=JsonDict)

class ElasticSearch(dict):
    """Minimal HTTP client for a few Elasticsearch REST endpoints.

    Each endpoint method builds a URL under ``base_url``, sends the request
    with the configured auth and timeout, requires an HTTP 200 reply and
    returns one part of the decoded JSON body.  The class derives from
    ``dict``, but none of its methods store items in the mapping.
    """

    # Timeout handed to every ``requests`` call, in seconds.
    timeout = 2

    # NOTE(review): the default host and the default credentials are hard-coded
    # in the signature.  They are kept so existing callers keep working, but
    # they should come from configuration or the environment instead.
    def __init__(self, base_url='http://100.65.6.44:8080', auth=('es', 'Elastic2@user')):
        """Remember the server root URL and the auth value passed to ``requests``."""
        self.base_url = base_url
        self.auth = auth

    @staticmethod
    def _flag(value):
        """Spell a Python bool as 'true'/'false'; pass any other value through."""
        # requests would serialize True/False as 'True'/'False' in the query
        # string; send the lowercase JSON spellings instead.
        if isinstance(value, bool):
            return 'true' if value else 'false'
        return value

    def _fetch(self, send, url, path, **kwargs):
        """Send one request and return the part of the JSON reply found at ``path``.

        Args:
            send: the ``requests`` function to call (``requests.get`` or
                ``requests.post``).
            url: full request URL.
            path: attribute names followed, in order, from the decoded body.
            **kwargs: extra keyword arguments for ``send`` (``params``/``data``).

        Raises:
            Exception: on a non-200 status, or when the body cannot be parsed
                or lacks one of the attributes in ``path``.
        """
        r = send(url, auth=self.auth, timeout=self.timeout, **kwargs)
        if r.status_code != 200:
            raise Exception("[HTTP %d] %s\n%s" % (r.status_code, r.text, r.url))

        try:
            result = parse_json(r.text)
            for name in path:
                result = getattr(result, name)
            return result
        except Exception as e:
            # Include the raw body so an unexpected reply can be diagnosed.
            raise Exception("ElasticSearch() response failed: %s\n%s" % (e, r.text))

    def _search(self, index, doc_type, query_dsl):
        """POST ``query_dsl`` to ``_search`` and return ``hits.hits`` of the reply."""
        url = "%s/%s/%s/_search?pretty" % (self.base_url, index, doc_type)
        return self._fetch(requests.post, url, ('hits', 'hits'), data=query_dsl)

    def _doc(self, index, doc_type, doc_id):
        """GET one document by id and return its ``_source``."""
        url = "%s/%s/%s/%s?pretty" % (self.base_url, index, doc_type, doc_id)
        return self._fetch(requests.get, url, ('_source',))

    def _analyze(self, index, text, analyzer='ik_smart'):
        """Run ``text`` through ``analyzer`` on ``index`` and return the ``tokens`` list."""
        url = "%s/%s/_analyze?pretty" % (self.base_url, index)
        params = {
            'text': text,
            'analyzer': analyzer,
        }
        return self._fetch(requests.get, url, ('tokens',), params=params)

    def _termvectors(self, index, doc_type, doc_id, fields=('title',), field_statistics=True, term_statistics=True):
        """GET the term vectors of a document and return ``term_vectors`` of the reply.

        Args:
            index, doc_type, doc_id: identify the document.
            fields: iterable of field names, sent as a comma-separated list.
            field_statistics, term_statistics: forwarded as query parameters.

        Offsets, payloads and positions are always requested.
        """
        url = "%s/%s/%s/%s/_termvectors?pretty" % (self.base_url, index, doc_type, doc_id)
        params = {
            'fields': ','.join(fields),
            'field_statistics': self._flag(field_statistics),
            'term_statistics': self._flag(term_statistics),
            'offsets': 'true',
            'payloads': 'true',
            'positions': 'true',
        }
        return self._fetch(requests.get, url, ('term_vectors',), params=params)

# 查询doc的分词和TF

In [214]:
import math

def get_doc_terms(index, doc_id, field='title'):
    """Fetch the terms of one document field together with their document frequencies.

    Args:
        index: index name; 'media_search' is queried on a different host than
            every other index.
        doc_id: id of the document (looked up under doc type 'doc').
        field: name of the field whose term vector is read.

    Returns:
        A pair ``(terms, doc_count)``.  ``terms`` is a list of
        ``(term, doc_freq)`` tuples ordered by the position of each term's
        first listed token; ``doc_count`` is the field's
        ``field_statistics.doc_count``.
    """
    if index == 'media_search':
        es = ElasticSearch('http://100.66.1.11:8080')
    else:
        es = ElasticSearch('http://100.65.6.44:8080')

    # Ask for the term vector of the field that is read below.  The client's
    # default requests only 'title', so any other field would be missing from
    # the reply.
    field_vector = es._termvectors(index, 'doc', doc_id, fields=[field])[field]
    by_position = sorted(
        (info.tokens[0].position, term, info.doc_freq)
        for term, info in field_vector.terms.items()
    )
    doc_count = field_vector.field_statistics.doc_count
    # Drop the leading position that was only needed for ordering.
    return [entry[1:] for entry in by_position], doc_count

# (index, doc_id) pairs of the documents to inspect.
doc_list = [
    ('media_search','14176543619570405528'),
    ('media_search','12994864364961195086'),
    ('media_search','11217959712449608799'),
]

# Factor applied to the frequency inside the log formula below.
d = 0.8
# dt[i] holds the ordered term list of doc_list[i]; later cells read it.
dt = []
for index,doc_id in doc_list:
    termvectors, max_doc_count = get_doc_terms(index, doc_id)
    dt.append([ v[0] for v in termvectors ])
    # NOTE: the value unpacked as `tf` is the doc_freq returned by get_doc_terms,
    # not a per-document term frequency.
    for term,tf in termvectors:
        # Log-scaled score that shrinks as the frequency grows relative to max_doc_count.
        popular = math.log(1 + (max_doc_count - d*tf + 0.5) / (d*tf + 0.5))
        # Trailing comma keeps all terms of one document on the same output line.
        print "%s/(%d,%.4f)" % (term, tf, popular),
    # End the output line for this document.
    print ""

当/(145,7.1972) 幸福/(345,6.3329) 来/(328,6.3834) 敲门/(25,8.9347) 
当/(148,7.1744) 真爱/(86,7.7143) 来/(309,6.4405) 敲门/(26,8.8940) 
幸福/(345,6.3329) 来/(328,6.3834) 敲门/(25,8.9347) 2017/(666,5.6760) 


# 计算query的匹配分

In [215]:
def get_query_terms(query):
    """Tokenize ``query`` with the analyzer of the 'media_search' index.

    Returns the ``token`` attribute of every entry produced by the analyze
    call, in the order received.
    """
    analyzed = ElasticSearch('http://100.66.1.11:8080')._analyze('media_search', query)
    terms = []
    for entry in analyzed:
        terms.append(entry.token)
    return terms

# Tokenize a sample query and print its terms on a single line.
# `qt` is read again by the scoring cells further down.
query = u"当幸福来敲门"
qt = get_query_terms(query)
for term in qt:
    print term, 

当 幸福 来 敲门


In [216]:
def get_matched_terms(doc_terms, query_terms):
    """Return the query terms that also occur in ``doc_terms``.

    Query order is preserved, and a term repeated in the query is repeated in
    the result.
    """
    return [candidate for candidate in query_terms if candidate in doc_terms]

def calc_matched_score(query_terms, matched_terms, term_weights=None):
    """Score how well a query is covered by its matched terms.

    The score is ``sum(weights of matched terms) * len(matched_terms) /
    len(query_terms)``.

    Args:
        query_terms: all terms of the query.
        matched_terms: the query terms that were found in the document.
        term_weights: optional mapping term -> weight; a term without an
            entry weighs 1.0.

    Returns:
        The score as a float; 0.0 when ``query_terms`` is empty.
    """
    if len(query_terms) == 0:
        # Nothing was asked for, so nothing can match; avoids dividing by zero.
        return 0.0
    if term_weights is None:
        term_weights = {}
    # Start from 0.0 so the result is a float even when nothing matched.
    weights = sum((term_weights.get(term, 1.0) for term in matched_terms), 0.0)
    return weights * len(matched_terms) / len(query_terms)

# coord is the match score; a value equal to the number of terms in the doc
# indicates a full match.
# mt[i] / coord[i] hold the matched terms and the score for doc_list[i].
mt = [ [] for i in range(len(doc_list)) ]
coord = [ 0 for i in range(len(doc_list)) ]
for idx,(_,doc_id) in enumerate(doc_list):
    mt[idx] = get_matched_terms(dt[idx], qt)
    coord[idx] = calc_matched_score(qt, mt[idx])
    # Prints: matched terms, the score, and whether the score equals the doc's term count.
    print ' '.join(mt[idx]), ">>>", coord[idx], coord[idx] == len(dt[idx])

当 幸福 来 敲门 >>> 4.0 True
当 来 敲门 >>> 2.25 False
幸福 来 敲门 >>> 2.25 False


# 计算正向匹配的最大term数

In [247]:
# Replaces the earlier doc_list, d and dt with documents from the
# 'mugc_search' index and repeats the per-term printout and the match scoring.
doc_list = [
    ('mugc_search','940238209026957285'),
    # ('mugc_search','15709473540772892745'),
    # ('mugc_search','18108452449665614989'),
    ('mugc_search','4009975089580785448'),
]

# Factor applied to the frequency inside the log formula below.
d = 0.8
# dt[i] holds the ordered term list of doc_list[i].
dt = []
for index,doc_id in doc_list:
    termvectors, max_doc_count = get_doc_terms(index, doc_id)
    dt.append([ v[0] for v in termvectors ])
    # NOTE: the value unpacked as `tf` is the doc_freq returned by get_doc_terms.
    for term,tf in termvectors:
        popular = math.log(1 + (max_doc_count - d*tf + 0.5) / (d*tf + 0.5))
        print "%s/(%d,%.4f)" % (term, tf, popular),
    print ""

# coord is the match score; a value equal to the number of terms in the doc
# indicates a full match.
# NOTE(review): `qt` is not assigned in this cell.  The output recorded for this
# cell lists the terms of the query that is set in the following cell, so the
# cell appears to have been run after it -- confirm the intended execution order.
mt = [ [] for i in range(len(doc_list)) ]
coord = [ 0 for i in range(len(doc_list)) ]
for idx,(_,doc_id) in enumerate(doc_list):
    mt[idx] = get_matched_terms(dt[idx], qt)
    coord[idx] = calc_matched_score(qt, mt[idx])
    print ' '.join(mt[idx]), ">>>", coord[idx], coord[idx] == len(dt[idx])

三生/(2786,8.2006) 三世/(2007,8.5285) 爆笑/(24881,6.0113) 花絮/(28915,5.8611) 杨/(35697,5.6504) 幂/(5964,7.4396) 一言不合/(1816,8.6285) 飙车/(1918,8.5738) 赵/(23899,6.0516) 廷/(2047,8.5088) 害羞/(1129,9.1036) 
三生/(2864,8.1693) 三世/(2065,8.4963) 十里/(1622,8.7377) 桃花/(2783,8.1980) 花絮/(29023,5.8537) 赵/(23363,6.0706) 廷/(2087,8.4857) 爆笑/(24374,6.0282) 变/(19645,6.2439) 装/(14031,6.5805) 逗笑/(419,10.0902) 杨/(35305,5.6577) 幂/(6066,7.4190) 
爆笑 花絮 杨 幂 >>> 4.0 False
爆笑 花絮 杨 幂 >>> 4.0 False


In [248]:
def longest_increasing_subsequence(array):
    """Return the length of the longest strictly increasing subsequence of ``array``.

    Elements need not be adjacent in ``array``; equal elements do not extend a
    subsequence.  Uses the quadratic dynamic-programming formulation.

    Args:
        array: sequence of mutually comparable values.

    Returns:
        The length as an int; 0 for an empty sequence.
    """
    size = len(array)
    if size == 0:
        # max() of an empty list would raise ValueError; an empty input simply
        # has no subsequence.
        return 0

    # best[i] is the length of the longest increasing subsequence ending at array[i].
    best = [1] * size
    for i in range(size):
        for j in range(i):
            if array[j] < array[i] and best[j] + 1 > best[i]:
                best[i] = best[j] + 1
    return max(best)

def max_LIS(doc_terms, query_terms):
    """Map each query term found in ``doc_terms`` to its first index there.

    Returns the list of indexes in query order.  Despite the name, no length
    is computed here: the list is the input for a longest-increasing-subsequence
    (LIS) computation.
    """
    positions = []
    for term in query_terms:
        if term in doc_terms:
            # list.index gives the first occurrence of the term in the document.
            positions.append(doc_terms.index(term))
    return positions

# Tokenize a new query; this replaces the `query` and `qt` set by the earlier cell.
query = u"爆笑花絮杨幂"
qt = get_query_terms(query)
for term in qt:
    print term, 
print ""

# st[i] holds, for doc_list[i], the document positions of the query terms found in it.
st = [ [] for i in range(len(doc_list)) ]
for idx in range(len(doc_list)):
    st[idx] = max_LIS(dt[idx], qt)
    # Prints the longest-increasing-subsequence length of those positions,
    # followed by the document's terms.
    print longest_increasing_subsequence(st[idx]), " ".join(dt[idx])

爆笑 花絮 杨 幂 
4 三生 三世 爆笑 花絮 杨 幂 一言不合 飙车 赵 廷 害羞
3 三生 三世 十里 桃花 花絮 赵 廷 爆笑 变 装 逗笑 杨 幂
