In [2]:
from IPython.display import HTML, display, Markdown
import itertools
import tabulate
import json
import pandas as pd
import numpy as np
import fileinput

In [53]:
seed_filename = 'input_data/seeds/seed_concepts'
gold_filename = 'annotated-as-seed.json'
# seed_filename = 'input_data/seeds/xlore_seeds.txt'
topks = [100, 200, 500]
relevance_field_names = ['', '', '', '']
algorithms = ['tf_idf', 'pagerank', 'graph_prop', 'average_distance']
raw_evaluated_filenames = [
    f'processed_data/propagation_results/{a}_nf_result.json' for a in algorithms]
rerank_evaluated_filenames = [
    f'processed_data/rerank_results/{a}_nf_rerank_result.json' for a in algorithms]
clustering_evaluated_filenames = [
    f'processed_data/cluster_results/{a}_nf_cluster_result.json' for a in algorithms]
clustering_evaluated_filenames = [
    f'processed_data/cluster_results/{a}_nf_cluster_result.json' for a in algorithms]
xlink_evaluated_filenames = ['processed_data/xlink_results/baike_context']
from evaluation import load_json, evaluate, load_evaluated, merge_results
with open(seed_filename) as f:
    seeds = set([line.strip() for line in f])
gold = set(load_json(gold_filename, lambda x: x['name']))
gold = set(gold) - seeds
results = {}

def run_evaluate(evaluated_filenames):
    results = {}
    for (evaluated_filename, field_name), k in itertools.product(
            zip(evaluated_filenames, relevance_field_names),
            topks):
        predicted = load_evaluated(evaluated_filename, gold, seeds, field_name)
        result = evaluate(predicted, gold, k)
        results[(evaluated_filename, k)] = result
    results = merge_results(results)
    return results
    import collections
    results = collections.OrderedDict(results)
    
    y_order = list(map(lambda x: ''.join(x), itertools.product(['mAP@', 'p@'], map(str, topks))))
    results = {k: list(sorted(
        v.items(),
        key=lambda x: y_order.index(x[0]))) for k, v in results.items()}
    return results

def pandas_df_to_markdown_table(df):
    from IPython.display import Markdown, display
    fmt = ['---' for i in range(len(df.columns))]
    df_fmt = pd.DataFrame([fmt], columns=df.columns)
    df_formatted = pd.concat([df_fmt, df])
    display(Markdown(df_formatted.to_csv(sep="|", index=False)))

def show_case(filename, gold, seeds, n_columns=8, n_shown=32, n_start=0):
    predicted = load_evaluated(filename, gold, seeds)
    # predicted = load_evaluated('processed_data/xlore_results/xlore_expansion.json', gold, seeds, '')
    # predicted = load_evaluated('/data1/wyq/concept-expansion-snippet/tmp/baike_average_distance_nf_result.json', gold, seeds, '')
    # pdp = list(map(lambda x: f'<font color=blue><h2>**{x}**</h2></font>' if x in gold else f'<h2>{x}</h2>', predicted[:500]))
    pdp = list(map(lambda x: f'<font color=blue>**{x}**</font>' if x in gold else x, predicted[n_start:n_start+n_shown]))
    def pack(l):
        row = []
        for i, p in enumerate(l):
            i += 1 + n_start
            row.append(p)
            if i % n_columns == 0:
                yield list(map(str, range(i-n_columns, i)))
                yield row
                row = []
    pandas_df_to_markdown_table(pd.DataFrame(list(pack(pdp))))

In [100]:
pd.DataFrame.from_dict(run_evaluate(raw_evaluated_filenames))

Unnamed: 0,tf_idf,pagerank,graph_prop,average_distance
mAP@100,0.072228,0.012619,0.013567,0.2741
p@100,0.21,0.14,0.11,0.47
mAP@200,0.045866,0.017023,0.016716,0.199214
p@200,0.16,0.145,0.14,0.385
mAP@500,0.030162,0.015927,0.021165,0.110594
p@500,0.142,0.126,0.152,0.254


In [4]:
run_evaluate(rerank_evaluated_filenames)

{'tf_idf': {'mAP@100': 0.1072029399509533,
  'p@100': 0.31,
  'mAP@200': 0.07027217103150507,
  'p@200': 0.22,
  'mAP@500': 0.03762402350354539,
  'p@500': 0.144},
 'pagerank': {'mAP@100': 0.1329347913262355,
  'p@100': 0.33,
  'mAP@200': 0.07965611124507505,
  'p@200': 0.22,
  'mAP@500': 0.04131811533076112,
  'p@500': 0.142},
 'graph_prop': {'mAP@100': 0.286150182898988,
  'p@100': 0.48,
  'mAP@200': 0.1792996480197057,
  'p@200': 0.33,
  'mAP@500': 0.0838460314850487,
  'p@500': 0.184},
 'average_distance': {'mAP@100': 0.2207774012973763,
  'p@100': 0.41,
  'mAP@200': 0.15269420221010516,
  'p@200': 0.325,
  'mAP@500': 0.09077940817731159,
  'p@500': 0.24}}

In [51]:
pd.DataFrame.from_dict({k+'_rerank': v for k, v in run_evaluate(rerank_evaluated_filenames).items()})

Unnamed: 0,tf_idf_rerank,pagerank_rerank,graph_prop_rerank,average_distance_rerank
mAP@100,0.348924,0.314772,0.815018,0.628963
p@100,0.58,0.53,0.85,0.76
mAP@200,0.245946,0.232631,0.529913,0.435908
p@200,0.435,0.415,0.595,0.56
mAP@500,0.11606,0.113469,0.246153,0.229194
p@500,0.232,0.23,0.32,0.35


In [4]:
pd.DataFrame.from_dict({k+'_clustering': v for k, v in run_evaluate(clustering_evaluated_filenames).items()})

Unnamed: 0,tf_idf_clustering,pagerank_clustering,graph_prop_clustering,average_distance_clustering
mAP@100,0.891325,0.185312,0.708277,0.755868
p@100,0.9,0.42,0.77,0.81
mAP@200,0.666861,0.154414,0.457731,0.531812
p@200,0.715,0.365,0.535,0.615
mAP@500,0.302392,0.121055,0.244736,0.295524
p@500,0.36,0.318,0.35,0.418


In [102]:
pd.DataFrame.from_dict(run_evaluate(['processed_data/xlink_results/dsa_video_context']))

Unnamed: 0,dsa_video_context
mAP@100,0.258177
mAP@200,0.157677
mAP@500,0.077051
p@100,0.4
p@200,0.285
p@500,0.176


In [8]:
pd.DataFrame.from_dict(run_evaluate([
'/data1/wyq/concept-expansion-snippet/tmp/baike_graph_prop_nf_result.json']))

Unnamed: 0,tf_idf,pagerank,graph_prop,average_distance,baike_context,baike_graph_prop
mAP@100,0.891325,0.185312,0.708277,0.755868,0.148936,0.013567
p@100,0.9,0.42,0.77,0.81,0.34,0.11
mAP@200,0.666861,0.154414,0.457731,0.531812,0.11271,0.014875
p@200,0.715,0.365,0.535,0.615,0.285,0.13
mAP@500,0.302392,0.121055,0.244736,0.295524,0.068436,0.019391
p@500,0.36,0.318,0.35,0.418,0.21,0.146


In [7]:
pd.DataFrame.from_dict(run_evaluate(['processed_data/xlore_results/xlore_expansion.json']))

Unnamed: 0,xlore_expansion.json
mAP@100,0.182173
mAP@200,0.10273
mAP@500,0.045215
p@100,0.29
p@200,0.195
p@500,0.108


In [None]:
pd.DataFrame.from_dict(run_evaluate([
'/data1/wyq/concept-expansion-snippet/tmp/baike_average_distance_nf_result.json']))

In [36]:
context_file = 'dsa_video_context_'
algorithm = 'tf_idf_'
cross_validation = 'cluster_'
# seed_filename = 'input_data/seeds/xlore_seeds.txt'
seed_filename = 'input_data/seeds/seed_concepts'
with open(seed_filename) as f:
    seeds = set([line.strip() for line in f])
folder_dict = {'': 'propagation_results', 'cluster_': 'cluster_results', 'rerank_': 'rerank_results'}
filename_seed_part = 'xlore_seed_' if seed_filename.endswith('xlore_seeds.txt') else 'more_seed_'
context_file, algorithm, cross_validation, filename_seed_part = '', 'average_distance_', '', ''
predicted = load_evaluated(
    f'processed_data/{folder_dict[cross_validation]}/'
    f'{context_file}{algorithm}{filename_seed_part}nf_{cross_validation}result.json', gold, seeds)
predicted = load_evaluated('processed_data/xlore_results/xlore_expansion.json', gold, seeds, '')
# predicted = load_evaluated('processed_data/xlink_results/baike_context', gold, seeds, '')
# predicted = load_evaluated('/data1/wyq/concept-expansion-snippet/tmp/baike_average_distance_nf_result.json', gold, seeds, '')

n_columns = 8
# pdp = list(map(lambda x: f'<font color=blue><h2>**{x}**</h2></font>' if x in gold else f'<h2>{x}</h2>', predicted[:500]))
pdp = list(map(lambda x: f'<font color=blue>**{x}**</font>' if x in gold else x, predicted[:100]))
def pack(l):
    row = []
    for i, p in enumerate(l):
        i += 1
        row.append(p)
        if i % n_columns == 0:
            yield list(map(str, range(i-n_columns, i)))
            yield row
            row = []
pdp = list(pack(pdp))
# pdp = np.array(pdp).reshape(n_columns, -1)
def pandas_df_to_markdown_table(df):
    from IPython.display import Markdown, display
    fmt = ['---' for i in range(len(df.columns))]
    df_fmt = pd.DataFrame([fmt], columns=df.columns)
    df_formatted = pd.concat([df_fmt, df])
    display(Markdown(df_formatted.to_csv(sep="|", index=False)))
def pandas_df_to_html(df):
    from IPython.display import HTML, display
    df_fmt = pd.DataFrame([])

# n_rows = 10a
# n_columns = len(predicted) // n_rows + 1
pandas_df_to_markdown_table(pd.DataFrame(pdp))

0|1|2|3|4|5|6|7
---|---|---|---|---|---|---|---
0|1|2|3|4|5|6|7
<font color=blue>**异或链表**</font>|<font color=blue>**基数排序**</font>|<font color=blue>**树状数组**</font>|<font color=blue>**循环链表**</font>|<font color=blue>**拓扑排序**</font>|拓朴排序|矩阵列运算|<font color=blue>**单向链表**</font>
8|9|10|11|12|13|14|15
<font color=blue>**马尔可夫算法**</font>|矩阵运算|<font color=blue>**算法和数据结构**</font>|<font color=blue>**递归**</font>|<font color=blue>**最优二叉树**</font>|<font color=blue>**关联数组**</font>|图,数据结构|数据结构列表
16|17|18|19|20|21|22|23
可定义函数|图论傅立叶转换|亏格|<font color=blue>**哈夫曼编码**</font>|控制流|<font color=blue>**数组步长**</font>|<font color=blue>**快速排序算法**</font>|<font color=blue>**十字链表**</font>
24|25|26|27|28|29|30|31
递归类型|<font color=blue>**位段**</font>|逻辑算子|<font color=blue>**递回**</font>|<font color=blue>**递归可枚举**</font>|<font color=blue>**逻辑运算符**</font>|<font color=blue>**图算法**</font>|二叉索引树
32|33|34|35|36|37|38|39
<font color=blue>**数据类型**</font>|分段函数|<font color=blue>**后缀数组**</font>|<font color=blue>**函数**</font>|集合,数据结构|双端伫列|平行数组|可计算函数
40|41|42|43|44|45|46|47
符号表|实数连续统|布尔运算符|数学函数|索引节点|路径,图论|哈密顿路径问题|树,数据结构
48|49|50|51|52|53|54|55
控制结构|结构归纳法|递归论|引理|哈希|欧拉路径|双端队列|多重关连数组
56|57|58|59|60|61|62|63
<font color=blue>**迭代**</font>|<font color=blue>**优先级队列**</font>|演算法|霍夫曼算法|字串演算法|冒泡算法|<font color=blue>**计算机算法**</font>|序列化
64|65|66|67|68|69|70|71
堆,数据结构|<font color=blue>**状态机**</font>|逻辑联结词|实数域|容器,数据结构|匹配,图论|概率图模型|平面图,图论
72|73|74|75|76|77|78|79
哈密尔顿路径问题|超平面|子集合|<font color=blue>**循环队列**</font>|函数,数学|图的遍历|稀疏矩阵|<font color=blue>**矩阵**</font>
80|81|82|83|84|85|86|87
图遍历|自动变量|容器,抽象数据类型|全函数|朱迪矩阵|汽泡排序法|等价关系|整数
88|89|90|91|92|93|94|95
树,图论|线段树|多元函数|随机生成树|基数查找树|约束变量|偏递归函数|偏函数


In [27]:
print(predicted[:30])

['关键码', '字符', '向量', '复杂度', '运算符', '表达式', '指针', '概率', '红黑树', '操作数', '矩阵', '中位数', '数据元素', '时间复杂度', '最短路径', '左子', '变量', '控制权', '字符串', '无序', '叶子结点', '根结点', '右子', '操作符', '后缀', '内存', '对数', '单调性', '度量', '拓扑结构']


In [47]:
cross_validation = 'cluster_'
# seed_filename = 'input_data/seeds/xlore_seeds.txt'
seed_filename = 'input_data/seeds/seed_concepts'
folder_dict = {'': 'propagation_results', 'cluster_': 'cluster_results', 'rerank_': 'rerank_results'}
filename_seed_part = 'xlore_seed_' if seed_filename.endswith('xlore_seeds.txt') else ''
context_file = 'dsa_video_context_' if seed_filename.endswith('xlore_seeds.txt') else ''
with open(seed_filename) as f:
    seeds = set([line.strip() for line in f])
gold = set(load_json(gold_filename, lambda x: x['name']))
gold = set(gold) - seeds
print(f'processed_data/{folder_dict[cross_validation]}/' \
    f'{context_file}tf_idf_{filename_seed_part}nf_{cross_validation}result.json')
pd.DataFrame.from_dict(run_evaluate([
    f'processed_data/{folder_dict[cross_validation]}/'
    f'{context_file}{algorithm}{filename_seed_part}nf_{cross_validation}result.json' \
    for algorithm in ['tf_idf_', 'pagerank_', 'average_distance_', 'graph_prop_']]))

processed_data/cluster_results/tf_idf_nf_cluster_result.json


Unnamed: 0,tf_idf,pagerank,average_distance,graph_prop
mAP@100,0.891325,0.262437,0.755868,0.708277
p@100,0.9,0.49,0.81,0.77
mAP@200,0.671268,0.202665,0.53848,0.457731
p@200,0.72,0.405,0.625,0.535
mAP@500,0.325005,0.14783,0.310766,0.258997
p@500,0.394,0.34,0.442,0.374


In [71]:
algorithm = 'tf_idf_'
filename = f'processed_data/{folder_dict[cross_validation]}/' \
    f'{context_file}{algorithm}{filename_seed_part}nf_{cross_validation}result.json'
print(filename)
def find_cluster_start(filename):
    cluster_start = {}
    with open(filename) as f:
        for i, line in enumerate(f):
            cluster_num = int(json.loads(line)['cluster'])
            if cluster_num not in cluster_start:
                cluster_start[cluster_num] = i
    return cluster_start
def show_all_cases(filename, first_n):
    cluster_start = find_cluster_start(filename)
    print(cluster_start)
    my_gold = set(load_json(gold_filename, lambda x: x['name']))
    show_case(filename, my_gold, {}, n_start=320, n_shown=48)
#     for i in range(first_n):
#         show_case(filename, my_gold, {}, n_start=cluster_start[i+1], n_shown=48)
show_all_cases(filename, 4)

processed_data/cluster_results/tf_idf_nf_cluster_result.json
{1: 0, 2: 467, 3: 883, 4: 1092, 5: 1513, 6: 1982, 7: 2127, 8: 2474, 9: 2815, 10: 3356, 11: 3637, 12: 4028, 13: 4235, 14: 4703, 15: 5109}


0|1|2|3|4|5|6|7
---|---|---|---|---|---|---|---
320|321|322|323|324|325|326|327
<font color=blue>**汉诺塔**</font>|剖面图|图形|<font color=blue>**全排列**</font>|方法|分析|排除法|相似图形
328|329|330|331|332|333|334|335
独立同分布|分析机|法的实现|<font color=blue>**分组**</font>|定性分析|定界|第一性|可行域
336|337|338|339|340|341|342|343
甘特图|相邻关系|汇总|<font color=blue>**回溯法**</font>|图像恢复|优先|<font color=blue>**启发式**</font>|广义表
344|345|346|347|348|349|350|351
流程图|分析法|标准流程|耦合|参考模型|对称多处理|空间划分|资源分配
352|353|354|355|356|357|358|359
方案选择|构造器|分治策略|排队|<font color=blue>**参数化**</font>|基本结构|结构|<font color=blue>**流水线法**</font>
360|361|362|363|364|365|366|367
分析器|实例化|分支过程|<font color=blue>**先进先出**</font>|示意图|随机行走|任务并行|凸规划


In [22]:
gold

{' 图论算法',
 'KMP算法',
 'VLSI并行算法',
 'a*算法',
 'avl树',
 'b+树',
 'bellman-ford算法',
 'bsp模型',
 'b树',
 'c++',
 'c函数',
 'deque',
 'dijkstra算法',
 'distance',
 'dna计算',
 'd算法',
 'floyd-warshall算法',
 'floyd算法',
 'hash',
 'hash算法',
 'k-means算法',
 'kosaraju算法',
 'kruskal算法',
 'loop',
 'null值',
 'pad图',
 'postfix',
 'pram模型',
 'prim算法',
 'spfa算法',
 'tarjan算法',
 'ρ(n )近似算法',
 '一维数组',
 '下界',
 '不可近似性',
 '中位数',
 '中根遍历',
 '中缀',
 '中缀表达式',
 '串匹配',
 '乘法原理',
 '乱序',
 '二元关系',
 '二分图',
 '二分插入',
 '二分搜索',
 '二分查找',
 '二分法',
 '二叉堆',
 '二叉排序树',
 '二叉搜索树',
 '二叉查找树',
 '二叉树算法',
 '二叉树遍历',
 '二维数组',
 '二维网孔',
 '二路归并',
 '二进制数字',
 '二进制编码',
 '二部图',
 '交互计算',
 '代数系统',
 '众核计算',
 '优先级',
 '优先级队列',
 '伪代码',
 '伪多项式时间算法',
 '伪随机数',
 '似然函数',
 '位段',
 '例程',
 '信包选路问题',
 '信号流图',
 '倍增技术',
 '假溢出',
 '偏移量',
 '儿子',
 '元启发式优化算法',
 '元启发式算法',
 '元数据',
 '元组',
 '兄弟',
 '兄弟结点',
 '先序遍历',
 '先根遍历',
 '先进先出',
 '免疫遗传算法',
 '入队',
 '全排列',
 '关联数组',
 '关键字',
 '关键码',
 '关键词',
 '关键路径法',
 '内点',
 '内点算法',
 '内部排序',
 '冯诺依曼',
 '决策树',
 '函数',
 '函数式编程',
 '函数模型',
 '函数调用语句',
 '分布式排序算