In [3]:
from collections import Counter
from typing import Optional

import pandas as pd

In [5]:
# Readable column names for the raw field keys of a data line:
# "qid" plus the feature ids "1" ... "46".
# Ids 1-40 are eight statistics, each repeated over the same five document
# parts, so they are generated instead of being listed one by one;
# ids 41-46 are listed explicitly.
column_names = {"qid": "qid"}
column_names.update(
    (str(feature_id), f"{statistic}_of_{part}")
    for feature_id, (statistic, part) in enumerate(
        (
            (statistic, part)
            for statistic in (
                "tf", "idf", "tfidf", "dl", "bm25",
                "lmir_abs", "lmir_dir", "lmir_jm",
            )
            for part in ("body", "anchor", "title", "url", "whole_document")
        ),
        start=1,
    )
)
column_names.update(
    (str(feature_id), name)
    for feature_id, name in enumerate(
        (
            "page_rank",
            "inlink_number",
            "outlink_number",
            "number_of_slash_in_url",
            "length_of_url",
            "number_of_child_page",
        ),
        start=41,
    )
)

In [20]:
def parse_feature(data: str, names: Optional[dict] = None) -> dict:
    """Parse one line of a LETOR-style ranking file into a flat record.

    A line is expected to look like::

        <label> qid:<id> 1:<v> 2:<v> ... #docid = <id> inc = <v> prob = <v>

    Args:
        data: One raw line; surrounding whitespace and the trailing newline
            are ignored.
        names: Mapping from raw field key ("qid", "1", "2", ...) to output
            column name. Defaults to the module-level ``column_names``.

    Returns:
        A dict holding every mapped feature as ``float``, ``qid`` as ``int``,
        ``label`` (the leading integer), ``label_norm`` (1 if ``label > 0``
        else 0) and, when a ``#`` comment is present, its ``key = value``
        pairs (``docid`` kept as ``str``, all others converted to ``float``).

    Raises:
        ValueError: If the line has no fields before the comment, or a
            field or value cannot be parsed.
        KeyError: If a field key is missing from ``names`` or no field is
            stored under ``"qid"``.
    """
    if names is None:
        names = column_names

    # Split on the first "#" only; a line without a comment yields an empty
    # comment instead of raising IndexError.
    fields_part, _, comment = data.partition("#")
    key_values = fields_part.split()
    if not key_values:
        raise ValueError(f"no label or feature fields in line: {data!r}")

    feature = dict()
    for key_value in key_values[1:]:
        key, value = key_value.split(":")
        feature[names[key]] = float(value)
    # qid was stored as float by the loop above; it is an identifier, so keep
    # it as int.
    feature["qid"] = int(feature["qid"])
    feature["label"] = int(key_values[0])
    feature["label_norm"] = int(feature["label"] > 0)

    # Normalise "key = value" to "key=value" so that whitespace splitting
    # yields exactly one token per pair.
    for param in comment.strip().replace(" = ", "=").split():
        key, value = param.split("=")
        feature[key] = value if key == "docid" else float(value)

    return feature

In [21]:
def load_data(filename: str) -> pd.DataFrame:
    """Read a ranking data file and return one DataFrame row per data line.

    Args:
        filename: Path of the text file to read; every non-blank line is
            handed to ``parse_feature``.

    Returns:
        A DataFrame built from the parsed records, in file order.
    """
    with open(filename) as f:
        # Skip blank lines (e.g. a trailing empty line): they carry no record
        # and would otherwise make parsing fail.
        features = [parse_feature(line) for line in f if line.strip()]

    return pd.DataFrame(features)

In [22]:
# Parse S1.txt into a DataFrame and show its (rows, columns) shape.
filename = "dataset/MQ2008/S1.txt"
df1 = load_data(filename)
df1.shape

(2933, 52)

In [23]:
# Preview the first three rows of df1.
df1.head(3)

Unnamed: 0,qid,tf_of_body,tf_of_anchor,tf_of_title,tf_of_url,tf_of_whole_document,idf_of_body,idf_of_anchor,idf_of_title,idf_of_url,...,inlink_number,outlink_number,number_of_slash_in_url,length_of_url,number_of_child_page,label,label_norm,docid,inc,prob
0,10002,0.007477,0.0,1.0,0.0,0.00747,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.007042,0,0,GX008-86-4444840,1.0,0.086622
1,10002,0.603738,0.0,1.0,0.0,0.603175,0.0,0.0,0.0,0.0,...,0.28,0.0,0.003708,0.333333,1.0,0,0,GX037-06-11625428,0.003159,0.089745
2,10002,0.214953,0.0,0.0,0.0,0.213819,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.021127,0,0,GX044-30-4142998,0.008419,0.099974


In [26]:
# Number of rows (impressions) per qid.
df1["qid"].value_counts()

10078    118
10419    118
11893    118
11759    118
11565    114
        ... 
11041      7
11777      7
11699      7
11092      7
11297      6
Name: qid, Length: 157, dtype: int64

In [27]:
# How many rows carry each raw label value.
df1["label"].value_counts()

0    2316
1     427
2     190
Name: label, dtype: int64

In [28]:
# How many rows carry each binarised label (label_norm).
df1["label_norm"].value_counts()

0    2316
1     617
Name: label_norm, dtype: int64

In [58]:
# Distribution of the number of positive rows (label_norm == 1) per qid;
# the 0 bucket counts the qids that contain no interaction at all.
# One groupby pass replaces re-filtering df1 once per qid; sort=False keeps
# the qids in order of first appearance, matching iteration over unique().
Counter(df1.groupby("qid", sort=False)["label_norm"].sum().tolist())

Counter({0: 52,
         2: 21,
         3: 13,
         7: 5,
         1: 25,
         5: 11,
         6: 3,
         22: 2,
         10: 1,
         4: 10,
         8: 3,
         13: 1,
         37: 1,
         15: 1,
         17: 1,
         25: 1,
         16: 1,
         63: 1,
         19: 1,
         14: 1,
         55: 1,
         11: 1})

- qid によって、impression 数が異なる。
- interaction を含まない qid がある。

In [29]:
# Parse S2.txt into a DataFrame and show its (rows, columns) shape.
filename = "dataset/MQ2008/S2.txt"
df2 = load_data(filename)
df2.shape

(3635, 52)

In [31]:
# Number of distinct qids in df2, shown as a 1-tuple.
df2["qid"].unique().shape

(157,)

In [36]:
# qids that occur in both df2 and df1.
set(df2["qid"].unique()).intersection(df1["qid"].unique())

set()

In [37]:
# Parse S3.txt into a DataFrame and show its (rows, columns) shape.
filename = "dataset/MQ2008/S3.txt"
df3 = load_data(filename)
df3.shape

(3062, 52)

In [38]:
# Parse S4.txt into a DataFrame and show its (rows, columns) shape.
filename = "dataset/MQ2008/S4.txt"
df4 = load_data(filename)
df4.shape

(2707, 52)

In [39]:
# Parse S5.txt into a DataFrame and show its (rows, columns) shape.
filename = "dataset/MQ2008/S5.txt"
df5 = load_data(filename)
df5.shape

(2874, 52)

In [41]:
# Look for qids shared between any two of the five DataFrames.
dfs = [df1, df2, df3, df4, df5]
# Build each DataFrame's qid set once instead of rebuilding it for every pair.
qid_sets = [set(df["qid"].unique()) for df in dfs]
for i, qids_i in enumerate(qid_sets, start=1):
    # Pair i only with the later DataFrames so each unordered pair prints once.
    for j, qids_j in enumerate(qid_sets[i:], start=i + 1):
        result = qids_i & qids_j
        print(f"df{i} & df{j} = {len(result)}")

df1 & df2 = 0
df1 & df3 = 0
df1 & df4 = 0
df1 & df5 = 0
df2 & df3 = 0
df2 & df4 = 0
df2 & df5 = 0
df3 & df4 = 0
df3 & df5 = 0
df4 & df5 = 0


In [42]:
# Look for docids shared between any two of the five DataFrames.
dfs = [df1, df2, df3, df4, df5]
# Build each DataFrame's docid set once instead of rebuilding it for every pair.
docid_sets = [set(df["docid"].unique()) for df in dfs]
for i, docids_i in enumerate(docid_sets, start=1):
    # Pair i only with the later DataFrames so each unordered pair prints once.
    for j, docids_j in enumerate(docid_sets[i:], start=i + 1):
        result = docids_i & docids_j
        print(f"df{i} & df{j} = {len(result)}")

df1 & df2 = 67
df1 & df3 = 80
df1 & df4 = 67
df1 & df5 = 79
df2 & df3 = 72
df2 & df4 = 86
df2 & df5 = 85
df3 & df4 = 64
df3 & df5 = 54
df4 & df5 = 81


In [44]:
# Count, per docid, how many rows reference it across df1-df5.
docid_counter = Counter(
    docid
    for df in (df1, df2, df3, df4, df5)
    for docid in df["docid"].tolist()
)

In [47]:
# Ten most frequent docids with their occurrence counts.
docid_counter.most_common(10)

[('GX000-00-0000000', 22),
 ('GX006-16-15305255', 14),
 ('GX255-30-4478761', 12),
 ('GX014-02-11418175', 8),
 ('GX009-85-4054530', 7),
 ('GX014-13-13676661', 7),
 ('GX000-12-12356259', 6),
 ('GX239-75-11286672', 6),
 ('GX238-27-15726674', 6),
 ('GX245-76-13660955', 6)]

In [51]:
# Histogram of docid occurrence counts: most documents appear only once.
pd.Series(docid_counter.values()).value_counts()

1     13812
2       438
3        82
4        28
5        13
6         5
7         2
8         1
12        1
14        1
22        1
dtype: int64

In [52]:
# Count, per docid, the rows with label_norm == 1 across df1-df5.
pos_docid_counter = Counter(
    docid
    for df in (df1, df2, df3, df4, df5)
    for docid in df.loc[df["label_norm"] == 1, "docid"].tolist()
)

In [54]:
# Ten docids that most often appear in rows with label_norm == 1.
pos_docid_counter.most_common(10)

[('GX027-80-5095264', 3),
 ('GX014-76-3025378', 3),
 ('GX001-40-13788000', 3),
 ('GX012-07-6597432', 2),
 ('GX264-55-16328787', 2),
 ('GX240-62-9832512', 2),
 ('GX244-70-15324710', 2),
 ('GX225-75-12296992', 2),
 ('GX270-81-2444961', 2),
 ('GX000-05-9058226', 2)]

- ランキングに現れているドキュメントと実際に interaction が発生しているドキュメントに乖離がある。
  - おそらく、クエリごとにクリックされるドキュメントの傾向が異なる。
- test set, train set, validation set の間で共通の qid はない。一方、同じ docid は複数のセットに現れる。

その他
- 評価スクリプトを見ると、predict（推定結果）をもとに label を並び替えて、label を集計している。
  - impression に interaction が含まれない場合は、0 スコアになるので、全体的に高いスコアにはならないと思われる。

下の論文では、60-70%ぐらいの値が出ている。
- https://www.ecmlpkdd2019.org/downloads/paper/400.pdf