In [65]:
#!/usr/bin/python
import csv
import json
import os
import pickle
import time
from collections import defaultdict

import numpy as np
import redis
import scipy.sparse
import sklearn.utils
import xgboost as xgb
### simple example
# load file from text file, also binary buffer generated by xgboost
# dtrain = xgb.DMatrix('./demo/rank_aida/mq2008.train')

# dtest = xgb.DMatrix('./demo/rank_aida/mq2008.test')

In [66]:
# Shared Redis client used by the fetch helpers below.
# Connection settings are taken from the environment (REDIS_PASSWORD,
# REDIS_HOST, REDIS_PORT) so credentials do not have to live in the notebook.
# NOTE(review): the literal fallbacks keep the notebook running exactly as
# before, but they still expose the password to anyone who can read this file;
# rotate it and drop the defaults once the environment variables are in place.
conn = redis.Redis(
    password=os.environ.get('REDIS_PASSWORD', 'unswkg'),
    host=os.environ.get('REDIS_HOST', '43.240.97.73'),
    port=int(os.environ.get('REDIS_PORT', '8379')),
    decode_responses=True,
)

def get_feature_results(exp_id, d_t, data_source='aida_conll'):
    """Fetch and decode every stored feature record for one experiment split.

    The records are read through the module-level ``conn`` client from the
    list whose key is ``'result'``, ``exp_id``, ``data_source`` and ``d_t``
    joined by ``':::::'``.  Each list entry is parsed with ``json.loads``.

    Returns:
        A list with one decoded entry per list element, in list order.
    """
    redis_key = ':::::'.join(['result', str(exp_id), data_source, str(d_t)])
    decoded = []
    for raw_entry in conn.lrange(redis_key, 0, -1):
        decoded.append(json.loads(raw_entry))
    return decoded

In [67]:
def get_dataset_info(exp_id, d_t, data_source='aida_conll', can_size=50):
    """Look up four bookkeeping counts for one experiment split.

    Each count is the ``scard`` result (queried through the module-level
    ``conn`` client) of a key made of a fixed prefix followed by ``exp_id``,
    ``data_source`` and ``d_t``, all joined by ``':::::'``.  The
    ``missed_g_candidate`` key additionally ends with ``can_size``.

    Returns:
        ``(valid_mens_size, no_candidate_size, no_g_can_info_size,
        missed_g_candidate_size)`` -- note that this order differs from the
        order in which the keys are queried.
    """
    split_suffix = ':::::'.join([str(exp_id), data_source, str(d_t)])

    def cardinality(prefix, tail=''):
        # prefix:::::exp_id:::::data_source:::::d_t[tail]
        return conn.scard(prefix + ':::::' + split_suffix + tail)

    valid_mens_size = cardinality('valid_qry_ids')
    no_g_can_info_size = cardinality('no_g_candidate_info')
    missed_g_candidate_size = cardinality('missed_g_candidate', ':::::' + str(can_size))
    no_candidate_size = cardinality('no_candidate')
    return valid_mens_size, no_candidate_size, no_g_can_info_size, missed_g_candidate_size

In [68]:
def fetch_all_features(exp_id, data_type, data_source='aida_conll'):
    """Load the feature records of one split as a float64 array.

    Every record returned by ``get_feature_results`` becomes one row.  The
    first field of a record is a string; only the part before the first
    ``', '`` (with leading ``'('`` characters stripped) is kept, and the
    remaining fields are copied unchanged.

    Returns:
        ``numpy.ndarray`` with dtype ``float64``, one row per record.
    """
    rows = []
    for record in get_feature_results(exp_id, data_type, data_source):
        leading_id = record[0].strip('(').split(', ')[0]
        rows.append([leading_id] + record[1:-1] + [record[-1]])
    return np.array(rows, dtype=np.float64)

In [69]:
def trans_data(data):
    """Turn a 2-D feature array into a grouped ``xgb.DMatrix``.

    Column 0 is dropped, the last column is used as the label and the
    columns in between are the features.  Group sizes are derived from the
    labels: each row labelled 1 starts a group that extends to the next row
    labelled 1 (or to the end of the data).  Features, labels and group
    sizes are printed before the matrix is built.

    Returns:
        The ``xgb.DMatrix`` with labels and group sizes set.
    """
    labels = data[:, -1]
    features = data[:, 1:-1]
    print(features)
    print(labels)
    gold_rows = np.where(labels == 1)[0]
    # Start of the following group for every group; the last one ends at len(labels).
    next_starts = np.append(np.delete(gold_rows, 0), len(labels))
    group_sizes = next_starts - gold_rows
    print(group_sizes)
    ranking_matrix = xgb.DMatrix(data=features, label=labels)
    ranking_matrix.set_group(group_sizes)
    return ranking_matrix

In [45]:
def evalerror(preds, dt, d_tal_size):
    """Score a ranking by how often the first row of each group wins outright.

    Groups are recovered from the labels: every row labelled 1 opens a group
    that runs up to the next row labelled 1 (or the end of the data).  Group
    offsets are accumulated from row 0, so the first row is expected to be
    labelled 1.  A group counts as matched when the prediction of its first
    row is strictly greater than every other prediction in the group; ties
    for the top score do not count.

    Args:
        preds: 1-D sequence of predicted scores, aligned with the labels.
        dt: object exposing ``get_label()`` (e.g. an ``xgb.DMatrix``).
        d_tal_size: denominator used for recall.

    Returns:
        ``(matched, precision, recall, f1)`` where ``precision`` is
        ``matched / number of groups`` and ``recall`` is
        ``matched / d_tal_size``.  A ratio whose denominator is zero is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    labels = np.asarray(dt.get_label())
    scores = np.asarray(preds)
    gold_rows = np.flatnonzero(labels == 1)
    # Distance from each gold row to the next one (or to the end of the data).
    group_sizes = np.diff(np.append(gold_rows, len(labels)))

    matched = 0
    start = 0
    for size in group_sizes:
        group_scores = scores[start:start + size]
        top = group_scores[0]
        # Matched only if the first row holds the maximum and no other row ties it.
        if top == group_scores.max() and np.count_nonzero(group_scores == top) == 1:
            matched += 1
        start += size

    n_groups = len(group_sizes)
    precision = float(matched) / n_groups if n_groups else 0.0
    recall = float(matched) / d_tal_size if d_tal_size else 0.0
    total = precision + recall
    # With no matched group both terms are 0; define f1 as 0 rather than divide by zero.
    f1 = 2 * precision * recall / total if total else 0.0
    return matched, precision, recall, f1

In [58]:
def build_doc_fea_vecs_dict(data, doc_men_dict):
    """Collect, per document, the rows of ``data`` that belong to it.

    For every ``(key, ids)`` pair of ``doc_men_dict`` the rows whose first
    column value is contained in ``ids`` are selected, keeping their order in
    ``data``.  The elapsed wall-clock time in seconds is printed at the end.

    Returns:
        dict mapping each key of ``doc_men_dict`` to a 2-D array of its rows
        (an array with zero rows when nothing matches).
    """
    started = time.time()
    rows_by_doc = {
        doc_id: data[np.isin(data[:, 0], mention_ids)]
        for doc_id, mention_ids in doc_men_dict.items()
    }
    print(time.time() - started)
    return rows_by_doc

def data_sampling_boostrapping(data, doc_men_dict, doc_fea_vecs_dict, n_bootstraps=3):
    """Draw document-level bootstrap train/test splits.

    In every round the document ids (keys of ``doc_fea_vecs_dict``) are
    resampled with ``sklearn.utils.resample``, seeded with the round index so
    that rounds differ from each other but are reproducible.  The training
    array concatenates the per-document rows of every drawn id (repeated ids
    contribute their rows repeatedly); the testing array concatenates the
    rows of the documents that were not drawn in that round.

    Args:
        data: not used by this function; kept so existing calls keep working.
        doc_men_dict: not used by this function; kept for the same reason.
        doc_fea_vecs_dict: mapping from document id to a 2-D array of rows,
            as produced by ``build_doc_fea_vecs_dict``.
        n_bootstraps: number of rounds to draw.

    Returns:
        list of ``(mens_training, mens_testing)`` tuples, one per round.
    """
    datas = []
    doc_ids_list = list(doc_fea_vecs_dict.keys())

    print(len(doc_ids_list))
    for n_bootstrap in range(n_bootstraps):
        print("Doing bootstrapping for the {} time...".format(n_bootstrap + 1))
        # The round index is the seed; a leftover `n_bootstrap = 0` used to
        # force seed 0 here, which made every round return the same split.
        doc_id_smps_training = sklearn.utils.resample(doc_ids_list, random_state=n_bootstrap)
        drawn_doc_ids = set(doc_id_smps_training)
        print("Distinct training doc size {}...".format(len(drawn_doc_ids)))
        print("Constructing training mentions...")
        st = time.time()
        mens_training = np.concatenate([doc_fea_vecs_dict[x] for x in doc_id_smps_training])
        print("Finished Constructing training mentions, time cost {}...".format(time.time() - st))
        # Set membership keeps this linear in the number of documents.
        doc_id_smps_testing = [x for x in doc_ids_list if x not in drawn_doc_ids]
        print("Distinct testing doc size {}...".format(len(set(doc_id_smps_testing))))
        st = time.time()
        mens_testing = np.concatenate([doc_fea_vecs_dict[x] for x in doc_id_smps_testing])
        print("Finished Constructing testing mentions, time cost {}...".format(time.time() - st))
        datas.append((mens_training, mens_testing))
    return datas

In [47]:
def read_doc_men_dict_file(dict_filename):
    """Read a two-column CSV into a mapping from first column to float values.

    Every non-empty row must hold exactly two fields ``k, v``; ``float(v)``
    is appended to the list stored under ``k``.  Blank lines are skipped.

    Args:
        dict_filename: path of the CSV file to read.

    Returns:
        ``defaultdict(list)`` mapping each first-column value to the list of
        its second-column values as floats, in file order.

    Raises:
        ValueError: if a non-empty row does not have exactly two fields or
            its second field is not a valid float.
    """
    doc_men_dict = defaultdict(list)
    # newline='' is what the csv module asks for, so that it handles line
    # endings (including newlines inside quoted fields) itself.
    with open(dict_filename, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for a blank line; nothing to record.
                continue
            k, v = row
            doc_men_dict[k].append(float(v))
    return doc_men_dict

In [8]:
# Load the 'train' split of the wiki_uiuc data source as a float64 feature array.
d_train = fetch_all_features(
    exp_id='basic_feature_phrases_tf_idf_entropy_reduced_fea',
    data_type='train',
    data_source='wiki_uiuc',
)

In [48]:
# Bookkeeping counts for the wiki_uiuc 'test' split (default candidate size).
(valid_mens_size,
 no_candidate_size,
 no_g_can_info_size,
 missed_g_candidate_size) = get_dataset_info(
    exp_id='basic_feature_phrases_tf_idf_entropy_reduced_fea',
    d_t='test',
    data_source='wiki_uiuc',
)

In [49]:
# CSV file (relative to the notebook's working directory) that is handed to
# read_doc_men_dict_file in the next cell.
dict_filename = './wiki_uiuc_training_doc_id_men_id.csv'

In [50]:
# Read the CSV mapping, then slice the training array into per-document row blocks.
# The second step was slow in the recorded run (it printed ~408 seconds).
doc_men_dict = read_doc_men_dict_file(dict_filename=dict_filename)
doc_fea_vecs_dict = build_doc_fea_vecs_dict(data=d_train, doc_men_dict=doc_men_dict)

408.1616179943085


NameError: name 'doc_ids_list' is not defined

In [59]:
# data_sampling_boostrapping returns a list with one (train, test) pair per
# bootstrap round, so unpacking its result directly into two names is wrong.
# Only one split is used downstream: draw a single round and take its pair.
d_train_resample, d_test_resample = data_sampling_boostrapping(
    d_train, doc_men_dict, doc_fea_vecs_dict, n_bootstraps=1)[0]

9938
Distinct training doc size 6295...
Constructing training mentions...
Finished Constructing training mentions, time cost 0.13133621215820312...
Distinct testing doc size 3643...
Finished Constructing testing mentions, time cost 0.023510217666625977...


In [63]:
# Wrap both resampled splits as grouped xgboost matrices
# (trans_data prints the features, labels and group sizes of each).
dtrain = trans_data(data=d_train_resample)
dtest = trans_data(data=d_test_resample)

[[  4.34889436e-01   8.00000000e+00   4.34889436e-01 ...,   7.06169561e-04
    8.95117505e-03   8.45491765e-02]
 [  2.87469298e-01   8.00000000e+00   4.34889436e-01 ...,   1.66394036e-03
    1.30014895e-02   1.06551622e-01]
 [  1.03194103e-01   8.00000000e+00   4.34889436e-01 ...,   5.80435908e-03
    3.69313494e-02   1.53816297e-01]
 ..., 
 [  9.97191012e-01   2.00000000e+00   9.97191012e-01 ...,   2.13496451e-03
    1.06748225e-02   6.26477541e-02]
 [  2.80898879e-03   2.00000000e+00   9.97191012e-01 ...,   1.17589842e-03
    1.17589842e-03   3.90625000e-02]
 [  1.00000000e+00   1.00000000e+00   1.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]]
[ 1.  0.  0. ...,  1.  0.  1.]
[8 1 2 ..., 1 2 1]
[[  9.44444418e-01   2.00000000e+00   9.44444418e-01 ...,   2.57422606e-03
    2.00789633e-02   1.86762442e-01]
 [  5.55555560e-02   2.00000000e+00   9.44444418e-01 ...,   9.49621437e-04
    2.37405359e-03   6.94006309e-02]
 [  1.00000000e+00   1.00000000e+00   1.000000

In [19]:
# Hyper-parameter grid used by the search loop below.
# Tree depths tried for every boosting-round setting.
max_depth = [4, 6, 8, 10]
# Boosting-round counts: 100, 500, ..., 4900.
n_estimators = range(100, 5000, 400)

In [20]:
# Grid search over boosting rounds and tree depth for the pairwise ranker.
# evalerror returns (matched_count, precision, recall, f1).  The "acc" figure
# in the recorded output equals matched_count / d_tal_size, i.e. index 2
# (recall), and "corr_num" is the matched count at index 0; the indices used
# before (a[0], a[1]) printed the values under the wrong labels.
# NOTE(review): 18541 and 4485 are presumably the total mention counts of the
# train and test splits -- confirm and move them into named constants.
for x in n_estimators:
    num_round = x
    for dep in max_depth:
        param = {'max_depth':dep, 'eta':0.01, 'silent':0, 'objective':'rank:pairwise', 'min_child_weight':0.01}
        bst = xgb.train(param, dtrain, num_round)
        preds = bst.predict(dtrain)
        a = evalerror(preds, dtrain, 18541)
        print("n_estimators: {}, max_depth: {}, acc_training: {}, corr_num: {}".format(num_round, dep, a[2], a[0]))
        preds = bst.predict(dtest)
        a = evalerror(preds, dtest, 4485)
        print("n_estimators: {}, max_depth: {}, acc_testing: {}, corr_num: {}".format(num_round, dep, a[2], a[0]))
        print("---------------------------------")

n_estimators: 100, max_depth: 4, acc_training: 0.7711558168383582, corr_num: 14298
n_estimators: 100, max_depth: 4, acc_testing: 0.7163879598662207, corr_num: 3213
---------------------------------
n_estimators: 100, max_depth: 6, acc_training: 0.8203441022598565, corr_num: 15210
n_estimators: 100, max_depth: 6, acc_testing: 0.7868450390189521, corr_num: 3529
---------------------------------
n_estimators: 100, max_depth: 8, acc_training: 0.879564209050213, corr_num: 16308
n_estimators: 100, max_depth: 8, acc_testing: 0.8133779264214047, corr_num: 3648
---------------------------------
n_estimators: 100, max_depth: 10, acc_training: 0.9209859230893696, corr_num: 17076
n_estimators: 100, max_depth: 10, acc_testing: 0.8251950947603122, corr_num: 3701
---------------------------------
n_estimators: 500, max_depth: 4, acc_training: 0.8502238282724772, corr_num: 15764
n_estimators: 500, max_depth: 4, acc_testing: 0.8013377926421404, corr_num: 3594
---------------------------------
n_estimat

In [78]:
# Train the final ranker: 4900 boosting rounds at tree depth 6.
num_round = 4900
param = dict(
    max_depth=6,
    eta=0.01,
    silent=0,
    objective='rank:pairwise',
    min_child_weight=0.01,
)
model = xgb.train(param, dtrain, num_round)

In [64]:
# Load the aida_conll train and test feature arrays.
# Note: this rebinds d_train, which earlier cells filled with wiki_uiuc data.
d_train = fetch_all_features(
    exp_id='basic_feature_phrases_tf_idf_entropy_conll_new',
    data_type='train',
    data_source='aida_conll',
)
d_test = fetch_all_features(
    exp_id='basic_feature_phrases_tf_idf_entropy_conll_new',
    data_type='test',
    data_source='aida_conll',
)

KeyboardInterrupt: 

In [79]:
# Load a previously pickled model and report its scores on dtrain / dtest.
model_name = './no_poly_4900_6.mdl'
# The file was written with:
# with open(model_name, 'wb') as f:
#     pickle.dump(model, f)
# NOTE: pickle.load can execute arbitrary code; only load model files you
# created yourself or otherwise trust.
with open(model_name, 'rb') as f:  # context manager closes the handle
    model = pickle.load(f)
# evalerror returns (matched_count, precision, recall, f1): "acc" is
# matched_count / d_tal_size (index 2) and "corr_num" is index 0.
# num_round is only used for the printed label and must have been set by an
# earlier cell.
preds = model.predict(dtrain)
a = evalerror(preds, dtrain, 18541)
print("n_estimators: {}, max_depth: {}, acc_training: {}, corr_num: {}".format(num_round, 6, a[2], a[0]))
preds = model.predict(dtest)
a = evalerror(preds, dtest, 148)
print("n_estimators: {}, max_depth: {}, acc_testing: {}, corr_num: {}".format(num_round, 6, a[2], a[0]))
print("---------------------------------")

n_estimators: 4900, max_depth: 6, acc_training: 0.9743811013429696, corr_num: 18066
n_estimators: 4900, max_depth: 6, acc_testing: 0.4864864864864865, corr_num: 72
---------------------------------


In [89]:
import sklearn.utils

In [90]:
# Index range 0..9937.
# NOTE(review): 9938 equals the document count printed by the bootstrapping
# cell; presumably one index per wiki_uiuc training document -- confirm.
wiki_idxs_all = range(9938)

In [113]:
# Bootstrap sample of the index range.  A fixed random_state makes the draw
# reproducible across kernel restarts (it was unseeded before).
training_idx = sklearn.utils.resample(wiki_idxs_all, random_state=0)

In [117]:
# Scratch: the first five indices, 0..4.
test_idx = range(5)

In [120]:
# Scratch check: summing an empty slice yields 0.
w = list(range(1, 6))

sum(w[:0])

0

In [124]:
# Load the 'test' split of the kore data source (rebinds d_test).
d_test = fetch_all_features(
    exp_id='basic_feature_phrases_tf_idf_entropy_reduced_fea',
    data_type='test',
    data_source='kore',
)

In [146]:
# Print the distinct values held in `a`.
# NOTE(review): `a` must already be bound by a previously executed cell; the
# recorded output (a set of floats) does not correspond to any assignment of
# `a` visible in this notebook -- confirm which cell produced it.
print(set(a))

{1.0, 2.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 16.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 29.0, 31.0, 32.0, 33.0, 34.0, 35.0, 37.0, 38.0, 39.0, 40.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 62.0, 64.0, 65.0, 68.0, 69.0, 71.0, 72.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 81.0, 84.0, 85.0, 86.0, 87.0, 88.0, 90.0, 91.0, 92.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 104.0, 105.0, 106.0, 110.0, 112.0, 113.0, 114.0, 115.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 130.0, 131.0, 132.0, 133.0, 134.0, 135.0, 136.0, 137.0, 138.0, 139.0, 141.0, 142.0, 143.0, 144.0, 145.0, 146.0, 147.0}


In [136]:
# Scratch check: a comprehension over range(0, 2) displays [0, 1].
[x for x in range(0, 2)]

[0, 1]

In [147]:
# Scratch input: two (key, value) pairs that share the key 1.
a = [(1, 2), (1, 3)]

In [153]:
from collections import defaultdict
# Group the second element of every pair in `a` under its first element;
# defaultdict(list) creates the empty list on first access to a key.
d = defaultdict(list)
for k, v in a:
    d[k].append(v)

In [155]:
import csv
# Scratch: print every row of a CSV file.
# NOTE(review): this is an absolute, machine-specific path; the recorded run
# of this cell failed with FileNotFoundError.  Point it at a file that ships
# with the notebook (or a configurable data directory) before re-running.
file = '/Users/dzs/Desktop/doc_id2men_id.csv'

# newline='' lets the csv module do its own line-ending handling, as its
# documentation asks for.
with open(file, 'r', newline='') as f:
    csv_reader = csv.reader(f)
    for item in csv_reader:
        print(item)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/dzs/Desktop/doc_id2men_id.csv'

In [219]:
import numpy
# Scratch: allocate a 5x2 array without initialising its contents
# (the values are whatever happens to be in memory).
a = numpy.empty((5, 2))

In [220]:
# Scratch: rebind `a` to a plain list.
# NOTE(review): the array shown as this cell's recorded output cannot come
# from this assignment; it looks like stale output from an earlier edit.
a = [1, 2, 3, 4]

array([[  4.32801506e-321,   4.33295571e-321],
       [  4.33789637e-321,   4.34283703e-321],
       [  4.34777768e-321,   4.35271834e-321],
       [  4.35765900e-321,   4.36259965e-321],
       [  4.36754031e-321,   4.37248097e-321]])

In [231]:
# Display the installed NumPy version string.
np.__version__

'1.11.3'

In [32]:
# Scratch inputs for the concatenate check below: a 2x2 array, a 1x2 array,
# and a list holding both.
a = np.array([[1, 2], [5, 6]])
b = np.array([[3, 4]])
c = [a, b]

In [33]:
# Display `c` (the list of the two arrays).
c

[array([[1, 2],
        [5, 6]]), array([[3, 4]])]

In [34]:
# Scratch check: stack the arrays in `c` along axis 0 (rows); arrays with
# different row counts but the same column count combine into one 2-D array.
np.concatenate(c, axis=0)

array([[1, 2],
       [5, 6],
       [3, 4]])