In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
from tqdm.auto import tqdm

In [3]:
sub = pd.read_csv("lgb_sub_record_id.csv")
train = pd.read_csv("../dataset/original/train.csv", escapechar="\\")
test = pd.read_csv("../dataset/original/test.csv", escapechar="\\")

In [5]:
sub.ordered_scores = [eval(x) for x in sub.ordered_scores]
sub.ordered_record = [eval(x) for x in sub.ordered_record]

In [18]:
sub = pd.read_csv("restricted_df.csv")

In [6]:
# Get :10
sub['predicted_record_id'] = [x[:10] for x in sub.ordered_record]

In [7]:
test['linked_id'] = test.record_id.str.split("-")
test['linked_id'] = test.linked_id.apply(lambda x: x[0])
test['linked_id'] = test.linked_id.astype(int)
train['linked_id'] = train.linked_id.astype(int)

In [8]:
test

Unnamed: 0,record_id,name,type,address,phone,email,modification,linked_id
0,10051937-TST-MR,CHEYHAM CORPORATION LIMITED,entity,,1.986537e+10,sales@protonmail.com,move unique,10051937
1,10070762-TST-MR,MEABIZ LIMITED,entity,,3.374033e+11,,move unique,10070762
2,10000304-TST-MR,CHAMPLE INVESTMENTS LTD.,entity,,4.142094e+12,,move unique,10000304
3,10151607-TST-MR,Concept I Media and Technology Company Limited,entity,,1.480354e+10,,move unique,10151607
4,10124701-TST-MR,ARRA RESOURCES LIMITED,entity,,4.207630e+11,,move unique,10124701
5,10140018-TST-MR,RONCO INVESTMENT ESTATES CORP.,entity,,3.326372e+11,,move unique,10140018
6,10012709-TST-MR,MIREN ENTERPRISES S.A.,entity,,,MIRENENTERPRISESSA@outlook.it,move unique,10012709
7,10127252-TST-MR,WHITE CONCEPT GROUP LIMITED,entity,,1.801344e+10,,move unique,10127252
8,12167036-TST-MR,ZHANG ZI TONG,officer,No. 102; Unit 11; 5th Floor; Tu Er hu tong; Do...,4.209771e+11,ZHANGZITONG@icloud.de,move unique,12167036
9,10088906-TST-MR,FRESHCAM CORP.,entity,,,,move unique,10088906


In [9]:
def recall_at_k(resultTable : pd.DataFrame, trainingData: pd.DataFrame, testingData: pd.DataFrame) -> dict:
    """
    Given a list of K predictions for each query, first retrieve the correct ID from the test data,
    then look in the training data the percentage of records that have been successfully identified.
    
    For example, given query "1234-M", first retrieve the correct ID "1234" from the test data,
    then obtain from the training data all records that refer to "1234", 
    and finally look how many of them we have found;
    """
    
    # Obtain all the predictions for each record in the test set;
    perQueryRecords = resultTable.groupby("queried_record_id")
    
    # Group training records by their LinkedID truth value;
    groupedTrainingRecords = trainingData.groupby("linked_id")

    totalRecall = 0.0

    allRecords = dict()
    
    start = time.time()
    for i, (queriedRecordID, group) in enumerate(perQueryRecords):
        if i % 1000 == 0 and i > 0:
            print(f"processed {i}/{len(perQueryRecords)} records, {100 * i / len(perQueryRecords):.2f}%")
            print(f"\tcurrent recall: {(totalRecall / i):.2f}")
            print(f"\ttime elapsed: {(time.time() - start):.2f} s")
        
        try:
            queriedLinkedID = testingData.loc[queriedRecordID, "linked_id"]
        except IndexError:
            raise IndexError(f"ID {queriedRecordID} not found in testing data!")
        
        try:
            allRelevantRecords = set(groupedTrainingRecords.get_group(queriedLinkedID).index.values)
        except KeyError:
            allRelevantRecords = set()
        setPredictedRecords = set(group["predicted_record_id"])
        selectedRelevantRecords = setPredictedRecords.intersection(allRelevantRecords)
        recall = 1
        if (len(allRelevantRecords) > 0):
            recall = len(selectedRelevantRecords) / len(allRelevantRecords)

        totalRecall += recall
        allRecords[queriedRecordID] = [queriedRecordID, recall, len(selectedRelevantRecords), len(allRelevantRecords)]
    
    # Store the results in a summary table;
    result_table =  pd.DataFrame.from_dict(
                        allRecords,
                        orient='index',
                        columns=["QueriedRecordID", "Recall@K", "SelectedRecords", "AllRelevantRecords"]
                    )
    # Compute the filtered recall, which considers only queries with at least one relevant record in the training data;
    queries_with_relevant_records = result_table[result_table["AllRelevantRecords"] > 0]
    filtered_recall = np.mean(queries_with_relevant_records["SelectedRecords"] / queries_with_relevant_records["AllRelevantRecords"])

    return {
            "AverageRecall" : totalRecall / len(perQueryRecords),
            "AverageFilteredRecall": filtered_recall,
            "perQueryResult" : result_table
            }
    
def precision_at_k(resultTable : pd.DataFrame, trainingData: pd.DataFrame, testingData: pd.DataFrame) -> dict:
    """
    Given a list of K predictions for each query, first retrieve the correct ID from the test data,
    then look in the training data the percentage of records that are actually relevant;
    
    For example, given query "1234-M", first retrieve the correct ID "1234" from the test data,
    then obtain from the training data all records that refer to "1234", 
    and finally look how many of the records we have found are actually referring to "1234"
    """
    
    # Obtain all the predictions for each record in the test set;
    perQueryRecords = resultTable.groupby("queried_record_id")
    
    # Group training records by their LinkedID truth value;
    groupedTrainingRecords = trainingData.groupby("linked_id")

    totalPrecision = 0.0
    numberOfPredictionsForRelevantRecords = 0

    allRecords = dict()
    
    start = time.time()
    for i, (queriedRecordID, group) in enumerate(perQueryRecords):
        if i % 1000 == 0 and i > 0:
            print(f"processed {i}/{len(perQueryRecords)} records, {100 * i / len(perQueryRecords):.2f}%")
            print(f"\tcurrent precision: {(totalPrecision / i):.2f}")
            print(f"\ttime elapsed: {(time.time() - start):.2f} s")
        
        try:
            queriedLinkedID = testingData.loc[queriedRecordID, "linked_id"]
        except IndexError:
            raise IndexError(f"ID {queriedRecordID} not found in testing data!")
        
        try:
            allRelevantRecords = set(groupedTrainingRecords.get_group(queriedLinkedID).index.values)
        except KeyError:
            allRelevantRecords = set()
        setPredictedRecords = set(group["predicted_record_id"])
        selectedRelevantRecords = setPredictedRecords.intersection(allRelevantRecords)
        precision = 1
        if (len(allRelevantRecords) > 0):
            precision = len(selectedRelevantRecords) / len(setPredictedRecords)
            numberOfPredictionsForRelevantRecords += len(setPredictedRecords)

        totalPrecision += precision
        allRecords[queriedRecordID] = [queriedRecordID, precision, len(selectedRelevantRecords), len(allRelevantRecords)]
    
    # Store the results in a summary table;
    result_table =  pd.DataFrame.from_dict(
                        allRecords,
                        orient='index',
                        columns=["QueriedRecordID", "Precision@K", "SelectedRecords", "AllRelevantRecords"]
                    )
    # Compute the filtered recall, which considers only queries with at least one relevant record in the training data;
    queries_with_relevant_records = result_table[result_table["AllRelevantRecords"] > 0]
    filtered_precision = np.mean(queries_with_relevant_records["SelectedRecords"] / numberOfPredictionsForRelevantRecords)

    return {
            "AveragePrecision" : totalPrecision / len(perQueryRecords),
            "AverageFilteredPrecision": filtered_precision,
            "perQueryResult" : result_table
            }   


In [10]:
def expand_df(df):
    df.predicted_record_id = df.predicted_record_id.str.split(" ")
    new_df = []
    for (q,p) in tqdm(zip(df.queried_record_id, df.predicted_record_id)):
        for x in p:
            new_df.append((q,x))
            
    new_df = pd.DataFrame(new_df, columns=['queried_record_id', 'predicted_record_id'])
    return new_df

def expand_lists(df):
    new_df = []
    for (q,p) in tqdm(zip(df.queried_record_id, df.predicted_record_id)):
        for x in p:
            new_df.append((q,x))
            
    new_df = pd.DataFrame(new_df, columns=['queried_record_id', 'predicted_record_id'])
    return new_df

In [11]:
#sub = expand_df(sub)
sub = expand_lists(sub[['queried_record_id', 'predicted_record_id']])
sub

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,queried_record_id,predicted_record_id
0,10000003-TST-MR,10127532-M1
1,10000003-TST-MR,10127532
2,10000003-TST-MR,10127532-M0
3,10000003-TST-MR,10127532-NV2
4,10000003-TST-MR,10127532-M2
5,10000003-TST-MR,10127532-NV1
6,10000003-TST-MR,10127532-NV0
7,10000003-TST-MR,10144478-M2
8,10000003-TST-MR,10144478-M1
9,10000003-TST-MR,10144478-M0


# Recall e precision con i LINKED_ID

In [9]:
recall_at_k(sub, train.set_index('record_id'), test.set_index('record_id'))

processed 1000/266955 records, 0.37%
	current recall: 0.42
	time elapsed: 8.06 s
processed 2000/266955 records, 0.75%
	current recall: 0.43
	time elapsed: 8.36 s
processed 3000/266955 records, 1.12%
	current recall: 0.43
	time elapsed: 8.67 s
processed 4000/266955 records, 1.50%
	current recall: 0.43
	time elapsed: 8.98 s
processed 5000/266955 records, 1.87%
	current recall: 0.43
	time elapsed: 9.28 s
processed 6000/266955 records, 2.25%
	current recall: 0.43
	time elapsed: 9.58 s
processed 7000/266955 records, 2.62%
	current recall: 0.43
	time elapsed: 9.88 s
processed 8000/266955 records, 3.00%
	current recall: 0.43
	time elapsed: 10.18 s
processed 9000/266955 records, 3.37%
	current recall: 0.44
	time elapsed: 10.48 s
processed 10000/266955 records, 3.75%
	current recall: 0.44
	time elapsed: 10.78 s
processed 11000/266955 records, 4.12%
	current recall: 0.44
	time elapsed: 11.08 s
processed 12000/266955 records, 4.50%
	current recall: 0.44
	time elapsed: 11.39 s
processed 13000/2669

processed 195000/266955 records, 73.05%
	current recall: 0.44
	time elapsed: 67.28 s
processed 196000/266955 records, 73.42%
	current recall: 0.44
	time elapsed: 67.60 s
processed 197000/266955 records, 73.80%
	current recall: 0.44
	time elapsed: 67.89 s
processed 198000/266955 records, 74.17%
	current recall: 0.44
	time elapsed: 68.21 s
processed 199000/266955 records, 74.54%
	current recall: 0.45
	time elapsed: 68.51 s
processed 200000/266955 records, 74.92%
	current recall: 0.45
	time elapsed: 68.80 s
processed 201000/266955 records, 75.29%
	current recall: 0.45
	time elapsed: 69.11 s
processed 202000/266955 records, 75.67%
	current recall: 0.45
	time elapsed: 69.42 s
processed 203000/266955 records, 76.04%
	current recall: 0.45
	time elapsed: 69.72 s
processed 204000/266955 records, 76.42%
	current recall: 0.45
	time elapsed: 70.05 s
processed 205000/266955 records, 76.79%
	current recall: 0.45
	time elapsed: 70.35 s
processed 206000/266955 records, 77.17%
	current recall: 0.45
	ti

{'AverageRecall': 0.4441422711692982,
 'AverageFilteredRecall': 0.0,
 'perQueryResult':                          QueriedRecordID  Recall@K  SelectedRecords  \
 10000003-TST-MR          10000003-TST-MR       1.0                0   
 10000008-TST-M            10000008-TST-M       1.0                0   
 10000010-TST-CP          10000010-TST-CP       0.0                0   
 10000013-TST-MR          10000013-TST-MR       1.0                0   
 10000016-TST-MR          10000016-TST-MR       1.0                0   
 10000017-TST-MR          10000017-TST-MR       1.0                0   
 10000018-T4-TST-CP    10000018-T4-TST-CP       0.0                0   
 10000018-TST-M            10000018-TST-M       0.0                0   
 10000020-NV0-TST-CP  10000020-NV0-TST-CP       0.0                0   
 10000020-NV1-TST-M    10000020-NV1-TST-M       0.0                0   
 10000022-TST-CP          10000022-TST-CP       0.0                0   
 10000023-NV2-TST-CP  10000023-NV2-TST-CP       0

In [15]:
precision_at_k(sub, train.set_index('record_id'), test.set_index('record_id'))

processed 1000/266955 records, 0.37%
	current precision: 0.51
	time elapsed: 9.87 s
processed 2000/266955 records, 0.75%
	current precision: 0.53
	time elapsed: 10.35 s
processed 3000/266955 records, 1.12%
	current precision: 0.52
	time elapsed: 10.84 s
processed 4000/266955 records, 1.50%
	current precision: 0.53
	time elapsed: 11.34 s
processed 5000/266955 records, 1.87%
	current precision: 0.53
	time elapsed: 11.88 s
processed 6000/266955 records, 2.25%
	current precision: 0.53
	time elapsed: 12.41 s
processed 7000/266955 records, 2.62%
	current precision: 0.53
	time elapsed: 12.91 s
processed 8000/266955 records, 3.00%
	current precision: 0.53
	time elapsed: 13.44 s
processed 9000/266955 records, 3.37%
	current precision: 0.53
	time elapsed: 13.93 s
processed 10000/266955 records, 3.75%
	current precision: 0.53
	time elapsed: 14.39 s
processed 11000/266955 records, 4.12%
	current precision: 0.53
	time elapsed: 14.86 s
processed 12000/266955 records, 4.50%
	current precision: 0.53
	

processed 97000/266955 records, 36.34%
	current precision: 0.54
	time elapsed: 54.65 s
processed 98000/266955 records, 36.71%
	current precision: 0.54
	time elapsed: 55.26 s
processed 99000/266955 records, 37.08%
	current precision: 0.54
	time elapsed: 55.79 s
processed 100000/266955 records, 37.46%
	current precision: 0.54
	time elapsed: 56.51 s
processed 101000/266955 records, 37.83%
	current precision: 0.54
	time elapsed: 57.24 s
processed 102000/266955 records, 38.21%
	current precision: 0.54
	time elapsed: 57.76 s
processed 103000/266955 records, 38.58%
	current precision: 0.54
	time elapsed: 58.24 s
processed 104000/266955 records, 38.96%
	current precision: 0.54
	time elapsed: 58.69 s
processed 105000/266955 records, 39.33%
	current precision: 0.54
	time elapsed: 59.13 s
processed 106000/266955 records, 39.71%
	current precision: 0.54
	time elapsed: 59.57 s
processed 107000/266955 records, 40.08%
	current precision: 0.54
	time elapsed: 60.03 s
processed 108000/266955 records, 40

processed 192000/266955 records, 71.92%
	current precision: 0.54
	time elapsed: 98.45 s
processed 193000/266955 records, 72.30%
	current precision: 0.54
	time elapsed: 98.90 s
processed 194000/266955 records, 72.67%
	current precision: 0.54
	time elapsed: 99.36 s
processed 195000/266955 records, 73.05%
	current precision: 0.54
	time elapsed: 99.81 s
processed 196000/266955 records, 73.42%
	current precision: 0.54
	time elapsed: 100.27 s
processed 197000/266955 records, 73.80%
	current precision: 0.54
	time elapsed: 100.69 s
processed 198000/266955 records, 74.17%
	current precision: 0.54
	time elapsed: 101.14 s
processed 199000/266955 records, 74.54%
	current precision: 0.54
	time elapsed: 101.58 s
processed 200000/266955 records, 74.92%
	current precision: 0.54
	time elapsed: 102.01 s
processed 201000/266955 records, 75.29%
	current precision: 0.54
	time elapsed: 102.46 s
processed 202000/266955 records, 75.67%
	current precision: 0.54
	time elapsed: 102.91 s
processed 203000/266955 r

{'AveragePrecision': 0.5366797774906006,
 'AverageFilteredPrecision': 1.1218955756691615e-06,
 'perQueryResult':                          QueriedRecordID  Precision@K  SelectedRecords  \
 10000003-TST-MR          10000003-TST-MR         1.00                0   
 10000008-TST-M            10000008-TST-M         1.00                0   
 10000010-TST-CP          10000010-TST-CP         0.05                1   
 10000013-TST-MR          10000013-TST-MR         1.00                0   
 10000016-TST-MR          10000016-TST-MR         1.00                0   
 10000017-TST-MR          10000017-TST-MR         1.00                0   
 10000018-T4-TST-CP    10000018-T4-TST-CP         0.25                5   
 10000018-TST-M            10000018-TST-M         0.25                5   
 10000020-NV0-TST-CP  10000020-NV0-TST-CP         0.15                3   
 10000020-NV1-TST-M    10000020-NV1-TST-M         0.15                3   
 10000022-TST-CP          10000022-TST-CP         0.05         

In [None]:
import sys
sys.path.append("../")

# What is wrong?

In [16]:
len(set(test.linked_id) - set(train.linked_id))

118112

In [39]:
only_test = set(test.linked_id) - set(train.linked_id)

In [17]:
len(set(test.record_id) - set(train.record_id))

264414

In [19]:
sub_l = pd.read_csv("xgb_sub10.csv")
sub_l.predicted_record_id = sub_l.predicted_record_id.str.split(" ")

In [20]:
sub_l = sub_l.merge(test[['record_id', 'linked_id']], how='left', left_on='queried_record_id', right_on='record_id')

In [22]:
sub_l = sub_l.drop('record_id', axis=1)

In [31]:
sub_l['linked_id'] = sub_l.linked_id.astype(int)
int_col = []
for i in sub_l.predicted_record_id:
    int_col.append([int(x) for x in i])
sub_l['predicted_record_id'] = int_col

In [32]:
isin_rec = []
for (p, l) in tqdm(zip(sub_l.predicted_record_id, sub_l.linked_id)):
    if l in (p):
        isin_rec.append(1)
    else:
        isin_rec.append(0)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [33]:
sub_l['isin_rec'] = isin_rec

In [34]:
sub_l

Unnamed: 0,queried_record_id,predicted_record_id,linked_id,isin_rec
0,10000003-TST-MR,"[10010930, 10010930, 10010930, 10010930, 10010...",10000003,0
1,10000008-TST-M,"[10043172, 10043172, 10043172, 10107096, 10079...",10000008,0
2,10000010-TST-CP,"[10000010, 10192342, 10099279, 10099279, 12194...",10000010,1
3,10000013-TST-MR,"[10153043, 10147826, 10147826, 10147826, 10147...",10000013,0
4,10000016-TST-MR,"[10147072, 10147072, 10012410, 10104833, 10113...",10000016,0
5,10000017-TST-MR,"[10094323, 10094323, 10062221, 10062221, 10062...",10000017,0
6,10000018-T4-TST-CP,"[10000018, 10000018, 10000018, 10000018, 10000...",10000018,1
7,10000018-TST-M,"[10000018, 10000018, 10000018, 10000018, 10000...",10000018,1
8,10000020-NV0-TST-CP,"[10000020, 10000020, 10000020, 10151492, 10094...",10000020,1
9,10000020-NV1-TST-M,"[10000020, 10000020, 10000020, 10151492, 10094...",10000020,1


In [37]:
wrong_rec = sub_l[sub_l.isin_rec == 0].queried_record_id.values
wrong_rec

array(['10000003-TST-MR', '10000008-TST-M', '10000013-TST-MR', ...,
       '15005401-TST-MR', '15006601-TST-MR', '15006801-TST-MR'],
      dtype=object)

In [43]:
in_train_test = test[~test.linked_id.isin(only_test)]

In [48]:
wrong_in_train_test = in_train_test[in_train_test.record_id.isin(wrong_rec)]
wrong_in_train_test.name = wrong_in_train_test.name.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [49]:
mask = (wrong_in_train_test.address.isnull()) & (wrong_in_train_test.phone.isnull()) & (wrong_in_train_test.email.isnull())
wrong_in_train_test[mask]

Unnamed: 0,record_id,name,type,address,phone,email,modification,linked_id
106093,10149052-TST-M,kellera group s.a.,entity,,,,move row,10149052
106159,12017082-T3-TST-M,fenchurch trust limited,officer,,,,move row,12017082
106171,12183327-M0-TST-M,led services establishment,officer,,,,move row,12183327
106858,13009625-M2-TST-M,tay yew beng peter,officer,,,,move row,13009625
106876,12024667-TST-M,the bearer,officer,,,,move row,12024667
107912,12182451-M0-TST-M,minerva services limited,officer,,,,move row,12182451
108042,10054881-T0-TST-M,fowpe inc.,entity,,,,move row,10054881
108191,12159348-M1-TST-M,cannon nominees limited,officer,,,,move row,12159348
108200,10097444-T0-TST-M,smuaqe max holdings limitcd,entity,,,,move row,10097444
108527,12211818-M1-TST-M,yee hope investments limited,officer,,,,move row,12211818


In [52]:
wrong_in_train_test[(wrong_in_train_test.name.str.contains('bearer')) & (mask)]

Unnamed: 0,record_id,name,type,address,phone,email,modification,linked_id
106876,12024667-TST-M,the bearer,officer,,,,move row,12024667
110114,12066454-NV2-TST-M,bearer,officer,,,,move row,12066454
110375,12059186-M1-TST-M,to the bearer,officer,,,,move row,12059186
125765,12031088-T1-TST-M,the bearer,officer,,,,move row,12031088
125887,12086984-M1-TST-M,bearer,officer,,,,move row,12086984
128899,12025501-M1-TST-M,bearer,officer,,,,move row,12025501
129012,12073071-T2-TST-M,bearer,officer,,,,move row,12073071
132960,12106494-M2-TST-M,bearer,officer,,,,move row,12106494
140197,12084118-TST-M,bearer,officer,,,,move row,12084118
140282,12014031-M1-TST-M,bearer,officer,,,,move row,12014031


In [56]:
only_in_test = test[test.linked_id.isin(only_test)]
only_in_test.name = only_in_test.name.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [60]:
mask2= (only_in_test.address.isnull()) & (only_in_test.phone.isnull()) & (only_in_test.email.isnull())
only_in_test[mask2 & (only_in_test.name.str.contains('bearer'))]

Unnamed: 0,record_id,name,type,address,phone,email,modification,linked_id
1460,12088351-TST-MR,the bearer,officer,,,,move unique,12088351
2590,12028515-TST-MR,the bearer:agent intersun limited,officer,,,,move unique,12028515
4734,12048724-TST-MR,bearer,officer,,,,move unique,12048724
6245,12048553-TST-MR,bearer,officer,,,,move unique,12048553
8778,12025496-TST-MR,"the, bearer",officer,,,,move unique,12025496
11066,12013302-TST-MR,bearer,officer,,,,move unique,12013302
11576,12044178-TST-MR,bearer,officer,,,,move unique,12044178
18921,12059343-TST-MR,to the bearer,officer,,,,move unique,12059343
19917,12042656-TST-MR,bearer,officer,,,,move unique,12042656
21179,12038063-TST-MR,bearer,officer,,,,move unique,12038063


In [None]:
# TODO guardare lo score che viene fuori da quelli che sbagliamo

In [99]:
df_pred = pd.read_csv("test_xgb_10_scores.csv")

In [100]:
df_pred

Unnamed: 0,queried_record_id,predicted_record_id,predicted_record_id_record,cosine_score,name_cosine,email_cosine,phone_cosine,linked_id_idx,editdistance,jaro_winkler,...,name_popularity,null_address,perc_non_null_address,null_email,perc_non_null_email,null_phone,perc_non_null_phone,phone_popularity,test_name_length,predictions
0,10000003-TST-MR,10010930,10010930-NV0,0.584588,0.584588,0.0,0.0,19656,14,0.687500,...,1,4,33,6,0,0,100,1,18,-4.530108
1,10000003-TST-MR,10010930,10010930,0.584588,0.584588,0.0,0.0,19655,14,0.687500,...,1,4,33,6,0,0,100,1,18,-4.530108
2,10000003-TST-MR,10010930,10010930-NV1,0.584588,0.584588,0.0,0.0,19657,14,0.687500,...,1,4,33,6,0,0,100,1,18,-4.530108
3,10000003-TST-MR,10010930,10010930-T1,0.584588,0.584588,0.0,0.0,19659,14,0.687500,...,1,4,33,6,0,0,100,1,18,-4.530108
4,10000003-TST-MR,10010930,10010930-T2,0.566679,0.566679,0.0,0.0,19660,14,0.706019,...,1,4,33,6,0,0,100,1,18,-4.530108
5,10000003-TST-MR,10010930,10010930-T0,0.533263,0.533263,0.0,0.0,19658,14,0.687500,...,1,4,33,6,0,0,100,1,18,-4.530108
6,10000003-TST-MR,10027883,10027883,0.418555,0.418555,0.0,0.0,49967,12,0.682828,...,1,2,0,2,0,2,0,1,18,-4.608578
7,10000003-TST-MR,10051985,10051985,0.399978,0.399978,0.0,0.0,92346,11,0.582621,...,1,1,0,0,100,1,0,1,18,-4.688859
8,10000003-TST-MR,10173968,10173968-T1,0.399978,0.399978,0.0,0.0,309898,11,0.582621,...,1,3,0,0,100,0,100,1,18,-4.610390
9,10000003-TST-MR,10173968,10173968-T0,0.399978,0.399978,0.0,0.0,309897,11,0.582621,...,1,3,0,0,100,0,100,1,18,-4.610390


In [101]:
df_pred['linked_id'] = df_pred.queried_record_id.str.split("-")
df_pred.linked_id = df_pred.linked_id.apply(lambda x: x[0])

In [102]:
df_pred

Unnamed: 0,queried_record_id,predicted_record_id,predicted_record_id_record,cosine_score,name_cosine,email_cosine,phone_cosine,linked_id_idx,editdistance,jaro_winkler,...,null_address,perc_non_null_address,null_email,perc_non_null_email,null_phone,perc_non_null_phone,phone_popularity,test_name_length,predictions,linked_id
0,10000003-TST-MR,10010930,10010930-NV0,0.584588,0.584588,0.0,0.0,19656,14,0.687500,...,4,33,6,0,0,100,1,18,-4.530108,10000003
1,10000003-TST-MR,10010930,10010930,0.584588,0.584588,0.0,0.0,19655,14,0.687500,...,4,33,6,0,0,100,1,18,-4.530108,10000003
2,10000003-TST-MR,10010930,10010930-NV1,0.584588,0.584588,0.0,0.0,19657,14,0.687500,...,4,33,6,0,0,100,1,18,-4.530108,10000003
3,10000003-TST-MR,10010930,10010930-T1,0.584588,0.584588,0.0,0.0,19659,14,0.687500,...,4,33,6,0,0,100,1,18,-4.530108,10000003
4,10000003-TST-MR,10010930,10010930-T2,0.566679,0.566679,0.0,0.0,19660,14,0.706019,...,4,33,6,0,0,100,1,18,-4.530108,10000003
5,10000003-TST-MR,10010930,10010930-T0,0.533263,0.533263,0.0,0.0,19658,14,0.687500,...,4,33,6,0,0,100,1,18,-4.530108,10000003
6,10000003-TST-MR,10027883,10027883,0.418555,0.418555,0.0,0.0,49967,12,0.682828,...,2,0,2,0,2,0,1,18,-4.608578,10000003
7,10000003-TST-MR,10051985,10051985,0.399978,0.399978,0.0,0.0,92346,11,0.582621,...,1,0,0,100,1,0,1,18,-4.688859,10000003
8,10000003-TST-MR,10173968,10173968-T1,0.399978,0.399978,0.0,0.0,309898,11,0.582621,...,3,0,0,100,0,100,1,18,-4.610390,10000003
9,10000003-TST-MR,10173968,10173968-T0,0.399978,0.399978,0.0,0.0,309897,11,0.582621,...,3,0,0,100,0,100,1,18,-4.610390,10000003


In [109]:
df_pred.linked_id = df_pred.linked_id.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [103]:
df_pred = df_pred[['queried_record_id', 'predicted_record_id', 'linked_id', 'predictions']]

In [113]:
df_pred_only_test = df_pred[df_pred.linked_id.isin(only_test)]

In [116]:
df_pred_only_test[df_pred_only_test.predictions > 0].drop_duplicates('queried_record_id')

Unnamed: 0,queried_record_id,predicted_record_id,linked_id,predictions
20,10000008-TST-M,10043172,10000008,2.358939
80,10000016-TST-MR,10147072,10000016,0.212162
240,10000024-TST-MR,10100649,10000024,2.981995
260,10000025-TST-MR,10038568,10000025,3.443136
360,10000030-TST-MR,10037339,10000030,2.981995
440,10000034-TST-M,10071366,10000034,0.177727
544,10000037-TST-MR,10115101,10000037,0.718337
620,10000041-TST-MR,10057787,10000041,2.886820
640,10000042-TST-MR,10214239,10000042,3.538311
661,10000043-TST-MR,10029219,10000043,0.177727


In [120]:
sub_local = sub
sub_local.predicted_record_id = sub_local.queried_record_id.str.split("-")
sub_local.predicted_record_id = sub_local.predicted_record_id.apply(lambda x: x[0])
sub_local

Unnamed: 0,queried_record_id,predicted_record_id
0,10000003-TST-MR,10000003
1,10000003-TST-MR,10000003
2,10000003-TST-MR,10000003
3,10000003-TST-MR,10000003
4,10000003-TST-MR,10000003
5,10000003-TST-MR,10000003
6,10000003-TST-MR,10000003
7,10000003-TST-MR,10000003
8,10000003-TST-MR,10000003
9,10000003-TST-MR,10000003


In [121]:
precision_at_k(sub_local, train.set_index('record_id'), test.set_index('record_id'))

processed 1000/266955 records, 0.37%
	current precision: 0.90
	time elapsed: 10.10 s
processed 2000/266955 records, 0.75%
	current precision: 0.91
	time elapsed: 10.57 s
processed 3000/266955 records, 1.12%
	current precision: 0.91
	time elapsed: 11.04 s
processed 4000/266955 records, 1.50%
	current precision: 0.91
	time elapsed: 11.55 s
processed 5000/266955 records, 1.87%
	current precision: 0.91
	time elapsed: 12.03 s
processed 6000/266955 records, 2.25%
	current precision: 0.91
	time elapsed: 12.50 s
processed 7000/266955 records, 2.62%
	current precision: 0.91
	time elapsed: 12.97 s
processed 8000/266955 records, 3.00%
	current precision: 0.91
	time elapsed: 13.47 s
processed 9000/266955 records, 3.37%
	current precision: 0.91
	time elapsed: 13.94 s
processed 10000/266955 records, 3.75%
	current precision: 0.91
	time elapsed: 14.49 s
processed 11000/266955 records, 4.12%
	current precision: 0.91
	time elapsed: 15.20 s
processed 12000/266955 records, 4.50%
	current precision: 0.91


processed 97000/266955 records, 36.34%
	current precision: 0.91
	time elapsed: 67.30 s
processed 98000/266955 records, 36.71%
	current precision: 0.91
	time elapsed: 67.77 s
processed 99000/266955 records, 37.08%
	current precision: 0.91
	time elapsed: 68.29 s
processed 100000/266955 records, 37.46%
	current precision: 0.91
	time elapsed: 68.79 s
processed 101000/266955 records, 37.83%
	current precision: 0.91
	time elapsed: 69.26 s
processed 102000/266955 records, 38.21%
	current precision: 0.91
	time elapsed: 69.75 s
processed 103000/266955 records, 38.58%
	current precision: 0.91
	time elapsed: 70.24 s
processed 104000/266955 records, 38.96%
	current precision: 0.91
	time elapsed: 70.76 s
processed 105000/266955 records, 39.33%
	current precision: 0.91
	time elapsed: 71.24 s
processed 106000/266955 records, 39.71%
	current precision: 0.91
	time elapsed: 71.70 s
processed 107000/266955 records, 40.08%
	current precision: 0.91
	time elapsed: 72.16 s
processed 108000/266955 records, 40

processed 190000/266955 records, 71.17%
	current precision: 0.91
	time elapsed: 117.92 s
processed 191000/266955 records, 71.55%
	current precision: 0.91
	time elapsed: 118.61 s
processed 192000/266955 records, 71.92%
	current precision: 0.91
	time elapsed: 119.40 s
processed 193000/266955 records, 72.30%
	current precision: 0.91
	time elapsed: 120.20 s
processed 194000/266955 records, 72.67%
	current precision: 0.91
	time elapsed: 121.37 s
processed 195000/266955 records, 73.05%
	current precision: 0.91
	time elapsed: 121.96 s
processed 196000/266955 records, 73.42%
	current precision: 0.91
	time elapsed: 122.49 s
processed 197000/266955 records, 73.80%
	current precision: 0.91
	time elapsed: 123.17 s
processed 198000/266955 records, 74.17%
	current precision: 0.91
	time elapsed: 124.04 s
processed 199000/266955 records, 74.54%
	current precision: 0.91
	time elapsed: 124.50 s
processed 200000/266955 records, 74.92%
	current precision: 0.91
	time elapsed: 125.34 s
processed 201000/2669

{'AveragePrecision': 0.9087074600588114,
 'AverageFilteredPrecision': 5.632242003740936e-06,
 'perQueryResult':                          QueriedRecordID  Precision@K  SelectedRecords  \
 10000003-TST-MR          10000003-TST-MR          1.0                0   
 10000008-TST-M            10000008-TST-M          1.0                0   
 10000010-TST-CP          10000010-TST-CP          1.0                1   
 10000013-TST-MR          10000013-TST-MR          1.0                0   
 10000016-TST-MR          10000016-TST-MR          1.0                0   
 10000017-TST-MR          10000017-TST-MR          1.0                0   
 10000018-T4-TST-CP    10000018-T4-TST-CP          0.0                0   
 10000018-TST-M            10000018-TST-M          0.0                0   
 10000020-NV0-TST-CP  10000020-NV0-TST-CP          1.0                1   
 10000020-NV1-TST-M    10000020-NV1-TST-M          1.0                1   
 10000022-TST-CP          10000022-TST-CP          1.0          

# Recall e Precision con i RECORD_ID

In [8]:
group_linked = train[['record_id', 'linked_id']].groupby('linked_id').apply(lambda x: list(x['record_id']))
group_linked = group_linked.reset_index().rename(columns={0:'record_id'})
group_linked

Unnamed: 0,linked_id,record_id
0,10000001,[10000001]
1,10000002,"[10000002, 10000002-M1, 10000002-M0]"
2,10000004,"[10000004, 10000004-M0, 10000004-M1]"
3,10000005,[10000005]
4,10000006,[10000006]
5,10000007,[10000007]
6,10000009,"[10000009, 10000009-NV0]"
7,10000010,[10000010]
8,10000011,[10000011]
9,10000012,[10000012]


In [9]:
group_linked.linked_id = group_linked.linked_id.astype(int)

In [10]:
test = test.sort_values(by='record_id')

In [11]:
test.linked_id = test.linked_id.astype(int)

In [12]:
test = test.merge(group_linked, how='left', on='linked_id')

In [13]:
test = test.rename(columns={'record_id_x': 'record_id'})

In [18]:
test_max = test[['record_id', 'linked_id', 'record_id_y']]
test_max

Unnamed: 0,record_id,linked_id,record_id_y
0,10000003-TST-MR,10000003,
1,10000008-TST-M,10000008,
2,10000010-TST-CP,10000010,[10000010]
3,10000013-TST-MR,10000013,
4,10000016-TST-MR,10000016,
5,10000017-TST-MR,10000017,
6,10000018-T4-TST-CP,10000018,"[10000018-T3, 10000018-T0, 10000018-T4, 100000..."
7,10000018-TST-M,10000018,"[10000018-T3, 10000018-T0, 10000018-T4, 100000..."
8,10000020-NV0-TST-CP,10000020,"[10000020, 10000020-NV2, 10000020-NV0]"
9,10000020-NV1-TST-M,10000020,"[10000020, 10000020-NV2, 10000020-NV0]"


In [47]:
new_col = []
for (r, i) in zip(test_max.record_id, test_max.record_id_y):
    if not isinstance(i,list):
        new_col.append([r])
    else:
        new_col.append(i)
        
test_max['predicted_record_id'] = new_col
test_max

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,record_id,linked_id,record_id_y,predicted_record_id
0,10000003-TST-MR,10000003,,[10000003-TST-MR]
1,10000008-TST-M,10000008,,[10000008-TST-M]
2,10000010-TST-CP,10000010,[10000010],[10000010]
3,10000013-TST-MR,10000013,,[10000013-TST-MR]
4,10000016-TST-MR,10000016,,[10000016-TST-MR]
5,10000017-TST-MR,10000017,,[10000017-TST-MR]
6,10000018-T4-TST-CP,10000018,"[10000018-T3, 10000018-T0, 10000018-T4, 100000...","[10000018-T3, 10000018-T0, 10000018-T4, 100000..."
7,10000018-TST-M,10000018,"[10000018-T3, 10000018-T0, 10000018-T4, 100000...","[10000018-T3, 10000018-T0, 10000018-T4, 100000..."
8,10000020-NV0-TST-CP,10000020,"[10000020, 10000020-NV2, 10000020-NV0]","[10000020, 10000020-NV2, 10000020-NV0]"
9,10000020-NV1-TST-M,10000020,"[10000020, 10000020-NV2, 10000020-NV0]","[10000020, 10000020-NV2, 10000020-NV0]"


In [48]:
new_df = []
for (r, p) in tqdm(zip(test_max.record_id, test_max.predicted_record_id)):
    for x in p:
        new_df.append((r,x))

df_new = pd.DataFrame(new_df, columns=['queried_record_id', 'predicted_record_id'])
df_new

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,queried_record_id,predicted_record_id
0,10000003-TST-MR,10000003-TST-MR
1,10000008-TST-M,10000008-TST-M
2,10000010-TST-CP,10000010
3,10000013-TST-MR,10000013-TST-MR
4,10000016-TST-MR,10000016-TST-MR
5,10000017-TST-MR,10000017-TST-MR
6,10000018-T4-TST-CP,10000018-T3
7,10000018-T4-TST-CP,10000018-T0
8,10000018-T4-TST-CP,10000018-T4
9,10000018-T4-TST-CP,10000018-T1


In [49]:
precision_at_k(df_new, train.set_index('record_id'), test.set_index('record_id'))

processed 1000/266955 records, 0.37%
	current precision: 1.00
	time elapsed: 7.65 s
processed 2000/266955 records, 0.75%
	current precision: 1.00
	time elapsed: 7.94 s
processed 3000/266955 records, 1.12%
	current precision: 1.00
	time elapsed: 8.25 s
processed 4000/266955 records, 1.50%
	current precision: 1.00
	time elapsed: 8.55 s
processed 5000/266955 records, 1.87%
	current precision: 1.00
	time elapsed: 8.85 s
processed 6000/266955 records, 2.25%
	current precision: 1.00
	time elapsed: 9.16 s
processed 7000/266955 records, 2.62%
	current precision: 1.00
	time elapsed: 9.46 s
processed 8000/266955 records, 3.00%
	current precision: 1.00
	time elapsed: 9.76 s
processed 9000/266955 records, 3.37%
	current precision: 1.00
	time elapsed: 10.06 s
processed 10000/266955 records, 3.75%
	current precision: 1.00
	time elapsed: 10.35 s
processed 11000/266955 records, 4.12%
	current precision: 1.00
	time elapsed: 10.65 s
processed 12000/266955 records, 4.50%
	current precision: 1.00
	time el

	current precision: 1.00
	time elapsed: 64.16 s
processed 189000/266955 records, 70.80%
	current precision: 1.00
	time elapsed: 64.45 s
processed 190000/266955 records, 71.17%
	current precision: 1.00
	time elapsed: 64.76 s
processed 191000/266955 records, 71.55%
	current precision: 1.00
	time elapsed: 65.05 s
processed 192000/266955 records, 71.92%
	current precision: 1.00
	time elapsed: 65.35 s
processed 193000/266955 records, 72.30%
	current precision: 1.00
	time elapsed: 65.65 s
processed 194000/266955 records, 72.67%
	current precision: 1.00
	time elapsed: 65.95 s
processed 195000/266955 records, 73.05%
	current precision: 1.00
	time elapsed: 66.25 s
processed 196000/266955 records, 73.42%
	current precision: 1.00
	time elapsed: 66.56 s
processed 197000/266955 records, 73.80%
	current precision: 1.00
	time elapsed: 66.84 s
processed 198000/266955 records, 74.17%
	current precision: 1.00
	time elapsed: 67.14 s
processed 199000/266955 records, 74.54%
	current precision: 1.00
	time e

{'AveragePrecision': 1.0,
 'AverageFilteredPrecision': 6.739043999218271e-06,
 'perQueryResult':                          QueriedRecordID  Precision@K  SelectedRecords  \
 10000003-TST-MR          10000003-TST-MR          1.0                0   
 10000008-TST-M            10000008-TST-M          1.0                0   
 10000010-TST-CP          10000010-TST-CP          1.0                1   
 10000013-TST-MR          10000013-TST-MR          1.0                0   
 10000016-TST-MR          10000016-TST-MR          1.0                0   
 10000017-TST-MR          10000017-TST-MR          1.0                0   
 10000018-T4-TST-CP    10000018-T4-TST-CP          1.0                5   
 10000018-TST-M            10000018-TST-M          1.0                5   
 10000020-NV0-TST-CP  10000020-NV0-TST-CP          1.0                3   
 10000020-NV1-TST-M    10000020-NV1-TST-M          1.0                3   
 10000022-TST-CP          10000022-TST-CP          1.0                1   
 100

In [14]:
recall_at_k(sub, train.set_index('record_id'), test.set_index('record_id'))

processed 1000/266955 records, 0.37%
	current recall: 0.88
	time elapsed: 7.56 s
processed 2000/266955 records, 0.75%
	current recall: 0.88
	time elapsed: 7.86 s
processed 3000/266955 records, 1.12%
	current recall: 0.88
	time elapsed: 8.17 s
processed 4000/266955 records, 1.50%
	current recall: 0.88
	time elapsed: 8.47 s
processed 5000/266955 records, 1.87%
	current recall: 0.89
	time elapsed: 8.76 s
processed 6000/266955 records, 2.25%
	current recall: 0.89
	time elapsed: 9.06 s
processed 7000/266955 records, 2.62%
	current recall: 0.89
	time elapsed: 9.35 s
processed 8000/266955 records, 3.00%
	current recall: 0.89
	time elapsed: 9.65 s
processed 9000/266955 records, 3.37%
	current recall: 0.89
	time elapsed: 9.95 s
processed 10000/266955 records, 3.75%
	current recall: 0.89
	time elapsed: 10.24 s
processed 11000/266955 records, 4.12%
	current recall: 0.89
	time elapsed: 10.53 s
processed 12000/266955 records, 4.50%
	current recall: 0.89
	time elapsed: 10.82 s
processed 13000/266955

processed 195000/266955 records, 73.05%
	current recall: 0.89
	time elapsed: 64.96 s
processed 196000/266955 records, 73.42%
	current recall: 0.89
	time elapsed: 65.26 s
processed 197000/266955 records, 73.80%
	current recall: 0.89
	time elapsed: 65.54 s
processed 198000/266955 records, 74.17%
	current recall: 0.89
	time elapsed: 65.83 s
processed 199000/266955 records, 74.54%
	current recall: 0.89
	time elapsed: 66.12 s
processed 200000/266955 records, 74.92%
	current recall: 0.89
	time elapsed: 66.41 s
processed 201000/266955 records, 75.29%
	current recall: 0.89
	time elapsed: 66.71 s
processed 202000/266955 records, 75.67%
	current recall: 0.88
	time elapsed: 67.01 s
processed 203000/266955 records, 76.04%
	current recall: 0.89
	time elapsed: 67.31 s
processed 204000/266955 records, 76.42%
	current recall: 0.89
	time elapsed: 67.63 s
processed 205000/266955 records, 76.79%
	current recall: 0.88
	time elapsed: 67.95 s
processed 206000/266955 records, 77.17%
	current recall: 0.89
	ti

{'AverageRecall': 0.8837007697817886,
 'AverageFilteredRecall': 0.7907751854727372,
 'perQueryResult':                          QueriedRecordID  Recall@K  SelectedRecords  \
 10000003-TST-MR          10000003-TST-MR  1.000000                0   
 10000008-TST-M            10000008-TST-M  1.000000                0   
 10000010-TST-CP          10000010-TST-CP  1.000000                1   
 10000013-TST-MR          10000013-TST-MR  1.000000                0   
 10000016-TST-MR          10000016-TST-MR  1.000000                0   
 10000017-TST-MR          10000017-TST-MR  1.000000                0   
 10000018-T4-TST-CP    10000018-T4-TST-CP  0.800000                4   
 10000018-TST-M            10000018-TST-M  0.800000                4   
 10000020-NV0-TST-CP  10000020-NV0-TST-CP  1.000000                3   
 10000020-NV1-TST-M    10000020-NV1-TST-M  0.333333                1   
 10000022-TST-CP          10000022-TST-CP  1.000000                1   
 10000023-NV2-TST-CP  10000023-NV2

In [12]:
results = precision_at_k(sub, train.set_index('record_id'), test.set_index('record_id'))

processed 1000/266955 records, 0.37%
	current precision: 0.60
	time elapsed: 7.84 s
processed 2000/266955 records, 0.75%
	current precision: 0.62
	time elapsed: 8.14 s
processed 3000/266955 records, 1.12%
	current precision: 0.62
	time elapsed: 8.45 s
processed 4000/266955 records, 1.50%
	current precision: 0.62
	time elapsed: 8.75 s
processed 5000/266955 records, 1.87%
	current precision: 0.63
	time elapsed: 9.05 s
processed 6000/266955 records, 2.25%
	current precision: 0.62
	time elapsed: 9.37 s
processed 7000/266955 records, 2.62%
	current precision: 0.63
	time elapsed: 9.67 s
processed 8000/266955 records, 3.00%
	current precision: 0.62
	time elapsed: 9.98 s
processed 9000/266955 records, 3.37%
	current precision: 0.63
	time elapsed: 10.28 s
processed 10000/266955 records, 3.75%
	current precision: 0.63
	time elapsed: 10.58 s
processed 11000/266955 records, 4.12%
	current precision: 0.63
	time elapsed: 10.89 s
processed 12000/266955 records, 4.50%
	current precision: 0.63
	time el

	current precision: 0.63
	time elapsed: 36.38 s
processed 96000/266955 records, 35.96%
	current precision: 0.63
	time elapsed: 36.69 s
processed 97000/266955 records, 36.34%
	current precision: 0.63
	time elapsed: 36.99 s
processed 98000/266955 records, 36.71%
	current precision: 0.63
	time elapsed: 37.29 s
processed 99000/266955 records, 37.08%
	current precision: 0.63
	time elapsed: 37.59 s
processed 100000/266955 records, 37.46%
	current precision: 0.63
	time elapsed: 37.90 s
processed 101000/266955 records, 37.83%
	current precision: 0.63
	time elapsed: 38.20 s
processed 102000/266955 records, 38.21%
	current precision: 0.63
	time elapsed: 38.51 s
processed 103000/266955 records, 38.58%
	current precision: 0.63
	time elapsed: 38.80 s
processed 104000/266955 records, 38.96%
	current precision: 0.63
	time elapsed: 39.10 s
processed 105000/266955 records, 39.33%
	current precision: 0.63
	time elapsed: 39.40 s
processed 106000/266955 records, 39.71%
	current precision: 0.63
	time elaps

	current precision: 0.63
	time elapsed: 64.38 s
processed 189000/266955 records, 70.80%
	current precision: 0.63
	time elapsed: 64.68 s
processed 190000/266955 records, 71.17%
	current precision: 0.63
	time elapsed: 65.32 s
processed 191000/266955 records, 71.55%
	current precision: 0.63
	time elapsed: 65.63 s
processed 192000/266955 records, 71.92%
	current precision: 0.63
	time elapsed: 65.93 s
processed 193000/266955 records, 72.30%
	current precision: 0.63
	time elapsed: 66.23 s
processed 194000/266955 records, 72.67%
	current precision: 0.63
	time elapsed: 66.53 s
processed 195000/266955 records, 73.05%
	current precision: 0.63
	time elapsed: 66.84 s
processed 196000/266955 records, 73.42%
	current precision: 0.63
	time elapsed: 67.15 s
processed 197000/266955 records, 73.80%
	current precision: 0.63
	time elapsed: 67.44 s
processed 198000/266955 records, 74.17%
	current precision: 0.63
	time elapsed: 67.73 s
processed 199000/266955 records, 74.54%
	current precision: 0.63
	time e

In [19]:
results

{'AveragePrecision': 0.980730268432966,
 'AverageFilteredPrecision': 6.173413900157085e-06,
 'perQueryResult':                          QueriedRecordID  Precision@K  SelectedRecords  \
 10000003-TST-MR          10000003-TST-MR          1.0                0   
 10000008-TST-M            10000008-TST-M          1.0                0   
 10000010-TST-CP          10000010-TST-CP          1.0                1   
 10000013-TST-MR          10000013-TST-MR          1.0                0   
 10000016-TST-MR          10000016-TST-MR          1.0                0   
 10000017-TST-MR          10000017-TST-MR          1.0                0   
 10000018-T4-TST-CP    10000018-T4-TST-CP          1.0                4   
 10000018-TST-M            10000018-TST-M          1.0                4   
 10000020-NV0-TST-CP  10000020-NV0-TST-CP          1.0                3   
 10000020-NV1-TST-M    10000020-NV1-TST-M          1.0                1   
 10000022-TST-CP          10000022-TST-CP          1.0           

In [17]:
results['perQueryResult']['Precision@K'].sum() / 266955

0.6321945646269967

In [23]:
sub[sub['queried_record_id'] == '10000018-T4-TST-CP']

Unnamed: 0,queried_record_id,predicted_record_id
60,10000018-T4-TST-CP,10000018-T0
61,10000018-T4-TST-CP,10000018-T4
62,10000018-T4-TST-CP,10000018-T3
63,10000018-T4-TST-CP,10000018-T1
64,10000018-T4-TST-CP,10000018-T2
65,10000018-T4-TST-CP,10000069-M0
66,10000018-T4-TST-CP,10000069-M1
67,10000018-T4-TST-CP,10000069
68,10000018-T4-TST-CP,10002031
69,10000018-T4-TST-CP,12180349-M0
