In [1]:
from tqdm import tqdm
from scipy import *
from scipy.sparse import *
import pandas as pd

def get_sub(sim, df_train, df_test, sub_name='mimmo', num_diff_lin_id=10):
    """
    Build a submission-style pandas dataframe from a similarity matrix and
    write it to ``{sub_name}.csv``.

    For every row of ``sim`` the stored, non-zero similarities are ranked in
    descending order, the corresponding train rows are mapped to their
    ``linked_id`` and the first ``num_diff_lin_id`` distinct values are kept
    (in rank order) and joined with single spaces.

    : param sim : scipy sparse similarity matrix (converted to CSR if needed);
                  row i is paired with the i-th row of df_test and column j
                  with the i-th/j-th rows of the dataframes AFTER they have
                  been sorted by record_id
    : param df_train : the train pandas dataframe (needs record_id, linked_id)
    : param df_test : the test pandas dataframe (needs record_id)
    : param sub_name : path/name of the submission file, without the .csv suffix
    : param num_diff_lin_id : maximum number of distinct linked ids per test row
    : return : the pandas dataframe with columns queried_record_id and
               predicted_record_id
    : raises ValueError : if sim does not have one row per test record
    """
    # first make sure df_train and df_test are sorted by record_id
    print("Sorting dataframes...")
    df_train = df_train.sort_values(by=['record_id']).reset_index(drop=True)
    df_test = df_test.sort_values(by=['record_id']).reset_index(drop=True)

    sim = sim.tocsr()
    if sim.shape[0] != len(df_test):
        raise ValueError(
            f"sim has {sim.shape[0]} rows but df_test has {len(df_test)} records"
        )

    # then rank the candidates of every row by decreasing similarity
    print("Sorting similarity to get top indices...")
    indptr, all_cols, all_vals = sim.indptr, sim.indices, sim.data
    indices = []
    for x in tqdm(range(sim.shape[0])):
        # Slice the CSR buffers directly so that column ids and scores are
        # always aligned; every row (including row 0) is handled the same way.
        start, stop = indptr[x], indptr[x + 1]
        cols = all_cols[start:stop]
        vals = all_vals[start:stop]
        # Explicitly stored zeros are not real candidates: drop them from
        # both arrays at once so the alignment is preserved.
        keep = vals != 0
        cols, vals = cols[keep], vals[keep]
        indices.append(cols[vals.argsort()[::-1]])

    # positional lookup through a dict is much faster than going through .loc
    dict_index_linked_id = dict(zip(df_train.index, df_train.linked_id))
    print("Retrieving linked ids from df_train...")
    linked_id_list = []
    for x in tqdm(indices):
        tmp = []
        for l in x:
            # stop scanning as soon as enough distinct linked ids were found
            if len(tmp) >= num_diff_lin_id:
                break
            ind = dict_index_linked_id[l]
            # avoid drop_duplicates: just skip linked ids already collected
            if ind not in tmp:
                tmp.append(ind)
        linked_id_list.append(tmp)

    print("Creating the sub...")
    print('Exploding list to string...')
    strings = [' '.join(str(v) for v in t) for t in tqdm(linked_id_list)]
    sub = pd.DataFrame({
        'queried_record_id': df_test.record_id,
        'predicted_record_id': strings,
    })

    print(f"Writing to {sub_name}.csv...")
    sub.to_csv(f'{sub_name}.csv', index=False)
    print('DONE!')
    return sub


In [26]:
# Load the pre-computed sparse similarity matrices in one go.
# s1, s2 and s3 are loaded but not used by the combination below.
s1, s2, s3, s4, s5 = (
    load_npz('../jaccard_tfidf_test_train_300.npz'),
    load_npz('../jaccard_tfidf_phone.npz'),
    load_npz('../jaccard_tfidf_email.npz'),
    load_npz('../jaccard_tfidf_3ngrams_500k.npz'),
    load_npz('../jaccard_tfidf_phone_500k.npz'),
)

# Combined similarity: s4 plus s5 weighted by 0.8.
s = s4 + 0.8 * s5

# Read each CSV and ALWAYS sort it by record_id, with a fresh 0..n-1 index.
df_train = (
    pd.read_csv("../dataset/original/train.csv", escapechar="\\")
    .sort_values(by=['record_id'])
    .reset_index(drop=True)
)
df_test = (
    pd.read_csv("../dataset/original/test.csv", escapechar="\\")
    .sort_values(by=['record_id'])
    .reset_index(drop=True)
)

In [27]:
# Build the submission from the combined similarity and write name_08phone500.csv.
ss = get_sub(s, df_train, df_test, sub_name='name_08phone500')

Sorting dataframes...
Sorting similarity to get top indices...


100%|█████████████████████████████████████████████████████████████████| 266955/266955 [00:56<00:00, 4716.04it/s]


Retrieving linked ids from df_train...


100%|█████████████████████████████████████████████████████████████████| 266955/266955 [00:46<00:00, 5766.80it/s]


Creating the sub...
Exploding list to string...


100%|███████████████████████████████████████████████████████████████| 266955/266955 [00:00<00:00, 328587.58it/s]


Writing to name_08phone500.csv...
DONE!


In [2]:
# Read each CSV and ALWAYS sort it by record_id, with a fresh 0..n-1 index.
df_train = (
    pd.read_csv("../dataset/original/train.csv", escapechar="\\")
    .sort_values(by=['record_id'])
    .reset_index(drop=True)
)
df_test = (
    pd.read_csv("../dataset/original/test.csv", escapechar="\\")
    .sort_values(by=['record_id'])
    .reset_index(drop=True)
)

In [3]:
# Preview the first rows of the sorted test dataframe.
df_test.head()

Unnamed: 0,record_id,name,type,address,phone,email,modification
0,10000003-TST-MR,"HOTFOCUS CO., LTD.",entity,,440157400000.0,consulting@outlook.ch,move unique
1,10000008-TST-M,BONUS TRADE LIMITED,entity,,,help@gmail.gov,move row
2,10000010-TST-CP,NEW IDEA LIMITED,entity,,19124690000.0,,identical copy
3,10000013-TST-MR,VICTORY GROUP LIMITED,entity,,19495060000.0,,move unique
4,10000016-TST-MR,"NINGBO RAPID INTERNATIONAL TRADING CO., LTD.",entity,,444651200000.0,info@outlook.cz,move unique


In [4]:
# The text before the first '-' of each record_id becomes real_linked_id
# (e.g. '10000003-TST-MR' -> '10000003').
df_test['real_linked_id'] = df_test.record_id.map(lambda rid: str(rid).split('-')[0])

In [5]:
# Preview df_test again, now that the real_linked_id column has been added.
df_test.head()

Unnamed: 0,record_id,name,type,address,phone,email,modification,real_linked_id
0,10000003-TST-MR,"HOTFOCUS CO., LTD.",entity,,440157400000.0,consulting@outlook.ch,move unique,10000003
1,10000008-TST-M,BONUS TRADE LIMITED,entity,,,help@gmail.gov,move row,10000008
2,10000010-TST-CP,NEW IDEA LIMITED,entity,,19124690000.0,,identical copy,10000010
3,10000013-TST-MR,VICTORY GROUP LIMITED,entity,,19495060000.0,,move unique,10000013
4,10000016-TST-MR,"NINGBO RAPID INTERNATIONAL TRADING CO., LTD.",entity,,444651200000.0,info@outlook.cz,move unique,10000016


In [6]:
# Keep both id columns as strings so the set comparison below matches on equal types.
df_test['real_linked_id'] = df_test['real_linked_id'].astype(str)
# Bracket assignment is required to create a new column: assigning to
# `df_train.real_linked_id` would only set a plain Python attribute on the
# DataFrame object (pandas emits a UserWarning) and no column would exist.
df_train['real_linked_id'] = df_train['linked_id'].astype(str)

  


In [7]:
# Number of distinct real_linked_id values in the test set that never occur in the train set.
len(set(df_test.real_linked_id).difference(set(df_train.real_linked_id)))

118112

In [11]:
# Same count expressed as a percentage of the number of test rows.
100 * len(set(df_test.real_linked_id).difference(set(df_train.real_linked_id))) / len(df_test.index)

44.24416100091776