In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [2]:
from scipy import *
from scipy.sparse import *
import similaripy as sim
from sklearn.feature_extraction.text import CountVectorizer
import re

In [3]:
import lightgbm as lgb
import time

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Prendiamo un elemento alla volta dal test, facciamo la similarità di quell'elemento ed estraiamo il top50. Quell'elemento poi verrà predetto. Una volta tirata fuori la predizione di quell'elemento, va aggiunto al train [di LightGBM]: non importa rifare la similarità perché valutando un elemento alla volta sappiamo già dalla similarità che facciamo se quell'elemento è nel train o no. 

In [4]:
df_train = pd.read_csv("../dataset/original/train.csv", escapechar="\\")
df_test = pd.read_csv("../dataset/original/test.csv", escapechar="\\")
df_train = df_train.sort_values(by='record_id').reset_index(drop=True)
df_test = df_test.sort_values(by='record_id').reset_index(drop=True)

df_train.linked_id = df_train.linked_id.astype(int)

In [5]:
df_test['linked_id'] = df_test.record_id.str.split("-")
df_test['linked_id'] = df_test.linked_id.apply(lambda x: x[0])
df_test.linked_id = df_test.linked_id.astype(int)
#df_train.linked_id = df_train.linked_id.astype(int)
only_test = set(df_test.linked_id.values) - set(df_train.linked_id.values)
only_test_recordid = df_test[df_test.linked_id.isin(only_test)]
df_test = df_test.drop('linked_id', axis=1)

In [6]:
train1 = pd.read_csv("../dataset/validation_2/train_complete.csv")
train2 = pd.read_csv("../dataset/validation_3/train_complete.csv")
val = pd.read_csv("../dataset/validation/train_complete.csv")

In [7]:
def remove_spaces(s, n=3):
    s = re.sub(' +',' ',s).strip()
    ngrams = zip(*[s[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [8]:
def ngrams_name(test_record, df_train):
    df_train.name = df_train.name.astype(str)
    #test_record['name'] = test_record['name'].astype(str)
    corpus = list(df_train.name)
    corpus.append(test_record['name'])
    vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
    X = vectorizer.fit_transform(corpus)
    X_train = X[:df_train.shape[0],:]
    X_test = X[df_train.shape[0]:,:]
    similarity = sim.jaccard(X_test, X_train.T, k=300)
    return similarity.tocsr()

In [9]:
def ngrams_address(test_record, df_train):
    df_train.address = df_train.address.fillna('').astype(str)
    test_record.address = test_record.fillna({'address':''}).address
    corpus = list(df_train.address)
    corpus.append(test_record.address)
    vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
    X = vectorizer.fit_transform(corpus)
    X_train = X[:df_train.shape[0],:]
    X_test = X[df_train.shape[0]:,:]
    similarity = sim.jaccard(X_test, X_train.T, k=300)
    return similarity.tocsr()

In [10]:
def ngrams_email(test_record, df_train):
    df_train.email = df_train.email.fillna('').astype(str)
    test_record.email = test_record.fillna({'email':''}).email
    corpus = list(df_train.email) 
    corpus.append(test_record.email)
    vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
    X = vectorizer.fit_transform(corpus)
    X_train = X[:df_train.shape[0],:]
    X_test = X[df_train.shape[0]:,:]
    similarity = sim.jaccard(X_test, X_train.T, k=300)
    return similarity.tocsr()

In [11]:
def convert_phones(df_in):
    """
    This functions transforms the phone column from scientific notation to readable string
    format, e.g. 1.2933+E10 to 12933000000
    : param df_in : the original df with the phone in scientific notation
    : return : the clean df
    """
    df = df_in.copy()
    df.phone = df.phone.fillna('').astype(str)
    df.phone = [p.split('.')[0] for p in df.phone]
    return df

def ngrams_phone(test_record, df_train):
    # manually convert test_record phone
    test_record.phone = test_record.fillna({'phone':''}).phone.astype(str)
    test_record.phone = test_record.phone.split('.')[0]
    df_train = convert_phones(df_train)
    corpus = list(df_train.phone)
    corpus.append(test_record.phone)
    vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
    X = vectorizer.fit_transform(corpus)
    X_train = X[:df_train.shape[0],:]
    X_test = X[df_train.shape[0]:,:]
    similarity = sim.jaccard(X_test, X_train.T, k=300)
    return similarity.tocsr()

In [12]:
# New record to be tested arrives

In [13]:
def expand_df(df):
    df_list = []
    for (q, pred, pred_rec, score, s_name, s_email, s_phone, s_addr,  idx) in tqdm(
            zip(df.queried_record_id, df.predicted_record_id, df.predicted_record_id_record, df.cosine_score,
                df.name_cosine, df.email_cosine, df.phone_cosine, df.address_cosine, df.linked_id_idx)):
        for x in range(len(pred)):
            df_list.append((q, pred[x], pred_rec[x], score[x], s_name[x], s_email[x], s_phone[x], s_addr[x],  idx[x]))

    # TODO da cambiare predicted_record_id in predicted_linked_id e 'predicted_record_id_record' in 'predicted_record_id'
    df_new = pd.DataFrame(df_list, columns=['queried_record_id', 'predicted_record_id', 'predicted_record_id_record',
                                            'cosine_score', 'name_cosine',
                                            'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
                                            ])
    return df_new

In [14]:
def expand_similarities(test_record, df_train, k=50):
    sim_name = ngrams_name(test_record, df_train)
    sim_email = ngrams_email(test_record, df_train)
    sim_address = ngrams_address(test_record, df_train)
    sim_phone = ngrams_phone(test_record, df_train)
        
    hybrid = sim_name + 0.2 * sim_email + 0.2 * sim_phone + 0.2 * sim_address
    
    linid_ = []
    linid_idx = []
    linid_score = []
    linid_name_cosine = []
    linid_email_cosine = []
    linid_phone_cosine = []
    linid_address_cosine = []
    linid_record_id = []
    
    tr = df_train[['record_id', 'linked_id']]
    indices = hybrid.nonzero()[1][hybrid.data.argsort()[::-1]][:k]
    df = tr.loc[indices, :][:k]
    linid_.append(df['linked_id'].values)
    linid_idx.append(df.index)
    linid_record_id.append(df.record_id.values)
    linid_score.append(np.sort(hybrid.data)[::-1][:k]) # Questo ha senso perché tanto gli indices sono sortati in base allo scores di hybrid
    linid_name_cosine.append([sim_name[0, t] for t in indices])
    linid_email_cosine.append([sim_email[0, t] for t in indices])
    linid_phone_cosine.append([sim_phone[0, t] for t in indices])
    linid_address_cosine.append([sim_phone[0, t] for t in indices])
    
    df = pd.DataFrame()
    df['queried_record_id'] = [test_record.record_id]
    df['predicted_record_id'] = linid_
    df['predicted_record_id_record'] = linid_record_id
    df['cosine_score'] = linid_score
    df['name_cosine'] = linid_name_cosine
    df['email_cosine'] = linid_email_cosine
    df['phone_cosine'] = linid_phone_cosine
    df['address_cosine'] = linid_address_cosine
    df['linked_id_idx'] = linid_idx
    
    df_new = expand_df(df)
    
    return df_new

In [17]:
test_record_exp = expand_similarities(df_test.loc[0], df_train)

Done: 100%|██████████| 1/1 [00:00<00:00,  1.86it/s]                       
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
Done: 100%|██████████| 1/1 [00:00<00:00,  4.29it/s]                       
Done: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]                       
Done: 100%|██████████| 1/1 [00:00<00:00,  6.33it/s]                       


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [18]:
test_record_exp

Unnamed: 0,queried_record_id,predicted_record_id,predicted_record_id_record,cosine_score,name_cosine,email_cosine,phone_cosine,address_cosine,linked_id_idx
0,10000003-TST-MR,10010930,10010930-T2,0.533333,0.533333,0.0,0.0,0.0,19660
1,10000003-TST-MR,10010930,10010930,0.533333,0.533333,0.0,0.0,0.0,19655
2,10000003-TST-MR,10010930,10010930-NV0,0.533333,0.533333,0.0,0.0,0.0,19656
3,10000003-TST-MR,10010930,10010930-NV1,0.533333,0.533333,0.0,0.0,0.0,19657
4,10000003-TST-MR,10010930,10010930-T0,0.533333,0.533333,0.0,0.0,0.0,19658
5,10000003-TST-MR,10010930,10010930-T1,0.533333,0.533333,0.0,0.0,0.0,19659
6,10000003-TST-MR,10131433,10131433,0.473684,0.473684,0.0,0.0,0.0,234169
7,10000003-TST-MR,10148851,10148851,0.473684,0.473684,0.0,0.0,0.0,265520
8,10000003-TST-MR,10029541,10029541,0.444444,0.444444,0.0,0.0,0.0,52869
9,10000003-TST-MR,10027883,10027883,0.44,0.44,0.0,0.0,0.0,49967


# Add Features

In [15]:
import sys
sys.path.append('../')

In [16]:
from xgb_dataset_generation import adding_features

In [17]:
import os

In [22]:
test_record_exp = adding_features(test_record_exp, isValidation=False, path=os.path.join('..', 'dataset', 'original'))

Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx'],
      dtype='object')


100%|██████████| 50/50 [00:00<00:00, 5211.09it/s]

NaN on queried_name: 0
Nan on predicted_name: 0





HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
       'editdistance', 'jw_name', 'jw_address', 'jw_phone', 'jw_email',
       'email_popularity'],
      dtype='object')
Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
       'editdistance', 'jw_name', 'jw_address', 'jw_phone', 'jw_email',
       'email_popularity', 'linked_id_popularity', 'name_popularity'],
      dtype='object')
Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
       'editdistance', 'jw_name', 'jw_address', 'jw_phone', 'jw_email',
       'email_popularity', 'linked_id_popularity', 'name_popu

# LightGBM training

In [23]:
train = pd.concat([train1, train2])
eval_group = val.groupby('queried_record_id').size().values
group = train.groupby('queried_record_id').size().values

In [24]:
ranker = lgb.LGBMRanker()

In [25]:
print('Start LGBM...')
t1 = time.time()
ranker.fit(train.drop(['queried_record_id', 'target', 'predicted_record_id','predicted_record_id_record', 'linked_id_idx'], axis=1),
               train['target'], group=group,
               eval_set=[(val.drop(['queried_record_id', 'target', 'predicted_record_id','predicted_record_id_record', 'linked_id_idx'], axis=1), val['target'])],
               eval_group=[eval_group], early_stopping_rounds=5)
t2 = time.time()
print(f'Learning completed in {int(t2-t1)} seconds.')

Start LGBM...
[1]	valid_0's ndcg@1: 0.972762
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's ndcg@1: 0.975492
[3]	valid_0's ndcg@1: 0.977745
[4]	valid_0's ndcg@1: 0.979941
[5]	valid_0's ndcg@1: 0.980212
[6]	valid_0's ndcg@1: 0.98034
[7]	valid_0's ndcg@1: 0.980388
[8]	valid_0's ndcg@1: 0.981028
[9]	valid_0's ndcg@1: 0.980887
[10]	valid_0's ndcg@1: 0.982163
[11]	valid_0's ndcg@1: 0.982491
[12]	valid_0's ndcg@1: 0.982518
[13]	valid_0's ndcg@1: 0.982995
[14]	valid_0's ndcg@1: 0.983013
[15]	valid_0's ndcg@1: 0.983066
[16]	valid_0's ndcg@1: 0.98335
[17]	valid_0's ndcg@1: 0.983762
[18]	valid_0's ndcg@1: 0.983911
[19]	valid_0's ndcg@1: 0.984109
[20]	valid_0's ndcg@1: 0.984468
[21]	valid_0's ndcg@1: 0.984486
[22]	valid_0's ndcg@1: 0.984516
[23]	valid_0's ndcg@1: 0.984564
[24]	valid_0's ndcg@1: 0.984748
[25]	valid_0's ndcg@1: 0.984889
[26]	valid_0's ndcg@1: 0.985003
[27]	valid_0's ndcg@1: 0.984911
[28]	valid_0's ndcg@1: 0.985016
[29]	valid_0's ndcg@1: 0.98516
[30]	vali

In [31]:
predictions = ranker.predict(test_record_exp.drop(['queried_record_id','address_cosine',  'linked_id_idx', 'predicted_record_id','predicted_record_id_record'], axis=1))
test_record_exp['predictions'] = predictions

# Increment Training-set with the test record just evaluated

In [18]:
def get_linked_id(new_row):
    new_row['linked_id'] = new_row.record_id.split("-")
    new_row['linked_id'] = new_row.linked_id[0]
    new_row['linked_id'] = int(new_row.linked_id)
    return new_row

In [80]:
# aggiungendo la riga dobbiamo aggiungere anche il linked_id che va preso necessariamente dal name
new_row = df_test.loc[0]
new_row = get_linked_id(new_row)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [82]:
# Add new_row to original train
df_train = df_train.append(new_row, ignore_index=True)

In [119]:
# Add test_record_exp to train_expanded [in realtà non so se è necessario visto che il ranker deve solo imparare a 
# re-rankare le cose]
#ranker.refit(test_record_exp.drop(['queried_record_id','address_cosine',  'linked_id_idx', 'predicted_record_id','predicted_record_id_record', 'predictions'], axis=1),
#             test_record_exp['target'], group=[50])

# AttributeError: 'LGBMRanker' object has no attribute 'refit'

# In realtà per LightGBM si potrebbe pensare di re-trainare di nuovo il modello quando un batch di new_rows
# vengono aggiunte al train

# Two test record with same linked_id that are only in test, the first is added to the train, the second is predicted

In [19]:
only_test_recordid[only_test_recordid.duplicated('linked_id')]

Unnamed: 0,record_id,name,type,address,phone,email,modification,linked_id
339,10000500-TST-M,"EAST ASIA PALM CO., LTD",entity,,4.419598e+11,,move row,10000500
849,10001224-M1-TST-M,VICTORY SECRET LTD.,entity,,,,move row,10001224
850,10001224-TST-M,VICTORY SECRET LTD.,entity,,1.900123e+10,support@icloud.gov,move row,10001224
1593,10002261-TST-M,"LONGKOU FANLIN NODULAR CAST IRON PIPE CO.,LTD.",entity,,4.107031e+12,,move row,10002261
1925,10002752-TST-M,WALBRAY TRADING LTD.,entity,,,,move row,10002752
2181,10003124-TST-M,MANDELBAUM LTD. INC.,entity,,1.356913e+10,,move row,10003124
2598,10003732-TST-M,PROFIT SMART ENTERPRISES LTD.,entity,,3.943087e+09,help@zoho.com,move row,10003732
3899,10005571-TST-M,ENPREX S.A.,entity,,4.149003e+12,,move row,10005571
3920,10005601-T0-TST-M,Neotecmedical Inc.,entity,Neotecmedical Inc. 520 S. 7TH STREET SUITE C L...,1.937160e+10,sales@yahoo.ch,move row,10005601
3921,10005601-TST-M,Neotecmedical Inc.,entity,Neotecmedical Inc. 520 S. 7TH STREET SUITE C L...,1.937160e+10,sales@yahoo.ch,move row,10005601


In [20]:
new_row = df_test.loc[3920]
new_row = get_linked_id(new_row)

# Add new_row to original train
df_train = df_train.append(new_row, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc[key] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [21]:
# Rows 3920 - 3921

In [22]:
test_record_exp_1 = expand_similarities(df_test.loc[3921], df_train)
test_record_exp_1 = adding_features(test_record_exp_1, isValidation=False, path=os.path.join('..', 'dataset', 'original'), incremental_train=df_train)

Done: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]                       
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
Done: 100%|██████████| 1/1 [00:00<00:00,  4.28it/s]                       
Done: 100%|██████████| 1/1 [00:00<00:00,  1.46it/s]                       
Done: 100%|██████████| 1/1 [00:00<00:00,  6.47it/s]                       


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx'],
      dtype='object')


100%|██████████| 50/50 [00:00<00:00, 5226.81it/s]

NaN on queried_name: 0
Nan on predicted_name: 0





HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
       'editdistance', 'jw_name', 'jw_address', 'jw_phone', 'jw_email',
       'email_popularity'],
      dtype='object')
Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
       'editdistance', 'jw_name', 'jw_address', 'jw_phone', 'jw_email',
       'email_popularity', 'linked_id_popularity', 'name_popularity'],
      dtype='object')
Index(['queried_record_id', 'predicted_record_id',
       'predicted_record_id_record', 'cosine_score', 'name_cosine',
       'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
       'editdistance', 'jw_name', 'jw_address', 'jw_phone', 'jw_email',
       'email_popularity', 'linked_id_popularity', 'name_popu

In [23]:
test_record_exp_1

Unnamed: 0,queried_record_id,predicted_record_id,predicted_record_id_record,cosine_score,name_cosine,email_cosine,phone_cosine,address_cosine,linked_id_idx,editdistance,...,name_popularity,null_address,perc_non_null_address,null_email,perc_non_null_email,null_phone,perc_non_null_phone,case_typo,phone_popularity,test_name_length
0,10005601-TST-M,10005601,10005601-T0-TST-M,1.5375,1.0,1.0,0.8,0.8,691440,0,...,3,0,0.0,0,0.0,0,0.0,0,2,18
1,10005601-TST-M,10123132,10123132,0.409091,0.409091,0.0,0.0,0.0,219465,5,...,3,5,0.0,5,0.0,2,60.0,0,2,18
2,10005601-TST-M,10123132,10123132-M0,0.409091,0.409091,0.0,0.0,0.0,219466,5,...,3,5,0.0,5,0.0,2,60.0,0,2,18
3,10005601-TST-M,10123132,10123132-M1,0.409091,0.409091,0.0,0.0,0.0,219467,5,...,3,5,0.0,5,0.0,2,60.0,0,2,18
4,10005601-TST-M,10123132,10123132-T0,0.291667,0.291667,0.0,0.0,0.0,219468,6,...,3,5,0.0,5,0.0,2,60.0,0,2,18
5,10005601-TST-M,10153333,10153333-M0,0.275862,0.275862,0.0,0.0,0.0,273372,15,...,3,2,0.0,2,0.0,0,100.0,0,2,18
6,10005601-TST-M,10153333,10153333,0.275862,0.275862,0.0,0.0,0.0,273371,15,...,3,2,0.0,2,0.0,0,100.0,0,2,18
7,10005601-TST-M,12162757,12162757-NV0,0.241379,0.241379,0.0,0.0,0.0,560415,18,...,3,0,100.0,1,66.0,2,33.0,0,2,18
8,10005601-TST-M,12162757,12162757,0.241379,0.241379,0.0,0.0,0.0,560413,18,...,3,0,100.0,1,66.0,2,33.0,0,2,18
9,10005601-TST-M,12162757,12162757-M1,0.241379,0.241379,0.0,0.0,0.0,560414,18,...,3,0,100.0,1,66.0,2,33.0,0,2,18


In [118]:
predictions = ranker.predict(test_record_exp_1.drop(['queried_record_id','address_cosine',  'linked_id_idx', 'predicted_record_id','predicted_record_id_record'], axis=1))
test_record_exp_1['predictions'] = predictions

# TODO l'editdistance dà 18 perché la feature carica il train da '../dataset/original/train.csv' dunque non prende
# il train aggiornato

ValueError: Number of features of the model must match the input. Model n_features_ is 21 and input n_features is 22 