In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm 
import sys
sys.path.insert(1, '../oracle-polimi-contest-2019')
from evaluation_script import read_file
from collections import Counter
import similaripy as sim
from scipy import *
from scipy.sparse import *

In [2]:
import string
import unidecode
def create_name_letters_matrix(df):
    """Build a per-record letter-count matrix from the ``name`` column.

    Names are cast to ``str``, lowercased, stripped of the characters in
    ``string.punctuation`` and passed through ``unidecode.unidecode``; the
    occurrences of each letter ``a``..``z`` are then counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``record_id`` and ``name``. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``record_id``, ``name`` (normalised) and one count column per
        letter ``a``..``z``. The index is the same as ``df``'s.
    """
    letters = list(string.ascii_lowercase)

    # Work on an explicit copy: assigning into the bare ``df[[...]]`` slice is
    # what triggered pandas' SettingWithCopyWarning.
    name_letters_matrix = df[['record_id', 'name']].copy()

    # NOTE(review): astype(str) may turn missing names into the literal string
    # 'nan' - confirm the input has no missing names.
    names = name_letters_matrix['name'].astype(str).str.lower()
    names = names.str.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    names = names.map(unidecode.unidecode)  # remove accented letters
    name_letters_matrix['name'] = names

    # count occurrences of each letter, one pass over the column per letter
    for letter in tqdm(letters):
        name_letters_matrix[letter] = names.str.count(letter)
    return name_letters_matrix

In [3]:
def get_mcn_matrix_train(train):
    """Build the letter-count matrix of the most common name per ``linked_id``.

    For every ``linked_id`` in ``train`` the most frequent ``name`` is picked
    (``Counter.most_common(1)``). One row per ``linked_id`` is kept, with the
    ``linked_id`` exposed as ``record_id`` and the most common name as
    ``name``, and the result is passed to ``create_name_letters_matrix``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns ``record_id``, ``name`` and ``linked_id``.

    Returns
    -------
    pandas.DataFrame
        Output of ``create_name_letters_matrix`` with a fresh 0..n-1 index.
    """
    names_by_link = train[['name', 'linked_id']].groupby('linked_id').apply(lambda grp: list(grp['name']))

    # linked_id -> most frequent name within that group
    link_mc_name = {
        link: Counter(names).most_common(1)[0][0]
        for (link, names) in tqdm(zip(names_by_link.keys(), names_by_link))
    }
    most_common_name = pd.DataFrame.from_dict(link_mc_name, orient='index', columns=['most_common_name'])

    # One row per linked_id; the linked_id takes over the role of record_id.
    df_train_clean = (
        pd.merge(train, most_common_name, how='left', left_on='linked_id', right_index=True)
        .drop_duplicates(subset=['linked_id', 'most_common_name'])
        .drop(['record_id', 'name'], axis=1)
        .rename(columns={"linked_id": "record_id", "most_common_name": "name"})
    )
    return create_name_letters_matrix(df_train_clean).reset_index(drop=True)

In [4]:
def cosine_similarity(m_train, m_test, path='val_cosine', k=10):
    """Compute cosine similarity between test and train letter-count rows.

    The ``record_id`` and ``name`` columns are dropped from both frames, the
    remaining columns are converted to CSR matrices and handed to
    ``sim.cosine`` (test rows against the transposed train matrix).

    Parameters
    ----------
    m_train, m_test : pandas.DataFrame
        Letter-count matrices containing ``record_id`` and ``name`` columns.
    path : str
        Output file name without extension; the result is written to
        ``path + '.npz'``.
    k : int
        Forwarded to ``sim.cosine`` (presumably the number of neighbours kept
        per test row - confirm against similaripy).

    Returns
    -------
    scipy.sparse.csr_matrix
        The similarity matrix that was saved.
    """
    id_columns = ['record_id', 'name']
    train_features = csr_matrix(m_train.drop(id_columns, axis=1))
    test_features = csr_matrix(m_test.drop(id_columns, axis=1))

    similarity = sim.cosine(test_features, train_features.T, k=k).tocsr()
    save_npz(path + '.npz', similarity)
    return similarity

In [5]:
def clean_cosine_output(output, df_test, m_train):
    """Flatten a sparse similarity matrix into a ranked candidate table.

    Every non-zero entry ``output[i, j]`` becomes one row holding the
    ``record_id`` of the i-th row of ``df_test``, the ``record_id`` of the
    j-th row of ``m_train`` and the similarity score. Rows are ordered by
    query row and, within a query, by decreasing score; ties keep the
    matrix's storage order.

    Parameters
    ----------
    output : scipy.sparse matrix
        Similarity matrix (rows: queries, columns: train entities). Assumed
        to have no duplicate entries.
    df_test : pandas.DataFrame
        Must contain ``record_id``; row ``i`` corresponds to matrix row ``i``.
    m_train : pandas.DataFrame
        Must contain ``record_id``; row ``j`` corresponds to matrix column ``j``.

    Returns
    -------
    pandas.DataFrame
        Columns ``queried_record_id``, ``predicted_record_id``, ``cosine_score``.
    """
    # One COO conversion yields rows, columns and values together, instead of
    # calling nonzero() twice and indexing the matrix element by element.
    coo = output.tocsr().tocoo()
    keep = coo.data != 0  # same filter nonzero() applies (drops stored zeros)
    rows, cols, scores = coo.row[keep], coo.col[keep], coo.data[keep]

    # Stable sort; the last key is the primary one: query row, then score descending.
    order = np.lexsort((-scores, rows))
    rows, cols, scores = rows[order], cols[order], scores[order]

    # Matrix indices are row positions, so map them back positionally. A
    # label-based lookup is only correct when both frames have a 0..n-1 index.
    queried_ids = df_test['record_id'].to_numpy()[rows]
    predicted_ids = m_train['record_id'].to_numpy()[cols]

    return pd.DataFrame({
        'queried_record_id': queried_ids,
        'predicted_record_id': predicted_ids,
        'cosine_score': scores,
    })

In [6]:
# Splitting train into training and validation sets

In [6]:
# Load the raw training records with the contest's read_file helper.
train = read_file("../dataset/original/train.csv")

In [7]:
# Drop the columns that are not used below and lowercase the names.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second run would otherwise raise KeyError.
train = train.drop(['modification', 'type'], axis=1, errors='ignore')
train['name'] = train['name'].str.lower()

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a validation set; random_state pins the split.
# X_train / X_val are full rows of `train` (they still contain linked_id).
target = train.linked_id
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.33, random_state=42)

In [10]:
# Letter-count matrix of the most common name per linked_id in the training split.
m_train = get_mcn_matrix_train(X_train)
m_train.head(10)  # preview only instead of echoing the whole frame

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


HBox(children=(IntProgress(value=0, max=26), HTML(value='')))




Unnamed: 0,record_id,name,a,b,c,d,e,f,g,h,...,q,r,s,t,u,v,w,x,y,z
0,10098822,fromley trading inc,1,0,1,1,1,1,1,0,...,0,2,0,1,0,0,0,0,1,0
1,10074742,radiant investment management corp,4,0,1,1,4,0,1,0,...,0,2,1,4,0,1,0,0,0,0
2,10190457,rosemead ltd,1,0,0,2,2,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3,10187369,sperry corp,0,0,1,0,1,0,0,0,...,0,3,1,0,0,0,0,0,1,0
4,12141090,isidoro toledo esquenazi,1,0,0,2,3,0,0,0,...,1,1,2,1,1,0,0,0,0,1
5,12165506,gabriel davidov pardo,3,1,0,3,1,0,1,0,...,0,2,0,0,0,2,0,0,0,0
6,12207761,hhoutianhua,2,0,0,0,0,0,0,3,...,0,0,0,1,2,0,0,0,0,0
7,12134974,tian quan enterprises limited,2,0,0,1,4,0,0,0,...,1,2,2,3,1,0,0,0,0,0
8,10211979,fundacion allan rausch,4,0,2,1,0,1,0,1,...,0,1,1,0,2,0,0,0,0,0
9,10213246,zinger international gmbh,2,1,0,0,2,0,2,1,...,0,2,0,2,0,0,0,0,0,1


In [11]:
# Letter-count matrix of the validation records; these act as the queries.
m_test = create_name_letters_matrix(X_val)

HBox(children=(IntProgress(value=0, max=26), HTML(value='')))




In [12]:
# Cosine similarity of validation rows against training entities; the result is
# also written to 'val_cosine.npz' (the function's default path).
# Slow step: the recorded run below took about 14.5 minutes.
cosine_output = cosine_similarity(m_train, m_test)

Done: 100%|██████████| 228176/228176 [14:31<00:00, 261.68it/s]            


In [20]:
# Give X_val a 0..n-1 index so that row i of the similarity matrix lines up with
# row i of X_val when clean_cosine_output maps matrix rows back to record_ids.
X_val = X_val.reset_index(drop=True)

In [21]:
# Extract the top-10 candidates per query from the cosine similarity and build the
# skeleton dataframe for xgboost: the validation set becomes the xgboost training set.
# Resulting columns: queried_record_id, predicted_record_id, cosine_score.
xgb_train_df = clean_cosine_output(cosine_output, X_val, m_train)

HBox(children=(IntProgress(value=0, max=2281740), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2281740), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2281740), HTML(value='')))




In [22]:
# Preview only; the frame holds one row per (query, candidate) pair.
xgb_train_df.head(10)

Unnamed: 0,queried_record_id,predicted_record_id,cosine_score
0,10127893-T1,10127893,1.000000
1,10127893-T1,10019372,0.984425
2,10127893-T1,10211614,0.984425
3,10127893-T1,10058787,0.977265
4,10127893-T1,10181531,0.976417
5,10127893-T1,10117117,0.976417
6,10127893-T1,10175281,0.976417
7,10127893-T1,10138368,0.976417
8,10127893-T1,10108944,0.975223
9,10127893-T1,10185271,0.975197


## The same for the real test set

In [23]:
# Load the real test set and apply the same preparation as for train:
# drop the unused columns and lowercase the names.
test = read_file("../oracle-polimi-contest-2019/test_data.csv")
test = test.drop(['modification', 'type'], axis=1)
test['name'] = test['name'].str.lower()

In [None]:
# For the final run the whole train set provides the candidate entities and the
# real test set provides the queries.
m_train_full = get_mcn_matrix_train(train)
m_test_full = create_name_letters_matrix(test)

In [25]:
# Sanity check: (number of linked entities, record_id + name + 26 letter columns).
m_train_full.shape

(267244, 28)

In [26]:
# Sanity check: (number of test records, record_id + name + 26 letter columns).
m_test_full.shape

(266955, 28)

In [28]:
# Same two steps on the real test set: the similarity matrix is saved to
# 'full_cosine_sim.npz', then flattened into the candidate table.
# NOTE(review): this assumes row i of the matrix corresponds to row i of `test`
# (i.e. `test` has a default 0..n-1 index) - confirm.
full_cosine_out = cosine_similarity(m_train_full, m_test_full, path='full_cosine_sim')
xgb_test_df = clean_cosine_output(full_cosine_out, test, m_train_full)



  0%|          | 0/266955 [00:00<?, ?it/s][A[A

Preprocessing:   0%|          | 0/266955 [00:00<?, ?it/s][A[A

Allocate memory per threads:   0%|          | 0/266955 [00:00<?, ?it/s][A[A

Computing:   0%|          | 533/266955 [00:03<29:34, 150.13it/s]       [A[A

Computing:   0%|          | 1066/266955 [00:06<28:51, 153.52it/s][A[A

Computing:   1%|          | 1599/266955 [00:10<29:23, 150.43it/s][A[A

Computing:   1%|          | 2132/266955 [00:14<29:01, 152.05it/s][A[A

Computing:   1%|          | 2665/266955 [00:17<28:44, 153.22it/s][A[A

Computing:   1%|          | 3198/266955 [00:21<29:14, 150.30it/s][A[A

Computing:   1%|▏         | 3731/266955 [00:24<28:57, 151.51it/s][A[A

Computing:   2%|▏         | 4264/266955 [00:28<28:58, 151.13it/s][A[A

Computing:   2%|▏         | 4797/266955 [00:31<28:41, 152.30it/s][A[A

Computing:   2%|▏         | 5330/266955 [00:34<28:18, 154.01it/s][A[A

Computing:   2%|▏         | 5863/266955 [00:37<27:58, 155.55it/s][A

Computing:  44%|████▎     | 116727/266955 [09:44<12:32, 199.61it/s][A[A

Computing:  44%|████▍     | 117260/266955 [09:47<12:29, 199.71it/s][A[A

Computing:  44%|████▍     | 117793/266955 [09:49<12:26, 199.80it/s][A[A

Computing:  44%|████▍     | 118326/266955 [09:51<12:23, 199.89it/s][A[A

Computing:  45%|████▍     | 118859/266955 [09:54<12:20, 199.97it/s][A[A

Computing:  45%|████▍     | 119392/266955 [09:56<12:17, 200.06it/s][A[A

Computing:  45%|████▍     | 119925/266955 [09:59<12:14, 200.14it/s][A[A

Computing:  45%|████▌     | 120458/266955 [10:01<12:11, 200.23it/s][A[A

Computing:  45%|████▌     | 120991/266955 [10:04<12:08, 200.31it/s][A[A

Computing:  46%|████▌     | 121524/266955 [10:06<12:05, 200.40it/s][A[A

Computing:  46%|████▌     | 122057/266955 [10:08<12:02, 200.48it/s][A[A

Computing:  46%|████▌     | 122591/266955 [10:11<11:59, 200.57it/s][A[A

Computing:  46%|████▌     | 123123/266955 [10:13<11:56, 200.65it/s][A[A

Computing:  46%|████▋    

Computing:  87%|████████▋ | 232921/266955 [18:42<02:44, 207.49it/s][A[A

Computing:  87%|████████▋ | 233454/266955 [18:45<02:41, 207.51it/s][A[A

Computing:  88%|████████▊ | 233987/266955 [18:47<02:38, 207.54it/s][A[A

Computing:  88%|████████▊ | 234520/266955 [18:49<02:36, 207.57it/s][A[A

Computing:  88%|████████▊ | 235053/266955 [18:52<02:33, 207.60it/s][A[A

Computing:  88%|████████▊ | 235586/266955 [18:54<02:31, 207.63it/s][A[A

Computing:  88%|████████▊ | 236119/266955 [18:57<02:28, 207.66it/s][A[A

Computing:  89%|████████▊ | 236652/266955 [18:59<02:25, 207.69it/s][A[A

Computing:  89%|████████▉ | 237185/266955 [19:01<02:23, 207.72it/s][A[A

Computing:  89%|████████▉ | 237718/266955 [19:04<02:20, 207.75it/s][A[A

Computing:  89%|████████▉ | 238251/266955 [19:06<02:18, 207.78it/s][A[A

Computing:  89%|████████▉ | 238784/266955 [19:09<02:15, 207.80it/s][A[A

Computing:  90%|████████▉ | 239317/266955 [19:11<02:12, 207.83it/s][A[A

Computing:  90%|████████▉

HBox(children=(IntProgress(value=0, max=2669510), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2669510), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2669510), HTML(value='')))




In [29]:
# Preview only; the frame holds one row per (query, candidate) pair.
xgb_test_df.head(10)

Unnamed: 0,queried_record_id,predicted_record_id,cosine_score
0,10051937-TST-MR,10110955,0.968152
1,10051937-TST-MR,10117193,0.956102
2,10051937-TST-MR,12212206,0.956102
3,10051937-TST-MR,12106617,0.952661
4,10051937-TST-MR,10036617,0.952661
5,10051937-TST-MR,10040198,0.952036
6,10051937-TST-MR,10203106,0.950654
7,10051937-TST-MR,10136888,0.947255
8,10051937-TST-MR,10211735,0.946762
9,10051937-TST-MR,10145409,0.945905


# Extract features

In [None]:
def adding_names(xgb_df, m_train, m_test):
    """Attach the candidate's and the query's name to every candidate pair.

    Parameters
    ----------
    xgb_df : pandas.DataFrame
        Must contain ``predicted_record_id`` and ``queried_record_id``.
    m_train : pandas.DataFrame
        Provides ``record_id`` / ``name`` for the predicted records.
    m_test : pandas.DataFrame
        Provides ``record_id`` / ``name`` for the queried records.

    Returns
    -------
    pandas.DataFrame
        ``xgb_df`` plus the columns ``predicted_record_name`` and
        ``queried_name``. Both merges use pandas' default (inner) join, so
        pairs whose id is missing from ``m_train`` / ``m_test`` are dropped.
    """
    # Merge from the xgb_df argument. The original merged from an undefined
    # name `df`, which raises NameError (or silently uses an unrelated global).
    xgb_df = xgb_df.merge(m_train[['record_id', 'name']], left_on='predicted_record_id', right_on='record_id').drop('record_id', axis=1)
    xgb_df = xgb_df.rename(columns={'name': 'predicted_record_name'})
    xgb_df = xgb_df.merge(m_test[['record_id', 'name']], left_on='queried_record_id', right_on='record_id').rename(columns={'name': 'queried_name'})
    xgb_df = xgb_df.drop('record_id', axis=1)
    return xgb_df

In [None]:
def extract_target(predicted, linked):
    """Return 1 where ``predicted`` equals ``linked`` element-wise, else 0.

    Parameters
    ----------
    predicted, linked : array-like
        Candidate ids and true linked ids, compared with ``==``.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels.
    """
    # The former np.empty() pre-allocation was dead code: np.where already
    # allocates and returns the result array.
    return np.where(predicted == linked, 1, 0)

def train_target(xgb_df_train, X_val):
    """Add the binary ``target`` column to the candidate table.

    Each candidate row is joined with the true ``linked_id`` of its queried
    record (looked up in ``X_val`` by ``record_id``); ``target`` is 1 when
    ``predicted_record_id`` equals that ``linked_id`` and 0 otherwise.

    Parameters
    ----------
    xgb_df_train : pandas.DataFrame
        Must contain ``queried_record_id`` and ``predicted_record_id``.
    X_val : pandas.DataFrame
        Must contain ``record_id`` and ``linked_id``.

    Returns
    -------
    pandas.DataFrame
        The joined table with ``target`` added; the helper columns
        ``record_id`` and ``linked_id`` are removed.
    """
    truth = X_val[['record_id', 'linked_id']]
    labelled = xgb_df_train.merge(truth, left_on='queried_record_id', right_on='record_id').drop('record_id', axis=1)

    # linked_id is cast to int before it is compared with predicted_record_id
    labelled['linked_id'] = labelled['linked_id'].astype(int)
    labelled['target'] = extract_target(labelled.predicted_record_id.values, labelled.linked_id.values)
    return labelled.drop('linked_id', axis=1)

In [None]:
def extract_editdistance(queried_name, predicted_name):
    """Compute the edit distance between paired names.

    Parameters
    ----------
    queried_name, predicted_name : sequence of str
        Position-indexable sequences (e.g. ``Series.values``); element ``i``
        of one is compared with element ``i`` of the other.

    Returns
    -------
    numpy.ndarray
        Float array with one ``editdistance.eval`` result per pair.
    """
    # Imported locally because no import of `editdistance` is visible anywhere
    # in the notebook; without it the call below raises NameError.
    import editdistance

    res = np.empty(len(queried_name))
    for i in tqdm(range(len(queried_name))):
        res[i] = editdistance.eval(queried_name[i], predicted_name[i])
    return res

In [None]:
# Label every candidate pair with the ground truth.
xgb_train_df = train_target(xgb_train_df, X_val)
# adding_names supplies the predicted_record_name / queried_name columns that the
# edit-distance feature reads; without this call those columns do not exist.
xgb_train_df = adding_names(xgb_train_df, m_train, m_test)
# Arguments follow the function's signature: (queried_name, predicted_name).
xgb_train_df['editdistance'] = extract_editdistance(xgb_train_df.queried_name.values, xgb_train_df.predicted_record_name.values)

In [None]:
# TODO: finish this part - add the same features to xgb_test_df as well

In [None]:
import xgboost as xgb

# Group sizes must follow the row order of the training frame. sort=False keeps
# the queries in order of first appearance, whereas the default sorts them by id.
# Assumes the rows of each query are contiguous in xgb_train_df.
group = xgb_train_df.groupby('queried_record_id', sort=False).size().values
ranker = xgb.XGBRanker()
# Train on xgb_train_df (the previously referenced `df_xgb` is not defined in this
# notebook) with exactly the feature columns that are used at prediction time.
ranker.fit(xgb_train_df[['predicted_record_id', 'cosine_score', 'editdistance']], xgb_train_df['target'], group=group)

In [None]:
# Get predictions

In [None]:
# The similarity column is named 'cosine_score' (set by clean_cosine_output);
# there is no 'score' column.
# NOTE: xgb_test_df must already carry an 'editdistance' column; nothing above adds it.
predictions = ranker.predict(xgb_test_df[['predicted_record_id', 'cosine_score', 'editdistance']])
xgb_test_df['predictions'] = predictions
# .copy() so that later column assignments on df_predictions do not write into a
# slice of xgb_test_df.
df_predictions = xgb_test_df[['queried_record_id', 'predicted_record_id', 'predictions']].copy()

# Extract Submission

In [None]:
# Pair every candidate id with its ranker score.
rec_pred = list(zip(df_predictions.predicted_record_id, df_predictions.predictions))
rec_pred[:10]  # preview only; the full list has one tuple per candidate row

In [None]:
# Collect, for every queried record, the list of its (candidate id, score) tuples.
df_predictions['rec_pred'] = rec_pred
group_queried = (
    df_predictions[['queried_record_id', 'rec_pred']]
    .groupby('queried_record_id')
    .apply(lambda grp: list(grp['rec_pred']))
)
# Back to a two-column frame: queried_record_id, rec_pred.
df_predictions = (
    pd.DataFrame(group_queried)
    .reset_index()
    .rename(columns={0: 'rec_pred'})
)

In [None]:
def reorder_preds(preds):
    """Rank each query's candidates by score, best first.

    Parameters
    ----------
    preds : sequence
        Position-indexable sequence; each element is a list of
        ``(candidate_id, score)`` tuples.

    Returns
    -------
    list of list
        For every element of ``preds``, the candidate ids ordered by
        decreasing score (ties keep their original order).
    """
    def _ids_by_score(candidates):
        ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
        return [pair[0] for pair in ranked]

    return [_ids_by_score(preds[idx]) for idx in range(len(preds))]

In [None]:
# Replace the (candidate id, score) lists by candidate ids ranked best-first and
# expose them under the submission column name.
df_predictions['ordered_preds'] = reorder_preds(df_predictions.rec_pred.values)
df_predictions = (
    df_predictions[['queried_record_id', 'ordered_preds']]
    .rename(columns={'ordered_preds': 'predicted_record_id'})
)

In [None]:
# Serialise each ranked candidate list as one space-separated string.
new_col = [' '.join(str(x) for x in t) for t in tqdm(df_predictions.predicted_record_id)]
new_col[:5]  # preview only instead of echoing the whole list

In [None]:
# Adding missing values
# Four hard-coded query ids, each given the same single fallback prediction; they
# are appended to df_predictions before the CSV is written.
# NOTE(review): presumably these queries are absent from the ranked output - confirm.
# NOTE(review): the ids here are ints while the other rows hold space-joined strings.
missing_values = {'queried_record_id' : ['12026587-TST-MR', '13009531-TST-MR', '12091134-TST-M', '12091134-NV0-TST-CP'], 
                 'predicted_record_id': [10111147, 10111147, 10111147, 10111147]}
missing_df = pd.DataFrame(missing_values)
missing_df

In [None]:
# Swap the ranked lists for their space-joined string form, then append the
# hard-coded fallback rows.
# The concatenated frame keeps both original indexes (labels repeat).
df_predictions.predicted_record_id = new_col
df_predictions = pd.concat([df_predictions, missing_df])

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
df_predictions.to_csv('xgb_sub2.csv', index=False)