In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [None]:
from scipy import *
from scipy.sparse import *
import similaripy as sim
from sklearn.feature_extraction.text import CountVectorizer
import re

In [None]:
import sys
sys.path.append('../')
from xgb_dataset_generation import adding_features
import os

In [None]:
import lightgbm as lgb
import time

Prendiamo un elemento alla volta dal test, facciamo la similarità di quell'elemento ed estraiamo il top50. Quell'elemento poi verrà predetto. Una volta tirata fuori la predizione di quell'elemento, va aggiunto al train [di LightGBM]: non importa rifare la similarità perché valutando un elemento alla volta sappiamo già dalla similarità che facciamo se quell'elemento è nel train o no. 

In [None]:
# Load the original train/test splits, order them by record_id and give them
# a fresh 0..n-1 index.
df_train = (
    pd.read_csv("../dataset/original/train.csv", escapechar="\\")
    .sort_values(by='record_id')
    .reset_index(drop=True)
)
df_test = (
    pd.read_csv("../dataset/original/test.csv", escapechar="\\")
    .sort_values(by='record_id')
    .reset_index(drop=True)
)

# linked_id is handled as an integer key in the rest of the notebook.
df_train['linked_id'] = df_train['linked_id'].astype(int)

In [None]:
# The part of record_id before the first "-" is used as the record's linked_id.
df_test['linked_id'] = df_test.record_id.str.split("-").str[0].astype(int)

# Linked ids that appear in the test set but never in the training set, and
# the test rows that carry them (this frame keeps its linked_id column).
only_test = set(df_test.linked_id.values).difference(df_train.linked_id.values)
only_test_recordid = df_test.loc[df_test.linked_id.isin(only_test)]

# The helper column is removed again from the test frame itself.
df_test = df_test.drop(columns='linked_id')

In [None]:
# Load the `train_complete.csv` frame of each validation folder.
# NOTE(review): presumably the training data for the LightGBM ranker; none of
# these three frames is used in the cells visible here - confirm they are needed.
train1, train2, val = map(pd.read_csv, (
    "../dataset/validation_2/train_complete.csv",
    "../dataset/validation_3/train_complete.csv",
    "../dataset/validation/train_complete.csv",
))

In [8]:
def remove_spaces(s, n=3):
    """
    Collapse runs of spaces in a string and split it into character n-grams.
    : param s : the input string
    : param n : the n-gram length (default 3)
    : return : the list of all consecutive length-n substrings of the
               normalized (space-collapsed, stripped) string; empty when the
               string is shorter than n
    """
    normalized = re.sub(' +',' ',s).strip()
    if n <= 0:
        # no window can be built from a non-positive length
        return []
    window_count = len(normalized) - n + 1
    return [normalized[start:start + n] for start in range(window_count)]

In [9]:
def ngrams_name(test_record, df_train):
    """
    Jaccard similarity between the name of one test record and every training name.

    A fresh CountVectorizer (character n-grams from remove_spaces) is fitted on
    the training names plus the test name, so this is the slow variant of
    ngrams_name_fast.

    Side effects: the `name` column of df_train and the `name` entry of
    test_record are overwritten with their string form.

    : param test_record : a single record (row) exposing a `name` entry
    : param df_train : the training DataFrame with a `name` column
    : return : the similarity in CSR format, one row for the test record and
               one column per training row
    """
    df_train.name = df_train.name.astype(str)
    # The record is a single row, so its name is a scalar: scalars have no
    # .astype(), hence the plain str() conversion.
    test_record['name'] = str(test_record['name'])
    corpus = list(df_train.name)
    corpus.append(test_record['name'])
    vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
    X = vectorizer.fit_transform(corpus)
    # The training rows come first, the test record is the last row
    n_train = df_train.shape[0]
    X_train = X[:n_train,:]
    X_test = X[n_train:,:]
    # k=300 is forwarded to similaripy (presumably the neighbours kept per row)
    similarity = sim.jaccard(X_test, X_train.T, k=300)
    return similarity.tocsr()

In [10]:
def ngrams_name_fast(test_record, vectorizer, X_train):
    """
    Similarity of one test record's name against the training names, re-using
    an already fitted vectorizer.

    Steps: take the name of the test record, vectorize it, compute its Jaccard
    similarity with the training matrix, and return the training matrix
    extended with the new record's row.

    : param test_record : a single record (row) exposing a `name` entry
    : param vectorizer : the fitted name vectorizer
    : param X_train : the sparse n-gram matrix of the names seen so far
    : return : (similarity in CSR format, X_train with the vectorized test
               name stacked as a new last row)
    """
    # The training names are cast with astype(str) before fitting
    # (create_all_vectorizers); str() does the same here, so a missing name
    # becomes 'nan' instead of crashing the analyzer on a float.
    name = str(test_record['name'])
    X_test = vectorizer.transform([name])
    similarity = sim.jaccard(X_test, X_train.T, k=300).tocsr()
    return similarity, vstack([X_train,X_test])

In [11]:
def ngrams_address(test_record, df_train):
    """
    Jaccard similarity between the address of one test record and every
    training address, fitting a fresh vectorizer on both.

    Side effects: missing addresses are replaced by '' in df_train (column
    also cast to str) and in test_record.

    : param test_record : a single record (row) exposing an `address` entry
    : param df_train : the training DataFrame with an `address` column
    : return : the similarity in CSR format (test record x training rows)
    """
    df_train['address'] = df_train['address'].fillna('').astype(str)
    test_record['address'] = test_record.fillna({'address':''})['address']

    # Training documents first, the test record as last document
    documents = [*df_train['address'], test_record['address']]
    counts = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces).fit_transform(documents)

    n_train = df_train.shape[0]
    train_counts, test_counts = counts[:n_train,:], counts[n_train:,:]
    return sim.jaccard(test_counts, train_counts.T, k=300).tocsr()

In [12]:
def ngrams_address_fast(test_record, vectorizer, X_train):
    """
    Address similarity of one test record against the training matrix, using
    an already fitted vectorizer.

    Side effect: a missing address in test_record is replaced by ''.

    : param test_record : a single record (row) exposing an `address` entry
    : param vectorizer : the fitted address vectorizer
    : param X_train : the sparse n-gram matrix of the addresses seen so far
    : return : (similarity in CSR format, X_train with the test row appended)
    """
    test_record['address'] = test_record.fillna({'address':''})['address']
    address_counts = vectorizer.transform([test_record['address']])
    scores = sim.jaccard(address_counts, X_train.T, k=300).tocsr()
    extended_train = vstack([X_train,address_counts])
    return scores, extended_train

In [13]:
def ngrams_email(test_record, df_train):
    """
    Jaccard similarity between the email of one test record and every
    training email, fitting a fresh vectorizer on both.

    Side effects: missing emails are replaced by '' in df_train (column also
    cast to str) and in test_record.

    : param test_record : a single record (row) exposing an `email` entry
    : param df_train : the training DataFrame with an `email` column
    : return : the similarity in CSR format (test record x training rows)
    """
    df_train['email'] = df_train['email'].fillna('').astype(str)
    test_record['email'] = test_record.fillna({'email':''})['email']

    # Training documents first, the test record as last document
    documents = [*df_train['email'], test_record['email']]
    email_vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
    counts = email_vectorizer.fit_transform(documents)

    split_at = df_train.shape[0]
    scores = sim.jaccard(counts[split_at:,:], counts[:split_at,:].T, k=300)
    return scores.tocsr()

In [14]:
def ngrams_email_fast(test_record, vectorizer, X_train):
    """
    Email similarity of one test record against the training matrix, using an
    already fitted vectorizer.

    Side effect: a missing email in test_record is replaced by ''.

    : param test_record : a single record (row) exposing an `email` entry
    : param vectorizer : the fitted email vectorizer
    : param X_train : the sparse n-gram matrix of the emails seen so far
    : return : (similarity in CSR format, X_train with the test row appended)
    """
    test_record['email'] = test_record.fillna({'email':''})['email']
    email_counts = vectorizer.transform([test_record['email']])
    scores = sim.jaccard(email_counts, X_train.T, k=300).tocsr()
    extended_train = vstack([X_train,email_counts])
    return scores, extended_train

In [15]:
def convert_phones(df_in):
    """
    Return a copy of the frame whose phone column is rendered as text with
    everything from the first '.' onwards removed, e.g. the number
    12933000000.0 (1.2933E+10) becomes '12933000000'. Missing phones become ''.
    : param df_in : the original df with a `phone` column
    : return : a cleaned copy; df_in itself is left untouched
    """
    df = df_in.copy()
    phone_text = df['phone'].fillna('').astype(str)
    # keep only what precedes the first dot (drops the '.0' of float phones)
    df['phone'] = [text.partition('.')[0] for text in phone_text]
    return df

def ngrams_phone(test_record, df_train):
    """
    Jaccard similarity between the phone of one test record and every
    training phone, fitting a fresh vectorizer on both.

    Side effect: the `phone` entry of test_record is overwritten with its
    cleaned string form. df_train is not modified (convert_phones works on a
    copy).

    : param test_record : a single record (row) exposing a `phone` entry
    : param df_train : the training DataFrame with a `phone` column
    : return : the similarity in CSR format (test record x training rows)
    """
    # Manually convert the phone of the test record. pd.isna/str() work for
    # floats, strings and None alike, whereas np.isnan raises on non-numeric
    # values and plain Python floats have no .astype().
    raw_phone = test_record['phone']
    phone_text = '' if pd.isna(raw_phone) else str(raw_phone)
    # same cleaning as convert_phones: keep what precedes the first '.'
    test_record['phone'] = phone_text.split('.')[0]

    df_train = convert_phones(df_train)
    corpus = list(df_train.phone)
    corpus.append(test_record['phone'])
    vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
    X = vectorizer.fit_transform(corpus)
    # The training rows come first, the test record is the last row
    n_train = df_train.shape[0]
    X_train = X[:n_train,:]
    X_test = X[n_train:,:]
    similarity = sim.jaccard(X_test, X_train.T, k=300)
    return similarity.tocsr()

In [16]:
def ngrams_phone_fast(test_record, vectorizer, X_train):
    """
    Phone similarity of one test record against the training matrix, using an
    already fitted vectorizer.

    Side effect: the `phone` entry of test_record is overwritten with its
    cleaned string form ('' when missing).

    : param test_record : a single record (row) exposing a `phone` entry
    : param vectorizer : the fitted phone vectorizer
    : param X_train : the sparse n-gram matrix of the phones seen so far
    : return : (similarity in CSR format, X_train with the test row appended)
    """
    # Manually convert the phone of the test record. pd.isna/str() work for
    # floats, strings and None alike, whereas np.isnan raises on non-numeric
    # values and plain Python floats have no .astype().
    raw_phone = test_record['phone']
    phone_text = '' if pd.isna(raw_phone) else str(raw_phone)
    # same cleaning as convert_phones: keep what precedes the first '.'
    test_record['phone'] = phone_text.split('.')[0]

    X_test = vectorizer.transform([test_record['phone']])
    similarity = sim.jaccard(X_test, X_train.T, k=300)
    return similarity.tocsr(), vstack([X_train,X_test])

In [17]:
def expand_df(df):
    """
    Turn a frame with one row per query and list-valued candidate columns into
    a long frame with one row per (query, candidate) pair.

    : param df : frame with the scalar column `queried_record_id` and the
                 list-like columns `predicted_record_id`,
                 `predicted_record_id_record`, `cosine_score`, `name_cosine`,
                 `email_cosine`, `phone_cosine`, `address_cosine`,
                 `linked_id_idx`
    : return : the expanded frame with the same nine columns
    """
    # TODO rename predicted_record_id to predicted_linked_id and
    # 'predicted_record_id_record' to 'predicted_record_id'
    out_columns = ['queried_record_id', 'predicted_record_id', 'predicted_record_id_record',
                   'cosine_score', 'name_cosine',
                   'email_cosine', 'phone_cosine', 'address_cosine', 'linked_id_idx',
                   ]
    rows = []
    per_query = zip(df.queried_record_id, df.predicted_record_id, df.predicted_record_id_record, df.cosine_score,
                    df.name_cosine, df.email_cosine, df.phone_cosine, df.address_cosine, df.linked_id_idx)
    for query_id, *candidate_lists in tqdm(per_query):
        # the number of candidates is given by the predicted ids list
        n_candidates = len(candidate_lists[0])
        for pos in range(n_candidates):
            rows.append((query_id, *[values[pos] for values in candidate_lists]))

    return pd.DataFrame(rows, columns=out_columns)

In [18]:
def expand_similarities(test_record, vectorizer_name, vectorizer_email, vectorizer_phone, vectorizer_address, 
                        X_train_name, X_train_email, X_train_phone, X_train_address, k=50):
    """
    Build the top-k candidate frame for one test record.

    The name, email, phone and address similarities of the record against the
    training matrices are combined into a hybrid score
    (name + 0.2 * email + 0.2 * phone + 0.2 * address); the k best training
    rows are returned in long format together with the per-field scores.

    NOTE: the candidates' record_id / linked_id are looked up in the
    notebook-level `df_train`, whose row labels must match the row positions
    of the X_train_* matrices. The ngrams_*_fast helpers also normalise some
    fields of test_record in place.

    : param test_record : a single test record (row)
    : param vectorizer_* : the fitted vectorizers of each field
    : param X_train_* : the sparse n-gram matrices of each field
    : param k : the number of candidates to keep (default 50)
    : return : (expanded candidate frame, X_train_name, X_train_email,
               X_train_phone, X_train_address), each matrix extended with the
               row of the test record
    """
    sim_name, X_train_name_new = ngrams_name_fast(test_record, vectorizer_name, X_train_name)
    sim_email, X_train_email_new = ngrams_email_fast(test_record, vectorizer_email, X_train_email)
    sim_phone, X_train_phone_new = ngrams_phone_fast(test_record, vectorizer_phone, X_train_phone)
    sim_address, X_train_address_new = ngrams_address_fast(test_record, vectorizer_address, X_train_address)

    hybrid = (sim_name + 0.2 * sim_email + 0.2 * sim_phone + 0.2 * sim_address).tocsr()
    # Drop explicitly stored zeros so that .indices and .data describe exactly
    # the non-zero scores of the (single) query row.
    hybrid.eliminate_zeros()

    # Positions of the k highest hybrid scores, best first; indices and scores
    # are taken with the same ordering so they stay aligned.
    best = hybrid.data.argsort()[::-1][:k]
    indices = hybrid.indices[best]
    scores = hybrid.data[best]

    candidates = df_train[['record_id', 'linked_id']].loc[indices, :]

    # One row whose cells hold the per-candidate lists; expand_df unrolls it
    df = pd.DataFrame()
    df['queried_record_id'] = [test_record.record_id]
    df['predicted_record_id'] = [candidates['linked_id'].values]
    df['predicted_record_id_record'] = [candidates.record_id.values]
    df['cosine_score'] = [scores]
    df['name_cosine'] = [[sim_name[0, t] for t in indices]]
    df['email_cosine'] = [[sim_email[0, t] for t in indices]]
    df['phone_cosine'] = [[sim_phone[0, t] for t in indices]]
    # the address score must come from sim_address (it was read from sim_phone)
    df['address_cosine'] = [[sim_address[0, t] for t in indices]]
    df['linked_id_idx'] = [candidates.index]

    df_new = expand_df(df)

    return df_new, X_train_name_new, X_train_email_new, X_train_phone_new, X_train_address_new

In [19]:
def get_linked_id(new_row):
    """
    Set the `linked_id` of a record from the prefix of its `record_id`.

    The linked id is the part of `record_id` before the first "-", as an int.
    The value is computed first and written once, so the row never holds an
    intermediate list/string and is left unchanged if the prefix is not a
    valid integer.

    : param new_row : a single record (row) exposing a `record_id` entry
    : return : the same row object, with `linked_id` set
    : raises ValueError : when the record_id prefix is not an integer
    """
    linked_id = int(new_row.record_id.split("-")[0])
    new_row['linked_id'] = linked_id
    return new_row

In [20]:
def create_all_vectorizers(df_train):
    """
    Fit one character n-gram CountVectorizer per field (name, email, address,
    phone) on the training set.

    Side effects: the `name`, `email` and `address` columns of df_train are
    overwritten with their cleaned string form (missing emails/addresses
    become ''); the phone column is cleaned on a copy only.

    : param df_train : the training DataFrame
    : return : (vectorizer_name, vectorizer_email, vectorizer_address,
               vectorizer_phone, X_train_name, X_train_email, X_train_address,
               X_train_phone) - note the name, email, address, phone order
    """
    def fit_on(corpus):
        # NOTE(review): scikit-learn appears to ignore `preprocessor` when
        # `analyzer` is a callable - confirm before relying on it.
        vectorizer = CountVectorizer(preprocessor = remove_spaces, analyzer=remove_spaces)
        return vectorizer, vectorizer.fit_transform(corpus)

    # Name
    df_train['name'] = df_train['name'].astype(str)
    vectorizer_name, X_train_name = fit_on(list(df_train['name']))

    # Email
    df_train['email'] = df_train['email'].fillna('').astype(str)
    vectorizer_email, X_train_email = fit_on(list(df_train['email']))

    # Address
    df_train['address'] = df_train['address'].fillna('').astype(str)
    vectorizer_address, X_train_address = fit_on(list(df_train['address']))

    # Phone (convert_phones returns a cleaned copy)
    phones_clean = convert_phones(df_train)
    vectorizer_phone, X_train_phone = fit_on(list(phones_clean['phone']))

    return (vectorizer_name, vectorizer_email, vectorizer_address, vectorizer_phone,
            X_train_name, X_train_email, X_train_address, X_train_phone)

# Example

In [None]:
# Fit the four vectorizers and the matching training matrices.
(vectorizer_name, vectorizer_email, vectorizer_address, vectorizer_phone,
 X_train_name, X_train_email, X_train_address, X_train_phone) = create_all_vectorizers(df_train)

# Time the candidate expansion of a single test record (row label 2).
t1 = time.time()
(test_record_exp,
 X_train_name_new, X_train_email_new, X_train_phone_new, X_train_address_new) = expand_similarities(
    df_test.loc[2],
    vectorizer_name, vectorizer_email, vectorizer_phone, vectorizer_address,
    X_train_name, X_train_email, X_train_phone, X_train_address,
)
t2 = time.time()

# Elapsed seconds, shown as the cell output.
t2-t1

In [None]:
# Display the expanded candidate frame built for the example record.
test_record_exp

# Create Sequential Test Answers

Dobbiamo creare un test set sequenziale [con le relative risposte]: se due record che fanno riferimento allo stesso linked_id sono presenti solo nel test, il primo che arriva non ha riferimenti nel train ed è impossibile predirlo correttamente; viene però aggiunto al train, dunque quando il secondo record viene valutato la risposta corretta corrisponde al record aggiunto in precedenza.

In [None]:
# One row per linked_id, with the list of training record_ids that share it.
group_train = (
    df_train.groupby('linked_id')['record_id']
    .apply(list)
    .reset_index()
    .rename(columns={'record_id': 'record_ids'})
)

In [None]:
# Display the test frame before the linked_id column is added back.
df_test

In [None]:
# Re-create the linked_id column of the test set (it was dropped earlier):
# the integer prefix of record_id before the first '-'.
df_test['linked_id'] = df_test['record_id'].str.split('-').str[0].astype(int)

In [None]:
# Attach to every test record the training record_ids with the same linked_id
# (record_ids stays missing when the linked_id never occurs in the train).
df_test = pd.merge(df_test, group_train, on='linked_id', how='left')

In [None]:
def seq_labelling(df):
    """
    Label sequentially the test records that have no reference in the train.

    Rows whose `record_ids` is missing are scanned in order: the first record
    of each linked_id keeps a missing `record_ids`; every later record of the
    same linked_id receives the group list of that linked_id and is reported
    as relevant.

    NOTE: all later rows of a linked_id share the SAME list object, which
    keeps growing while scanning and finally holds every record of the group
    (the row's own record included). get_target relies on this when it cuts
    the list at the row's own record_id.

    : param df : frame with `record_id`, `linked_id` and `record_ids` columns
    : return : (frame with the rows that already had record_ids plus the
               re-labelled ones, sorted by index; list of the index labels of
               the rows that received a group). The re-labelled rows only
               carry `record_id` and `record_ids`.
    """
    missing_refs = df.record_ids.isna()
    df_notna = df[~missing_refs]
    df_na = df[missing_refs]

    new_group = {}      # linked_id -> record_ids seen so far for it
    new_df = []
    relevant_idx = []
    for (i, l, r) in tqdm(zip(df_na.index, df_na.linked_id, df_na.record_id)):
        # dict membership is O(1); a separate "seen" list made this scan quadratic
        if l not in new_group:
            new_df.append((i, r, np.nan))
            new_group[l] = [r]
        else:
            current_group = new_group[l]
            # the shared list is stored on purpose (see NOTE in the docstring)
            new_df.append((i, r, current_group))
            current_group.append(r)
            relevant_idx.append(i)
    res = pd.DataFrame(new_df, columns=['index','record_id', 'record_ids']).set_index('index')
    full_df = pd.concat([df_notna, res])
    full_df = full_df.sort_index()
    return full_df, relevant_idx

def get_target(df):
    """
    Build the sequential targets of the test records.

    After seq_labelling, every row that received a group list has that list
    cut just before its own record_id, so that only the records that arrived
    earlier remain as correct answers. All the other rows are kept as they
    are.

    : param df : frame with `record_id`, `linked_id` and `record_ids` columns
    : return : the frame sorted by index; the rows with a cut list only carry
               `record_id` and `record_ids`
    """
    labelled, repeated_idx = seq_labelling(df)
    untouched = labelled.loc[~labelled.index.isin(repeated_idx)]
    repeated = labelled.loc[repeated_idx]

    # keep, for each repeated record, only the group members preceding it
    cut_rows = [
        (label, record, group[:group.index(record)])
        for (label, record, group) in tqdm(zip(repeated.index, repeated.record_id, repeated.record_ids))
    ]
    cut_df = pd.DataFrame(cut_rows, columns=['index','record_id', 'record_ids']).set_index('index')
    return pd.concat([cut_df, untouched]).sort_index()

In [None]:
# Sequential ground truth for the test set, computed from the id columns only.
seq = get_target(
    df_test.loc[:, ['record_id', 'linked_id', 'record_ids']]
)

# Incremental Evaluation

## Nota
Invece di fare le predizioni su tutto il test, visto che sappiamo come performa l'algoritmo classico su quei record che hanno riferimenti nel train, possiamo restringere l'analisi ai record del test che non hanno riferimenti nel train e che hanno duplicati all'interno del test stesso [per quelli che non sono duplicati non ha senso fare tale analisi perché non forniscono nessun contributo alle query successive]

In [None]:
# Test-only records whose linked_id occurs more than once among the test-only
# records (every occurrence is kept).
duplicates_df = only_test_recordid.loc[
    only_test_recordid.duplicated(subset='linked_id', keep=False)
]

In [None]:
# Display the test-only records that share a linked_id with another one.
duplicates_df

In [None]:
# Row labels of the records to evaluate incrementally (iterated by the online loop).
idx = duplicates_df.index

In [None]:
def reorder_preds(preds):
    """
    Sort the candidates of every query by decreasing score.

    : param preds : a sequence with one entry per query; each entry is a
                    sequence of (linked_id, score, record_id) tuples
    : return : three lists (linked ids, scores, record ids), each with one
               inner list per query ordered from the best to the worst score;
               candidates with equal score keep their original order
    """
    ordered_lin, ordered_score, ordered_record = [], [], []
    for candidates in preds:
        ranked = sorted(candidates, key=lambda t: t[1], reverse=True)
        ordered_lin.append([cand[0] for cand in ranked])
        ordered_score.append([cand[1] for cand in ranked])
        ordered_record.append([cand[2] for cand in ranked])
    return ordered_lin, ordered_score, ordered_record

In [None]:
# Display the sparse n-gram matrix of the training names.
X_train_name

In [None]:
# Scratch check: vectorize an arbitrary string with the fitted name vectorizer
# and stack it under the training matrix, the same operation ngrams_name_fast
# performs for each new test record.
vgfsd = vectorizer_name.transform(['vgfsd'])
vstack([X_train_name,vgfsd])

In [None]:
# Re-load the training set: the loop below appends test records to df_train,
# so starting again from the file keeps this cell re-runnable.
df_train = pd.read_csv("../dataset/original/train.csv", escapechar="\\")
df_train = df_train.sort_values(by='record_id').reset_index(drop=True)
df_train.linked_id = df_train.linked_id.astype(int)
# Fit the vectorizers and the training matrices on the fresh training set
vectorizer_name, vectorizer_email, vectorizer_address, vectorizer_phone, X_train_name, X_train_email, X_train_address, X_train_phone= create_all_vectorizers(df_train)

# Online learning -> TODO: far too slow, about 7 hours (only idx[:10] is processed here)
# NOTE(review): `ranker` is not defined in any cell visible in this notebook;
# presumably a trained LightGBM model - it must be created/loaded before this cell.
# One single-row frame per processed record; still available if the loop is interrupted.
prediction_frames = []

for i in tqdm(idx[:10]):
    # Work on a private copy of the record: the similarity helpers normalise
    # some of its fields in place.
    test_row = duplicates_df.loc[i].copy()
    print(f'Record: {test_row.record_id}')

    # Candidate generation and feature creation; the training matrices come
    # back extended with the row of the current record.
    test_row_exp, X_train_name, X_train_email, X_train_phone, X_train_address = expand_similarities(
        test_row, vectorizer_name, vectorizer_email, vectorizer_phone, vectorizer_address,
        X_train_name, X_train_email, X_train_phone, X_train_address)
    test_row_exp = adding_features(test_row_exp, isValidation=False, path=os.path.join('..', 'dataset', 'original'), incremental_train=df_train)

    # Get predictions
    predictions = ranker.predict(test_row_exp.drop(['queried_record_id','linked_id_idx', 'predicted_record_id','predicted_record_id_record'], axis=1))
    test_row_exp['predictions'] = predictions
    # .copy(): rec_pred is added below and must not be written into a slice of test_row_exp
    df_predictions = test_row_exp[['queried_record_id', 'predicted_record_id', 'predicted_record_id_record', 'predictions']].copy()

    # Re-order predictions: collect (linked_id, score, record_id) per candidate,
    # gather them per queried record, then sort them by score.
    df_predictions['rec_pred'] = list(zip(df_predictions.predicted_record_id,
                                          df_predictions.predictions,
                                          df_predictions.predicted_record_id_record))
    df_predictions = df_predictions.groupby('queried_record_id')['rec_pred'].apply(list).reset_index()

    # Store predictions
    df_predictions['ordered_linked'], df_predictions['ordered_scores'], df_predictions['ordered_record'] = reorder_preds(df_predictions.rec_pred.values)
    prediction_frames.append(df_predictions)

    # Add the evaluated record to the training set, so that later records can
    # be matched against it. DataFrame.append no longer exists in pandas >= 2.0:
    # a one-row frame with inferred dtypes is concatenated instead.
    new_row = get_linked_id(test_row)
    df_train = pd.concat([df_train, new_row.to_frame().T.infer_objects()], ignore_index=True)

# Build the result once instead of growing a frame inside the loop
test_prediction = pd.concat(prediction_frames, ignore_index=True) if prediction_frames else pd.DataFrame()

In [None]:
# Display the ranked predictions collected by the online loop.
test_prediction

L'idea era che una volta calcolate le predizioni 'sequenziali', possiamo guardare quanto queste siano corrette matchandole con seq['record_ids']

# Problemi:
- Ci mette davvero troppo, soprattutto nel calcolo delle similarità e nell'estrazione delle top-N: bisogna trovare un modo più rapido.