In [2]:
import numpy as np
import pandas as pd
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import MinMaxScaler
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Root folder holding the input table and the temp/ working files.
# Set the WIKIFIER_HOME_DIR environment variable to run the notebook from
# another location; the default is the original hard-coded local checkout.
HOME_DIR = os.environ.get(
    'WIKIFIER_HOME_DIR',
    '/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files',
)
table_path = f'{HOME_DIR}/cricketers.csv'
wikify_column_name = "cricketers"  # column passed to `tl canonicalize -c`
final_score_column = "siamese_prediction"  # score column passed to get-kg-links / add-color

# Canonicalized table and generated candidates.
canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
# Comma-separated auxiliary fields requested during candidate generation.
aux_field = 'graph_embedding_complex,class_count,property_count'
# Folder passed as --auxiliary-folder during candidate generation.
temp_dir = f'{HOME_DIR}/temp/temp'

# Feature file, model artifacts and intermediate outputs.
# The ./models/... paths are relative to the notebook's working directory.
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/temp/apr_test.csv'
model_file_path = './models/weighted_lr.pkl'
ranking_model_file_path = './models/epoch_2_loss_0.09150885790586472_top1_0.9067796610169492.pth'
min_max_scaler_path = './models/normalization_factor.pkl'
model_voted_candidate_file_path = f'{HOME_DIR}/temp/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/temp/score_test.csv'

# lof-related feature files.
lof_reciprocal_rank_file_path = f'{HOME_DIR}/temp/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/temp/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/temp/lof_feature.csv'

# Final outputs.
output_model_pred_file = f'{HOME_DIR}/temp/model_prediction.csv'
top5_links = f'{HOME_DIR}/temp/top5_links.csv'
colorized_kg_links = f'{HOME_DIR}/temp/colorized_kg_links.xlsx'

# Merged per-field auxiliary files (one TSV per entry of aux_field).
graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
class_count_file = f'{HOME_DIR}/temp/class_count.tsv'
property_count_file = f'{HOME_DIR}/temp/property_count.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

# Same effect as `mkdir -p`, but safe for paths containing spaces and
# raises instead of failing silently when the folder cannot be created.
os.makedirs(temp_dir, exist_ok=True)

### Canonicalize

In [4]:
!tl canonicalize -c "$wikify_column_name" --add-context "$table_path" \
> "$canonical_file_path"

In [5]:
# Preview the first five rows of the canonical file.
pd.read_csv(canonical_file_path, nrows=5)

Unnamed: 0,column,row,label,context
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88
1,0,1,Tendulkar,mumbai indians|137|24/04/1973
2,0,2,Dhoni,chennai super kings|154|7/7/81
3,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93
4,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88


### Candidate Generation

In [6]:
!tl clean -c label -o label_clean "$canonical_file_path" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-exact-matches \
-c label_clean --auxiliary-fields "$aux_field" \
--auxiliary-folder "$temp_dir" > "$candidate_file_path"

In [7]:
# Column name each auxiliary field gets in its merged TSV.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count',
}

# Merge the per-method auxiliary files found in temp_dir into a single
# de-duplicated TSV per field under {HOME_DIR}/temp/.
for field in aux_field.split(','):
    # glob returns paths in arbitrary order; sorting makes the row kept by
    # drop_duplicates (the first occurrence of a qnode) reproducible.
    aux_paths = sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
    if not aux_paths:
        # Fail with an actionable message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError(
            f'No auxiliary files matching *{field}.tsv found in {temp_dir}; '
            'run the candidate generation cell first.'
        )
    aux_df = (
        pd.concat([pd.read_csv(path, sep='\t') for path in aux_paths])
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(f'{HOME_DIR}/temp/{field}.tsv', sep='\t', index=False)

In [8]:
# Preview the first five rows of the generated candidates file.
pd.read_csv(candidate_file_path, nrows=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.407864
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.486897
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.486897
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.586235
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.525568


In [9]:
!ls $temp_dir

exact_matches_class_count.tsv
exact_matches_graph_embedding_complex.tsv
exact_matches_property_count.tsv
fuzzy_augmented_class_count.tsv
fuzzy_augmented_graph_embedding_complex.tsv
fuzzy_augmented_property_count.tsv


### Generate lof-related features: lof-graph-embedding-score, lof-reciprocal-rank, lof-tfidf
##### Generate the 4 features required by the voting classifier

In [10]:
!tl align-page-rank $candidate_file_path \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method jaro_winkler -o jaro_winkler \
/ string-similarity -i --method levenshtein -o levenshtein \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard / smallest-qnode-number \
/ mosaic-features -c kg_labels --num-char --num-tokens \
/ create-singleton-feature -o singleton \
> $aligned_pagerank_candidate_file_path

In [11]:
# Load the feature file and preview the columns produced by the pipeline above.
features_df = pd.read_csv(aligned_pagerank_candidate_file_path)
feature_preview_columns = [
    'method',
    'pagerank',
    'aligned_pagerank',
    'smallest_qnode_number',
    'monge_elkan',
    'des_cont_jaccard_normalized',
]
features_df[feature_preview_columns].head()

Unnamed: 0,method,pagerank,aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized
0,exact-match,3.983031e-09,3.983031e-09,0,1.0,0.0
1,fuzzy-augmented,3.983031e-09,0.0,0,1.0,0.0
2,fuzzy-augmented,5.918546e-09,0.0,0,0.772222,0.0
3,fuzzy-augmented,3.740191e-09,0.0,0,0.640476,0.0
4,fuzzy-augmented,0.0,0.0,0,0.75,0.0


##### Generate the model-voted candidates

In [12]:
!tl vote-by-classifier $aligned_pagerank_candidate_file_path \
--prob-threshold 0.995 \
--model $model_file_path \
> $model_voted_candidate_file_path

In [13]:
# Load the classifier output and preview its first five rows.
model_voted_df = pd.read_csv(model_voted_candidate_file_path)
model_voted_df.head(5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,monge_elkan,jaro_winkler,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,1.0,1.0,1.0,0.0,0.0,0,11,2,1,0
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,1.0,1.0,1.0,0.0,0.0,0,11,2,0,0
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,0.772222,0.569697,0.181818,0.0,0.0,0,11,2,0,0
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,0.640476,0.55711,0.230769,0.0,0.0,0,13,2,0,0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,0.75,0.890909,0.454545,0.0,0.0,0,5,1,0,0


##### Generate graph-embedding-score using centroid-of-lof and lof-strategy

In [14]:
!tl score-using-embedding $model_voted_candidate_file_path \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o lof-graph-embedding-score \
--embedding-file $graph_embedding_complex_file \
--embedding-url $index_url \
> $graph_embedding_file_path

Qnodes to lookup: 1260
Qnodes from file: 1241
Qnodes from server: 0
_centroid_of_lof: Missing 1 of 8
Outlier removal generates 4 lof-voted candidates


In [15]:
# Load the embedding-scored candidates and preview the first five rows.
score_df = pd.read_csv(graph_embedding_file_path)
score_df.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,levenshtein,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,1.0,0.0,0.0,0,11,2,1,0,-1,0.804745
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,1.0,0.0,0.0,0,11,2,0,0,-1,0.804745
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,0.181818,0.0,0.0,0,11,2,0,0,-1,0.440108
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,0.230769,0.0,0.0,0,13,2,0,0,-1,0.580851
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,0.454545,0.0,0.0,0,5,1,0,0,-1,0.0


In [16]:
# Show the 20 candidates with the highest lof-graph-embedding-score, together
# with the voting / lof flags.
lof_score_columns = [
    'kg_id',
    'kg_labels',
    'kg_descriptions',
    'method',
    'singleton',
    'vote_by_classifier',
    'is_lof',
    'lof-graph-embedding-score',
]
ranked_by_lof_score = score_df.sort_values(by=['lof-graph-embedding-score'], ascending=False)
ranked_by_lof_score.loc[:, lof_score_columns].head(20)

Unnamed: 0,kg_id,kg_labels,kg_descriptions,method,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
141,Q142613,Cheteshwar Pujara,Indian cricket player,fuzzy-augmented,0,1,1,0.966947
140,Q142613,Cheteshwar Pujara,Indian cricket player,exact-match,1,1,1,0.966947
667,Q137669,Ajinkya Rahane,Indian cricketer,fuzzy-augmented,0,1,1,0.965528
666,Q137669,Ajinkya Rahane,Indian cricketer,exact-match,1,1,1,0.965528
241,Q3522062,Ishant Sharma,Indian cricket player.,exact-match,1,0,-1,0.926515
242,Q3522062,Ishant Sharma,Indian cricket player.,fuzzy-augmented,0,0,-1,0.926515
1237,Q7336038,Rishi Dhawan,Indian cricketer,fuzzy-augmented,0,0,-1,0.920234
891,Q2003153,Bhuvneshwar Kumar,Indian cricket player,fuzzy-augmented,0,0,-1,0.907008
890,Q2003153,Bhuvneshwar Kumar,Indian cricket player,exact-match,1,0,-1,0.907008
579,Q16227998,Jasprit Bumrah,cricketer,fuzzy-augmented,0,0,-1,0.903476


##### Generate lof reciprocal rank feature

In [17]:
!tl generate-reciprocal-rank "$graph_embedding_file_path" \
-c lof-graph-embedding-score \
-o lof-reciprocal-rank \
> "$lof_reciprocal_rank_file_path"

In [18]:
# Preview the first five rows of the file that now carries lof-reciprocal-rank.
pd.read_csv(lof_reciprocal_rank_file_path, nrows=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,num_char,num_tokens,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score,lof-reciprocal-rank
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,0.0,0.0,0,11,2,1,0,-1,0.804745,0.333333
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,0.0,0.0,0,11,2,0,0,-1,0.804745,0.25
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,0.0,0.0,0,11,2,0,0,-1,0.440108,0.013514
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,0.0,0.0,0,13,2,0,0,-1,0.580851,0.028571
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,0.0,0.0,0,5,1,0,0,-1,0.0,0.009901


##### Generate lof tfidf feature

In [19]:
!tl compute-tf-idf "$lof_reciprocal_rank_file_path" \
--feature-file "$class_count_file" \
--feature-name class_count \
--singleton-column is_lof \
-o lof_class_count_tf_idf_score \
/ compute-tf-idf \
--feature-file "$property_count_file" \
--feature-name property_count \
--singleton-column is_lof \
-o lof_property_count_tf_idf_score \
> "$lof_feature_file"

In [20]:
# Preview the first five rows of the tf-idf feature file. The frame is kept in
# `d` as before, and is now also displayed like the previews after the earlier
# pipeline stages (previously the cell produced no output).
d = pd.read_csv(lof_feature_file, nrows=5)
d

### Model Prediction

In [3]:
# Display which ranking-model checkpoint is passed to `tl predict-using-model` below.
ranking_model_file_path

'./models/epoch_2_loss_0.09150885790586472_top1_0.9067796610169492.pth'

In [21]:
!tl predict-using-model -o siamese_prediction \
--ranking_model $ranking_model_file_path \
--normalization_factor $min_max_scaler_path $lof_feature_file > $output_model_pred_file

### Get Top 5 links

In [22]:
!tl get-kg-links -c $final_score_column -k 5 --k-rows $output_model_pred_file > $top5_links

In [23]:
# Remove pandas' row display limit (stays in effect for the rest of the
# session) so the whole top-5 table is rendered.
pd.set_option('display.max_rows', None)
final_output = pd.read_csv(top5_links)
link_columns = [
    'column',
    'row',
    'label',
    'context',
    'kg_id',
    'kg_labels',
    'kg_aliases',
    'kg_descriptions',
    'siamese_prediction',
]
final_output[link_columns]

Unnamed: 0,column,row,label,context,kg_id,kg_labels,kg_aliases,kg_descriptions,siamese_prediction
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Q213854,Virat Kohli,Cheeku,Indian cricket player,1.0
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Q4792485,Armaan Kohli,,Indian actor,0.0001384772
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Q19843060,Rahul Kohli,,British actor,4.890681e-06
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Q17306158,Rochak Kohli,,Indian musician,7.318446e-07
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Q7260793,Purab Kohli,,Indian actor,5.572112e-07
5,0,1,Tendulkar,mumbai indians|137|24/04/1973,Q9488,Sachin Tendulkar,Master Blaster|Sachin Ramesh Tendulkar,Indian former cricketer,0.9998897
6,0,1,Tendulkar,mumbai indians|137|24/04/1973,Q3630378,Priya Tendulkar,,Marathi actress and social activist,9.283524e-07
7,0,1,Tendulkar,mumbai indians|137|24/04/1973,Q55744,Vijay Tendulkar,Vijay Dhondopant Tendulkar,Indian writer,7.993235e-07
8,0,1,Tendulkar,mumbai indians|137|24/04/1973,Q7645792,Suresh Tendulkar,,Indian economist,5.513932e-07
9,0,1,Tendulkar,mumbai indians|137|24/04/1973,Q22327439,Arjun Tendulkar,,cricketer,3.88088e-07


### Colorized KG Links file

In [24]:
!tl add-color -c $final_score_column -k 5 $top5_links --output $colorized_kg_links