In [1]:
import pandas as pd
import similaripy as sim
from scipy import *
from scipy.sparse import *
from tqdm import tqdm
import numpy as np

import re
import string as string_lib
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def ngrams(string, n=3):
    """Split a string into overlapping character n-grams after normalizing it.

    Normalization, in order:
      1. drop every non-ASCII character;
      2. lower-case the text;
      3. delete brackets, pipes, apostrophes, commas, hyphens, periods and
         slashes;
      4. collapse runs of spaces into one and trim both ends;
      5. pad with a single space on each side, so the first and last
         characters also appear in boundary n-grams.

    Punctuation is removed *before* whitespace is collapsed: deleting a
    free-standing "-" or "/" (as in "a - b") would otherwise leave a double
    space behind and produce n-grams that never match the same name written
    without the separator.

    Parameters
    ----------
    string : str
        Text to tokenize.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        The n-grams in left-to-right order; empty when the padded text is
        shorter than ``n``.
    """
    text = string.encode("ascii", errors="ignore").decode()  # remove non-ASCII chars
    text = text.lower()
    # The comma, hyphen, period and slash used to be removed by the range
    # class "[,-/]" (which spans exactly , - . /); they are listed explicitly
    # here so the set of stripped characters is visible at a glance.
    chars_to_remove = [")", "(", "|", "[", "]", "{", "}", "'", ",", "-", ".", "/"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    text = re.sub(rx, '', text)
    text = re.sub(' +', ' ', text).strip()  # single spaces only, no leading/trailing space
    text = ' ' + text + ' '  # pad so word boundaries get their own n-grams
    # zip over n progressively shifted copies yields every window of length n.
    windows = zip(*[text[i:] for i in range(n)])
    return [''.join(window) for window in windows]

In [5]:
# Sanity check: with the default n=3 a four-letter word, padded with one space
# on each side, yields four trigrams.
ngrams('ciao')

[' ci', 'cia', 'iao', 'ao ']

In [6]:
# Load both splits; the CSV files use a backslash as escape character.
# ALWAYS keep the rows ordered by record_id, with a fresh 0..N-1 index, so
# that positional lookups stay consistent.
df_train = (
    pd.read_csv("../dataset/original/train.csv", escapechar="\\")
    .sort_values(by=['record_id'])
    .reset_index(drop=True)
)
df_test = (
    pd.read_csv("../dataset/original/test.csv", escapechar="\\")
    .sort_values(by=['record_id'])
    .reset_index(drop=True)
)

In [8]:
# Force the name column to str in both frames so ngrams() always receives text.
# Note: astype(str) turns missing names into the literal string 'nan'.
df_train['name'] = df_train['name'].astype(str)
df_test['name'] = df_test['name'].astype(str)

In [10]:
# Tokenize every training name into character n-grams (tqdm shows progress).
corpus_train = list(map(ngrams, tqdm(df_train['name'].tolist())))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 691440/691440 [00:07<00:00, 96437.22it/s]


In [11]:
# Tokenize every test name into character n-grams (tqdm shows progress).
corpus_test = list(map(ngrams, tqdm(df_test['name'].tolist())))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 266955/266955 [00:02<00:00, 96040.17it/s]


In [12]:
from rank_bm25 import BM25Okapi

In [13]:
# Build the BM25 (Okapi) index over the tokenized training names.
bm25 = BM25Okapi(corpus_train)

In [14]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,record_id,name,type,address,phone,email,modification
0,10000003-TST-MR,"HOTFOCUS CO., LTD.",entity,,440157400000.0,consulting@outlook.ch,move unique
1,10000008-TST-M,BONUS TRADE LIMITED,entity,,,help@gmail.gov,move row
2,10000010-TST-CP,NEW IDEA LIMITED,entity,,19124690000.0,,identical copy
3,10000013-TST-MR,VICTORY GROUP LIMITED,entity,,19495060000.0,,move unique
4,10000016-TST-MR,"NINGBO RAPID INTERNATIONAL TRADING CO., LTD.",entity,,444651200000.0,info@outlook.cz,move unique


In [30]:
# Smoke-test retrieval: tokenize one test name and ask BM25 for the single
# best-scoring training document.
# NOTE(review): the documents passed in are the n-gram lists themselves, so
# the result is a token list rather than a readable name; passing the list of
# training names instead would presumably be easier to inspect — confirm.
m = ngrams('VICTORY GROUP LIMITED')
bm25.get_top_n(m, corpus_train, n=1)

[[' wi',
  'win',
  'in ',
  'n v',
  ' vi',
  'vic',
  'ict',
  'cto',
  'tor',
  'ory',
  'ry ',
  'y g',
  ' gr',
  'gro',
  'rou',
  'oup',
  'up ',
  'p l',
  ' li',
  'lim',
  'imi',
  'mit',
  'ite',
  'ted',
  'ed ']]

In [25]:
# Lower-case the training names in place, so that prefix searches on the
# column can be written with lower-case literals.
df_train['name'] = df_train['name'].str.lower()

In [29]:
# Show the training rows whose name begins with 'ningbo e'.
df_train.loc[df_train['name'].str.startswith('ningbo e')]

Unnamed: 0,record_id,name,type,address,phone,email,modification,linked_id
1062,10000596,"ningbo everwin int'l co., ltd.",entity,,338712700000.0,help@Ningbo.Everwin.Intl.Co.Ltd.de,original,10000596
9644,10005311,ningbo enlly trade limited,entity,,3981359000.0,,original,10005311
9645,10005311-M0,ningbo enlly trade limited,entity,,3981359000.0,,missing,10005311
9646,10005311-M1,ningbo enlly trade limited,entity,,,,missing,10005311
19655,10010930,"ningbo e.t.d. hotfocus co., ltd.",entity,,420739400000.0,,original,10010930
19656,10010930-NV0,"ningbo e.t.d. hotfocus co., ltd.",entity,AKARA BUILDING 24; DE CASTRO STREET; WICKHAM'S...,332737300000.0,,new_value,10010930
19657,10010930-NV1,"ningbo e.t.d. hotfocus co., ltd.",entity,Humahuaca 4647 - 8p; Buenos Aires; Argentina,332737300000.0,,new_value,10010930
19659,10010930-T1,"ningbo e.t.d. hotfocus co., ltd.",entity,,420739400000.0,,typo,10010930
33198,10018485,"ningbo easy houseware co., ltd.",entity,,448995000000.0,,original,10018485
33199,10018485-NV0,"ningbo easy houseware co., ltd.",entity,,448995000000.0,,new_value,10018485


In [7]:
def _phone_to_text(phone):
    """Render one phone cell as text: '' when missing, plain digits for whole floats."""
    if pd.isna(phone):
        return ''
    if isinstance(phone, float) and phone.is_integer():
        return str(int(phone))  # 440157400000.0 -> '440157400000'
    return str(phone)


# One list with every phone (train rows first, then test rows) so a single
# TF-IDF vocabulary is fitted on both splits.
# ngrams() calls str.encode(), so it cannot take the float/NaN cells that
# read_csv produces for this column; convert each cell to text first. Missing
# phones become '' and therefore contribute no n-grams at all.
all_phones = [_phone_to_text(p) for p in list(df_train.phone) + list(df_test.phone)]
# Fit TF-IDF, using ngrams() as the analyzer that turns each phone into tokens.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(all_phones)

In [8]:
# Split the stacked TF-IDF matrix back into its two parts. Training rows come
# first because all_phones was built as train + test, so the boundary is the
# number of training rows (derived from the frame instead of hard-coded).
n_train = len(df_train)
tf_idf_train = tf_idf_matrix[:n_train, :]
tf_idf_test = tf_idf_matrix[n_train:, :]
# Jaccard similarity of every test row against every training row.
# NOTE(review): k=300 presumably keeps the 300 best training matches per test
# row — confirm against the similaripy documentation.
jac = sim.jaccard(tf_idf_test, tf_idf_train.T, k=300)
# Persist the result as CSR so it can be reloaded without recomputing.
save_npz('jaccard_tfidf_phone.npz', jac.tocsr())

Computing:   8%|████▏                                                   | 20254/266955 [01:12<14:44, 278.92it/s]

KeyboardInterrupt: 