In [1]:
import json
import re

import pandas as pd
import rltk

# Load the Wikidata company table and the merged job-postings table (both UTF-8 CSVs).
wikicom, jobscom = (
    pd.read_csv(csv_path, encoding='utf8')
    for csv_path in ("wikidata.csv", "merged_jobs.csv")
)

In [2]:
# Drop the " (United States)" qualifier (and anything after it) from each
# Wikidata label, then lowercase the remainder.
wikiList = [
    label.partition(" (United States)")[0].lower()
    for label in wikicom['URILabel']
]

In [3]:
# Overwrite the label column with the cleaned, lowercased names held in wikiList.
wikicom['URILabel'] = wikiList

In [4]:
# Order the Wikidata rows alphabetically by label and renumber them from 0,
# so that positional row lookups follow the sorted order.
wikicom = (
    wikicom
    .sort_values(by='URILabel', axis=0, ascending=True, kind='quicksort', na_position='last')
    .reset_index(drop=True)
)

In [5]:
# Replace every missing value with the sentinel -1 (getURL treats -1 as "no URL").
wikicom = wikicom.fillna(-1)

In [6]:
# Name columns as plain lists, in row order, so a list position can be used
# directly as a row position with .iloc.
comList = jobscom['Company Name'].tolist()
wikiList = wikicom['URILabel'].tolist()

# Distinct names in ascending order; the comparison loop relies on this ordering.
comset = sorted({*comList})
wikiset = sorted({*wikiList})

In [7]:
# Compare each job-posting company only against Wikidata labels that share its
# first character, and keep pairs whose Jaro-Winkler similarity exceeds 0.8.
simDic = {}
for jobName in comset:
    for wikiLabel in wikiset:
        if jobName[0] < wikiLabel[0]:
            # wikiset is sorted, so every remaining label starts with a later character.
            break
        if jobName[0] != wikiLabel[0]:
            continue
        score = rltk.jaro_winkler_similarity(jobName, wikiLabel)
        if score > 0.8:
            simDic[(jobName, wikiLabel)] = score

In [8]:
# Pass 1: accept every candidate pair whose similarity is exactly 1 and record
# job-company name -> value of wikicom column 0 (the `URI` column in the
# displayed frame) for the first row carrying the matched label.
match = dict()
matchedKey = list()
for (jobName, wikiLabel), score in simDic.items():
    if score == 1:
        # Look the row up by the Wikidata label (the name that actually lives in
        # wikiList) and read the cell with a single positional .iloc[row, col];
        # `row_series[0]` on a label-indexed row is deprecated positional access.
        match[jobName] = wikicom.iloc[wikiList.index(wikiLabel), 0]
        matchedKey.append(jobName)

In [9]:
# Keep only the candidate pairs in which neither name was already matched exactly.
exactNames = set(matchedKey)
new_dict = {
    pair: score
    for pair, score in simDic.items()
    if not (pair[0] in exactNames or pair[1] in exactNames)
}

In [10]:
# Number of candidate pairs left once exactly-matched names are excluded.
len(new_dict)

3274

In [11]:
def getCleanName(key):
    """Normalise a lowercase company name so trivially different spellings compare equal.

    Steps, in order:
      * drop the legal-form words " corporation", " incorporation", " incorporated";
      * drop "." and ",";
      * drop the abbreviations " llc", " inc", " corp", " co";
      * drop "&", replace " and " with a space, collapse runs of spaces and
        trim leading/trailing whitespace;
      * map "technologies" -> "technology" and "laboratories" -> "labs".

    Legal-form words are only removed when they end at a word boundary, so
    " co" no longer eats the start of "company"/"consulting" and " inc" no
    longer eats the start of "income".

    Parameters
    ----------
    key : str
        Company name; callers pass names that are already lowercase.

    Returns
    -------
    str
        The normalised name.
    """
    newkey = re.sub(r" corporation\b", "", key)
    newkey = re.sub(r" incorporat(?:ion|ed)\b", "", newkey)
    # Punctuation goes before the abbreviations so that "inc." / "co." are caught below.
    newkey = re.sub(r"[.,]", "", newkey)
    newkey = re.sub(r" (?:llc|inc|corp|co)\b", "", newkey)
    newkey = re.sub(r"&", "", newkey)
    newkey = re.sub(r" and ", " ", newkey)
    # Removing "&" or a suffix can leave doubled or dangling spaces behind.
    newkey = re.sub(r" {2,}", " ", newkey).strip()
    newkey = re.sub(r"technologies", "technology", newkey)
    newkey = re.sub(r"laboratories", "labs", newkey)
    return newkey

In [12]:
# Pass 2: for the remaining candidate pairs, compare the names after
# getCleanName normalisation and accept pairs that become identical
# (Jaro-Winkler similarity of exactly 1).
matchedKey0 = list()
matchedKey1 = list()
for jobName, wikiLabel in new_dict:
    sim = rltk.jaro_winkler_similarity(getCleanName(jobName), getCleanName(wikiLabel))
    if sim == 1:
        # Single positional lookup (.iloc[row, col]) instead of the deprecated
        # `row_series[0]`; column 0 is the `URI` column in the displayed frame.
        match[jobName] = wikicom.iloc[wikiList.index(wikiLabel), 0]
        matchedKey0.append(jobName)
        matchedKey1.append(wikiLabel)

In [13]:
# Number of job-company names mapped so far (exact pass + cleaned-name pass).
len(match)

128

In [14]:
# Drop every candidate pair whose job name or Wikidata label was consumed by the
# preceding matching pass.
jobDone, wikiDone = set(matchedKey0), set(matchedKey1)
new_dict = {
    pair: score
    for pair, score in new_dict.items()
    if not (pair[0] in jobDone or pair[1] in wikiDone)
}

In [15]:
# Candidate pairs still unresolved after the cleaned-name pass.
len(new_dict)

3131

In [16]:
def getURL(url):
    """Reduce a website URL to a bare comparison key.

    Removes ".com", ".org", "www.", the "http:"/"https:" scheme and finally
    every remaining ":", "/" and "." character, e.g.
    "https://www.23andme.com" -> "23andme".

    Parameters
    ----------
    url : str or other
        The URL to normalise. The sentinel -1 and any other non-string value
        (for example a NaN from a missing CSV cell) mean "no URL".

    Returns
    -------
    str or int
        The normalised key, or -1 when no URL is available.
    """
    if not isinstance(url, str):
        return -1
    # Dots are escaped so that only literal ".com" / ".org" / "www." are removed;
    # each step works on the result of the previous one.
    newurl = re.sub(r"\.com", "", url)
    newurl = re.sub(r"\.org", "", newurl)
    newurl = re.sub(r"www\.", "", newurl)
    newurl = re.sub(r"https?:", "", newurl)
    newurl = re.sub(r"[:/.]", "", newurl)
    return newurl

In [17]:
# Pass 3: among the remaining candidate pairs (name similarity > 0.8), accept
# those whose normalised job-side website equals the normalised Wikidata comURI.
# NOTE(review): column 1 of jobscom is assumed to hold the company website -
# confirm against merged_jobs.csv. Columns 0 and 2 of wikicom are `URI` and
# `comURI` in the displayed frame.
matchedKey0 = list()
matchedKey1 = list()
for jobName, wikiLabel in new_dict:
    # .iloc[row, col] reads one cell directly; `row_series[1]` on a
    # label-indexed row is deprecated positional access.
    website = getURL(jobscom.iloc[comList.index(jobName), 1])
    wikiRow = wikiList.index(wikiLabel)
    comURI = getURL(wikicom.iloc[wikiRow, 2])
    # -1 means "no URL"; an empty key carries no information either, so two
    # empty keys must not count as agreement.
    if website == comURI and website != -1 and website != "":
        match[jobName] = wikicom.iloc[wikiRow, 0]
        matchedKey0.append(jobName)
        matchedKey1.append(wikiLabel)

In [18]:
# Number of job-company names mapped after the website-based pass.
len(match)

128

In [19]:
# Drop every candidate pair whose job name or Wikidata label was consumed by the
# preceding matching pass.
jobDone, wikiDone = set(matchedKey0), set(matchedKey1)
new_dict = {
    pair: score
    for pair, score in new_dict.items()
    if not (pair[0] in jobDone or pair[1] in wikiDone)
}

In [20]:
# Candidate pairs still unresolved after the website-based pass.
len(new_dict)

3131

In [21]:
# List the unresolved pairs whose raw similarity is at least 0.9 for manual
# review, counting them in cnt along the way.
cnt = 0
for pair, score in new_dict.items():
    if not score >= 0.9:
        continue
    cnt += 1
    print(pair, score)


('acxiom', 'axiom') 0.9500000000000001
('acxiom', 'axiomx') 0.9
('alin', 'albion') 0.9111111111111111
('amica mutual', 'amica mutual insurance') 0.9090909090909091
('amrock', 'amerock') 0.9619047619047619
('applied memetics llc', 'applied stemcell') 0.9349999999999999
('applied research laboratories', 'applied behavioral research') 0.9015989782886334
('applied research laboratories', 'applied research associates') 0.9182950191570881
('applied research laboratories', 'applied research corporation') 0.9119458128078818
('applied research laboratories', 'applied science laboratories') 0.9029852216748768
('armis', 'artemis') 0.9238095238095239
('arrow electronics', 'argo electric') 0.9010558069381599
('assured information security, inc. (ais)', 'assured information security') 0.9400000000000001
('attainment company', 'attainment') 0.9111111111111111
('audible', 'audible.com') 0.9272727272727272
('audible', 'audiofile') 0.9047619047619048
('avant', 'advanta') 0.9142857142857143
('avant', 'av

('digitalocean', 'digital ocean'), ('peoplesbank', 'peoples bank'), ('nestle', 'nestlé'), 
('takeda pharmaceuticals', 'takeda pharmaceuticals north america')
('trilogy ed', 'trilogy')
('universal technologies (ny)', 'universal technology corporation')

# Spot-check one exact match: print columns 0-2 of its Wikidata row followed by
# columns 0-1 of its job-postings row.
key = matchedKey[12]
wikiRow = wikiList.index(key)
jobRow = comList.index(key)
# .iloc[row, col] reads each cell directly instead of the deprecated
# positional `row_series[n]` on a label-indexed row.
print(wikicom.iloc[wikiRow, 0])
print(wikicom.iloc[wikiRow, 1])
print(wikicom.iloc[wikiRow, 2])
print(jobscom.iloc[jobRow, 0])
print(jobscom.iloc[jobRow, 1])

In [22]:
# Display the accumulated mapping from job-company name to Wikidata entity URI.
match

{'23andme': 'http://www.wikidata.org/entity/Q216272',
 'abb': 'http://www.wikidata.org/entity/Q30338853',
 'acceleron pharma': 'http://www.wikidata.org/entity/Q16824238',
 'accuride international': 'http://www.wikidata.org/entity/Q4672917',
 'adient': 'http://www.wikidata.org/entity/Q26832896',
 'adobe': 'http://www.wikidata.org/entity/Q11463',
 'advanced bioscience laboratories': 'http://www.wikidata.org/entity/Q30253430',
 'agios pharmaceuticals': 'http://www.wikidata.org/entity/Q30286564',
 'altec': 'http://www.wikidata.org/entity/Q30255008',
 'amazon': 'http://www.wikidata.org/entity/Q3884',
 'amyris': 'http://www.wikidata.org/entity/Q4749580',
 'associated banc-corp': 'http://www.wikidata.org/entity/Q4809155',
 'assurant': 'http://www.wikidata.org/entity/Q4810399',
 'asurion': 'http://www.wikidata.org/entity/Q4811938',
 'autodesk': 'http://www.wikidata.org/entity/Q628051',
 'axon': 'http://www.wikidata.org/entity/Q30254995',
 'biogen': 'http://www.wikidata.org/entity/Q864338',
 'b

In [23]:
# All matched Wikidata URIs, in the insertion order of `match`.
uris = list(match.values())

In [24]:
# Positions of every wikicom row whose URI was matched to a job company.
# One vectorised membership test replaces building a row Series per row.
uriHit = wikicom['URI'].isin(uris)
rowList = [pos for pos, hit in enumerate(uriHit) if hit]

In [25]:
# Number of Wikidata rows whose URI is among the matched URIs.
len(rowList)

281

In [26]:
# Keep only the Wikidata rows at the positions collected in rowList.
df = wikicom.iloc[rowList]

In [27]:
# Renumber the selected rows from 0, discarding the positions inherited from wikicom.
df = df.reset_index(drop=True)

In [28]:
# Persist the job-company -> Wikidata-URI mapping as JSON. The context manager
# closes the file even if json.dump raises.
with open("company-wikiurl.json", mode="w") as f:
    json.dump(match, f)

In [29]:
# Write the selected Wikidata rows to wikidata2.csv with a header row and without the index.
df.to_csv(r'wikidata2.csv', index = False, header=True)

In [30]:
# Display the selected Wikidata rows.
df

Unnamed: 0,URI,URILabel,comURI,founded_by,founded_byLabel,CEO,CEOLabel,parent,parentLabel,subsidiary,subsidiaryLabel,owned_by,owned_byLabel,owner_of,owner_ofLabel,total_revenue
0,http://www.wikidata.org/entity/Q216272,23andme,https://www.23andme.com,http://www.wikidata.org/entity/Q2069573,Anne Wojcicki,http://www.wikidata.org/entity/Q2069573,Anne Wojcicki,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
1,http://www.wikidata.org/entity/Q216272,23andme,https://www.23andme.com,http://www.wikidata.org/entity/Q2069744,Linda Avey,http://www.wikidata.org/entity/Q2069573,Anne Wojcicki,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
2,http://www.wikidata.org/entity/Q216272,23andme,https://www.23andme.com,http://www.wikidata.org/entity/Q13377793,Paul Cusenza,http://www.wikidata.org/entity/Q2069573,Anne Wojcicki,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
3,http://www.wikidata.org/entity/Q30338853,abb,http://new.abb.com/us,-1,-1,-1,-1,http://www.wikidata.org/entity/Q52825,ABB Group,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
4,http://www.wikidata.org/entity/Q16824238,acceleron pharma,http://www.acceleron.com/,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
5,http://www.wikidata.org/entity/Q4672917,accuride international,http://www.accuride.com/,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
6,http://www.wikidata.org/entity/Q26832896,adient,http://www.adient.com,-1,-1,-1,-1,-1,-1,http://www.wikidata.org/entity/Q52853020,Adient Holding Czech Republic,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
7,http://www.wikidata.org/entity/Q26832896,adient,http://www.adient.com,-1,-1,-1,-1,-1,-1,http://www.wikidata.org/entity/Q52855351,Adient Bor,-1.0,-1.0,-1.0,-1.0,-1.000000e+00
8,http://www.wikidata.org/entity/Q11463,adobe,https://www.adobe.com/,http://www.wikidata.org/entity/Q1344392,John Warnock,http://www.wikidata.org/entity/Q2622748,Shantanu Narayen,-1,-1,http://www.wikidata.org/entity/Q16927817,Adobe Marketing Cloud,-1.0,-1.0,-1.0,-1.0,3.800000e+09
9,http://www.wikidata.org/entity/Q11463,adobe,https://www.adobe.com/,http://www.wikidata.org/entity/Q2959209,Charles Geschke,http://www.wikidata.org/entity/Q2622748,Shantanu Narayen,-1,-1,http://www.wikidata.org/entity/Q16927817,Adobe Marketing Cloud,-1.0,-1.0,-1.0,-1.0,3.800000e+09
