In [1]:
import pandas as pd
import rltk
import re

wikicom = pd.read_csv("wikidata.csv", encoding='utf8')
jobscom = pd.read_csv("merged_jobs.csv", encoding='utf8')

In [2]:
# remove "(United States)" from the wiki company name
# lowercase
wikiList = [company.split(" (United States)")[0].lower() for company in wikicom['URILabel']]
jobsList = [company.lower() for company in jobscom['Company Name']]

In [3]:
wikicom['URILabel'] = wikiList
jobscom['Company Name'] = jobsList

In [4]:
# remove duplicates
wikidf = wikicom[['URILabel', 'URI', 'comURI']]
wikidf.drop_duplicates(keep='first',inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
len(wikidf)

19351

In [6]:
jobscom.columns

Index(['Company Name', 'Competitors', 'Founded', 'Headquarters', 'Industry',
       'Job Description', 'Job Title', 'Location', 'Rating', 'Revenue',
       'Salary Estimate', 'Sector', 'Size', 'Type of ownership', 'timestamp',
       'SE/DS', 'Website', 'Company Description'],
      dtype='object')

In [7]:
# remove duplicates
companydf = jobscom[['Company Name', 'Founded', 'Headquarters', 'Website']]
companydf.drop_duplicates(keep='first',inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
len(companydf)

1144

In [9]:
# sampling 100 company records (0.1%) from jobs sites 
companySample = companydf.sample(n=100)
companySample.sort_values(by= 'Company Name', axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last')

In [10]:
wikidf[:5]

Unnamed: 0,URILabel,URI,comURI
0,1-800 contacts,http://www.wikidata.org/entity/Q4545758,http://www.contacts.com
1,1-800-free-411,http://www.wikidata.org/entity/Q4545749,http://www.marchex.com/
2,1-800-get-thin,http://www.wikidata.org/entity/Q4545753,
3,1000memories,http://www.wikidata.org/entity/Q4546003,http://1000memories.com/
4,10q reports,http://www.wikidata.org/entity/Q43374781,https://www.10qreports.com


In [11]:
companySample[:5]

Unnamed: 0,Company Name,Founded,Headquarters,Website
9,abb,1891.0,"Zurich, Switzerland",http://www.abb.com/
13,act consulting,-1.0,"Owatonna, MN",-1
27,adyen n.v.,-1.0,-1,-1
29,agc biologics,-1.0,"Bothell, WA",-1
1347,"agema technology, inc",,,


In [12]:
wikidf.to_csv(r'wikidf.csv', index = True, header=True)
companySample.to_csv(r'companySample.csv', index = True, header=True)

# make ground truth file manually with above 2 files

In [13]:
import json
groundTruth = pd.read_csv("groundTruth.csv", encoding='utf8')
groundTruth.fillna(0, inplace =True)
with open('company-wikiurl.json') as f:
    prediction = json.load(f)

In [14]:
y_ground = list()
y_pred = list()
for i in range(len(groundTruth)):
    gt = groundTruth.iloc[i]
    if gt['wikiURI'] == 0:
        y_ground.append(0)
        if gt['Company Name'] in prediction:
            y_pred.append(1)   # false postive
        else:
            y_pred.append(0)   # true negative
        
    else:
        y_ground.append(1)
        if gt['Company Name'] in prediction:
            if gt['wikiURI'] == prediction[gt['Company Name']]: # true positive
                y_pred.append(1)
            else:
                y_pred.append(0)  # false negative 
                
        else: # false negative
            y_pred.append(0)    

In [15]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

cm = confusion_matrix(y_ground, y_pred) 
  
print('Confusion Matrix :')
print(cm) 
print('Accuracy Score :',accuracy_score(y_ground, y_pred))
print('Report : ')
print(classification_report(y_ground, y_pred))

Confusion Matrix :
[[74  0]
 [10 16]]
Accuracy Score : 0.9
Report : 
              precision    recall  f1-score   support

           0       0.88      1.00      0.94        74
           1       1.00      0.62      0.76        26

   micro avg       0.90      0.90      0.90       100
   macro avg       0.94      0.81      0.85       100
weighted avg       0.91      0.90      0.89       100

