In [1]:
# input files: companyRDF.csv, wikidata2.csv, cleaned_jobs.csv
# output file: companyJobsRDF.ttl
import pandas as pd
import re

df = pd.read_csv("companyRDF.csv")
df.fillna("missing!", inplace=True)

In [2]:
from rdflib import Graph, URIRef, Literal, XSD, Namespace
from rdflib.namespace import RDF

In [3]:
# http://www.ldf.fi/service/rdf-grapher

MYNS = Namespace('uri:xinting_myunghee:')
schema = Namespace('http://schema.org/')
my_kg = Graph()
my_kg.bind('my_ns', MYNS)
my_kg.bind('rdf', RDF)
my_kg.bind('schema', schema)
my_kg.bind('xsd', XSD)

In [4]:
wikidf = pd.read_csv("wikidata2.csv")

In [5]:
remove = {"-1"}
remove2 = {("-1", "-1")}
def getwiki(idxes):
    founder = set()
    parentOrganization = set()
    subOrganization = set()
    CEO = set()

    for idx in idxes:
        founder.add(wikidf.iloc[idx]["founded_byLabel"])
        parentOrganization.add((wikidf.iloc[idx]["parent"], wikidf.iloc[idx]["parentLabel"]))
        subOrganization.add((wikidf.iloc[idx]["subsidiary"], wikidf.iloc[idx]["subsidiaryLabel"]))
        CEO.add(wikidf.iloc[idx]["CEOLabel"])
    
    founder -= remove
    parentOrganization -= remove2 
    subOrganization -= remove2
    CEO -= remove 
        
    return founder, parentOrganization, subOrganization, CEO 

In [6]:
df.columns

Index(['Company Name', 'Competitors', 'Founded', 'Headquarters', 'Rating',
       'Revenue', 'Sector', 'Size', 'Type of ownership', 'Website',
       'Company Description', 'wikisite', 'uri_name'],
      dtype='object')

In [7]:
wikidf.columns

Index(['URI', 'URILabel', 'comURI', 'founded_by', 'founded_byLabel', 'CEO',
       'CEOLabel', 'parent', 'parentLabel', 'subsidiary', 'subsidiaryLabel',
       'total_revenue'],
      dtype='object')

In [8]:
# company RDF
for i in range(len(df)):
    node_uri = URIRef(MYNS[df.iloc[i]["uri_name"]])
    my_kg.add((node_uri, RDF.type, MYNS['company']))

    my_kg.add((node_uri, schema['name'], Literal(df.iloc[i]['Company Name'])))    
   
    if df.iloc[i]['Competitors'] != "missing!":
        my_kg.add((node_uri, MYNS['competitors'], Literal(df.iloc[i]['Competitors'])))
    
    if df.iloc[i]['Founded'] != "missing!":
        my_kg.add((node_uri, schema['foundingYear'], Literal(int(df.iloc[i]['Founded']))))

    if df.iloc[i]['Headquarters'] != "missing!":
        my_kg.add((node_uri, MYNS['headquarters'], Literal(df.iloc[i]['Headquarters'])))

    if df.iloc[i]['Rating'] != "missing!":
        my_kg.add((node_uri, schema['aggregateRating'], Literal(df.iloc[i]['Rating'])))

    if df.iloc[i]['Revenue'] != "missing!":
        my_kg.add((node_uri, MYNS['revenue'], Literal(df.iloc[i]['Revenue'])))

    if df.iloc[i]['Sector'] != "missing!":
        my_kg.add((node_uri, MYNS['sector'], Literal(df.iloc[i]['Sector'])))
    
    if df.iloc[i]['Size'] != "missing!":        
        my_kg.add((node_uri, schema['numberOfEmployees'], Literal(df.iloc[i]['Size'])))
    
    if df.iloc[i]['Type of ownership'] != "missing!":
        my_kg.add((node_uri, MYNS['ownershipType'], Literal(df.iloc[i]['Type of ownership'])))
    
    if df.iloc[i]['Company Description'] != "missing!":   
        my_kg.add((node_uri, schema['description'], Literal(df.iloc[i]['Company Description'])))
    
    wikisite = df.iloc[i]['wikisite']
    if wikisite != "missing!":  
        my_kg.add((node_uri, schema['sameAs'], Literal(wikisite)))
        idxes = wikidf.index[wikidf["URI"] == wikisite].tolist()
        
        if df.iloc[i]['Website'] != "missing!":
            my_kg.add((node_uri, schema['url'], Literal(df.iloc[i]['Website'])))
        else:
            comURI = wikidf.iloc[idxes[0]]["comURI"]
            if comURI != "-1":
                my_kg.add((node_uri, schema['url'], Literal(comURI)))
        
        founder, parentOrganization, subOrganization, CEO = getwiki(idxes)
        
        for fo in founder:
            my_kg.add((node_uri, schema['founder'], Literal(fo)))
        
        for ce in CEO:
            my_kg.add((node_uri, MYNS['ceo'], Literal(ce)))
            
        for paURI, parent in parentOrganization:
            parent_uri = URIRef(paURI)
            my_kg.add((node_uri, schema['parentOrganization'], parent_uri))
            my_kg.add((parent_uri, schema['name'], Literal(parent)))
        
        for subURI, sub in subOrganization:
            sub_uri = URIRef(subURI)
            my_kg.add((node_uri, schema['subOrganization'], sub_uri))
            my_kg.add((sub_uri, schema['name'], Literal(sub)))

In [9]:
# job RDF: industry, job description, job title, location, salary est, company, skills (job description)
jobdf = pd.read_csv("cleaned_jobs.csv")
jobdf.fillna("missing!", inplace=True)

In [10]:
jobdf.columns

Index(['Company Name', 'Competitors', 'Founded', 'Headquarters', 'Industry',
       'Job Description', 'Job Title', 'Location', 'Rating', 'Revenue',
       'Salary Estimate', 'Sector', 'Size', 'Type of ownership', 'timestamp',
       'SE/DS', 'Website', 'Company Description', 'wikisite', 'uri_name'],
      dtype='object')

In [11]:
selectCol = list(set(jobdf.columns) - set(df.columns))

In [12]:
selectCol

['Salary Estimate',
 'SE/DS',
 'Location',
 'Industry',
 'Job Description',
 'timestamp',
 'Job Title']

In [13]:
selectCol.remove('timestamp')
selectCol.append('uri_name')

In [14]:
selectCol

['Salary Estimate',
 'SE/DS',
 'Location',
 'Industry',
 'Job Description',
 'Job Title',
 'uri_name']

In [15]:
jobdf2 = jobdf[selectCol]

In [16]:
joburl = ["job"+str(i) for i in range(len(jobdf2))]
jobdf2['joburl'] = joburl

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [17]:
jobdf2.columns

Index(['Salary Estimate', 'SE/DS', 'Location', 'Industry', 'Job Description',
       'Job Title', 'uri_name', 'joburl'],
      dtype='object')

In [18]:
# jobs RDF
for i in range(len(jobdf2)):
    node_uri = URIRef(MYNS[jobdf2.iloc[i]["joburl"]])
    my_kg.add((node_uri, RDF.type, MYNS['job']))
    
    my_kg.add((node_uri, MYNS['job_title'], Literal(jobdf2.iloc[i]['Job Title'])))
    description = jobdf2.iloc[i]['Job Description'] 
    if description != "missing!":
        description = re.sub(r"[\r\n]", "", description)
        my_kg.add((node_uri, schema['description'], Literal(description)))
    
    if jobdf2.iloc[i]['Salary Estimate'] != "missing!":
        my_kg.add((node_uri, schema['salary_estimate'], Literal(jobdf2.iloc[i]['Salary Estimate'])))    
    
    my_kg.add((node_uri, MYNS['se_ds'], Literal(jobdf2.iloc[i]['SE/DS'])))    
   
    if jobdf2.iloc[i]['Location'] != "missing!":
        my_kg.add((node_uri, schema['location'], Literal(jobdf2.iloc[i]['Location'])))

    if jobdf2.iloc[i]['Industry'] != "missing!":
        my_kg.add((node_uri, MYNS['industry'], Literal(jobdf2.iloc[i]['Industry'])))
       
    company_uri = URIRef(MYNS[jobdf2.iloc[i]["uri_name"]])
    my_kg.add((node_uri, MYNS['company'], company_uri))

In [19]:
my_kg.serialize('companyJobsRDF.ttl', format="turtle")