# Setting up the Knowledge Graph Datasets

In [1]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, concat
# Create (or reuse) the notebook's SparkSession, registered under the app name "ch5".
# The indexing cells below read their CSV files through `spark.read`.
spark = SparkSession.builder.appName("ch5").getOrCreate()

## Download the Datasets

In [2]:
#jobs
![ ! -d 'jobs' ] && git clone --depth=1 https://github.com/ai-powered-search/jobs.git
! cd jobs && git pull
! cd jobs && mkdir -p '../../data/jobs/' && tar -xvf jobs.tgz -C '../../data/jobs/'    

#health
![ ! -d 'health' ] && git clone --depth=1 https://github.com/ai-powered-search/health.git
! cd health && git pull
! cd health && mkdir -p '../../data/health/' && tar -xvf health.tgz -C '../../data/health/'

#scifi
![ ! -d 'scifi' ] && git clone --depth=1 https://github.com/ai-powered-search/scifi.git
! cd scifi && git pull
! cd scifi && mkdir -p '../../data/scifi/' && tar -xvf scifi.tgz -C '../../data/scifi/' 

#cooking
![ ! -d 'cooking' ] && git clone --depth=1 https://github.com/ai-powered-search/cooking.git
! cd cooking && git pull
! cd cooking && mkdir -p '../../data/cooking/' && tar -xvf cooking.tgz -C '../../data/cooking/'

#outdoors
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../../data/outdoors/' && tar -xvf outdoors.tgz -C '../../data/outdoors/'

#outdoors
![ ! -d 'travel' ] && git clone --depth=1 https://github.com/ai-powered-search/travel.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/travel/' && tar -xvf travel.tgz -C '../../data/travel/'

#devops
![ ! -d 'devops' ] && git clone --depth=1 https://github.com/ai-powered-search/devops.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/devops/' && tar -xvf travel.tgz -C '../../data/devops/'

Cloning into 'jobs'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 6 (delta 0), reused 3 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.
Already up to date.
._jobs.csv
jobs.csv
Cloning into 'health'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 3 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.
Already up to date.
._posts.csv
posts.csv
Cloning into 'scifi'...
remote: Enumerating objects: 6, done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 6[K
Unpacking objects: 100% (6/6), done.
Already up to date.
._posts.csv
posts.csv
Cloning into 'cooking'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (del

## Index the Jobs Dataset into the Search Engine

In [3]:
# Create the Jobs collection
jobs_collection="jobs"
create_collection(jobs_collection)

# Adjust the schema so these fields are explicitly searchable by keyword
for text_field in ("company_country", "job_description", "company_description"):
    upsert_text_field(jobs_collection, text_field)

print("Loading Jobs...")
csvFile = "../data/jobs/jobs.csv"
# Read the CSV with a header row, inferred column types, and double-quote
# quoting/escaping so quoted values may span multiple lines.
jobs_csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("charset", "utf-8")
    .option("quote", "\"")
    .option("escape", "\"")
    .option("multiLine", "true")
    .option("delimiter", ",")
)
csvDF = jobs_csv_reader.load(csvFile)

# Write settings handed to the "solr" data source
jobs_update_opts = {
    "zkhost": "aips-zk",
    "collection": jobs_collection,
    "gen_uniq_key": "true",
    "commit_within": "5000",
}
jobs_writer = csvDF.write.format("solr").options(**jobs_update_opts).mode("overwrite")
jobs_writer.save()

print("Jobs Schema: ")
csvDF.printSchema()
print("Status: Success")

Wiping 'jobs' collection
[('action', 'CREATE'), ('name', 'jobs'), ('numShards', 1), ('replicationFactor', 1)]
Creating 'jobs' collection
Status: Success
Adding 'company_country' field to collection
Status: Success
Adding 'job_description' field to collection
Status: Success
Adding 'company_description' field to collection
Status: Success
Loading Jobs...
Jobs Schema: 
root
 |-- job_title: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- job_type: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job_location: string (nullable = true)
 |-- job_city: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- job_country: string (nullable = true)
 |-- job_zip_code: string (nullable = true)
 |-- job_address: string (nullable = true)
 |-- min_salary: string (nullable = true)
 |-- max_salary: string (nullable = true)
 |-- salary_period: string (nullable = true)
 |-- apply_url: string (nullable = true)
 |-- apply_email: string (nullab

## Index StackExchange datasets: health, scifi, cooking, outdoors, travel, devops

In [4]:
def index_stack_exchange_dataset(collection, dataset):
    """Load one StackExchange posts CSV and write it to a search collection.

    Reads ``../data/<dataset>/posts.csv`` with Spark, adds a ``category``
    column holding the dataset name, drops the CSV's own ``id`` column, and
    writes the result through the "solr" data source into ``collection``.

    Args:
        collection: Name of the target collection.
        dataset: Dataset name; selects the data directory and becomes the
            value of the ``category`` column.
    """
    print(f"Loading '{dataset}' dataset into collection '{collection}'...")
    posts_path = "../data/" + dataset + "/posts.csv"
    posts_df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .option("charset", "utf-8")
        .option("quote", "\"")
        .option("escape", "\"")
        .option("multiLine", "true")
        .option("delimiter", ",")
        .load(posts_path)
    )

    # IDs are generated automatically on write, so the source id column is dropped.
    # To build the IDs ourselves instead, replace the .drop("id") with:
    #   .withColumn("id", concat(col("category"), lit("_"), col("id")))
    categorized_df = posts_df.withColumn("category", lit(dataset)).drop("id")

    solr_write_opts = {
        "zkhost": "aips-zk",
        "collection": collection,
        "gen_uniq_key": "true",
        "commit_within": "5000",
    }
    solr_writer = categorized_df.write.format("solr").options(**solr_write_opts)
    solr_writer.mode("overwrite").save()
    print("Status: Success")

In [5]:
collection="stackexchange"

# Single source of truth for the datasets and the fields made keyword-searchable,
# so the combined collection and the per-dataset collections cannot drift apart.
stackexchange_datasets = ["health", "cooking", "scifi", "outdoors", "travel", "devops"]
searchable_fields = ["title", "body"]

# 1) One combined collection containing every StackExchange dataset
create_collection(collection)

#Modify Schema to make some fields explicitly searchable by keyword
for field in searchable_fields:
    upsert_text_field(collection, field)

for dataset in stackexchange_datasets:
    index_stack_exchange_dataset(collection, dataset)

# 2) One dedicated collection per dataset, named after the dataset itself
for dataset in stackexchange_datasets:
    create_collection(dataset)
    for field in searchable_fields:
        upsert_text_field(dataset, field)
    index_stack_exchange_dataset(dataset, dataset)

Wiping 'stackexchange' collection
[('action', 'CREATE'), ('name', 'stackexchange'), ('numShards', 1), ('replicationFactor', 1)]
Creating 'stackexchange' collection
Status: Success
Adding 'title' field to collection
Status: Success
Adding 'body' field to collection
Status: Success
Loading 'health' dataset into collection 'stackexchange'...
Status: Success
Loading 'cooking' dataset into collection 'stackexchange'...
Status: Success
Loading 'scifi' dataset into collection 'stackexchange'...
Status: Success
Loading 'outdoors' dataset into collection 'stackexchange'...
Status: Success
Loading 'travel' dataset into collection 'stackexchange'...
Status: Success
Loading 'devops' dataset into collection 'stackexchange'...
Status: Success
Wiping 'health' collection
[('action', 'CREATE'), ('name', 'health'), ('numShards', 1), ('replicationFactor', 1)]
Creating 'health' collection
Status: Success
Adding 'title' field to collection
Status: Success
Adding 'body' field to collection
Status: Success
L

## Success!

Now that you've indexed several large text datasets, in the next notebook we will explore the rich graph of semantic relationships embedded within those documents by leveraging Semantic Knowledge Graphs for real-time traversal and ranking of arbitrary relationships within the domains of our datasets.

Up next: [Working with Semantic Knowledge Graphs](3.semantic-knowledge-graph.ipynb)