# Setting up the Knowledge Graph Datasets

In [7]:
import sys
# Add the parent directory to the module search path (presumably so the
# project-local `aips` package imported below can be found — TODO confirm).
sys.path.append('..')
# Star import: presumably the source of helpers such as create_collection and
# upsert_text_field used later in this notebook — TODO confirm against aips.
from aips import *
import os
from IPython.core.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
# Reuse the active Spark session if one exists, otherwise start one named "ch5".
spark = SparkSession.builder.appName("ch5").getOrCreate()

## Download the Datasets

In [None]:
#jobs
![ ! -d 'jobs' ] && git clone https://github.com/ai-powered-search/jobs.git
! cd jobs && git pull
! cd jobs && mkdir -p '../../data/jobs/' && tar -xvf jobs.tgz -C '../../data/jobs/'    

#health
![ ! -d 'health' ] && git clone https://github.com/ai-powered-search/health.git
! cd health && git pull
! cd health && mkdir -p '../../data/health/' && tar -xvf health.tgz -C '../../data/health/'

#scifi
![ ! -d 'scifi' ] && git clone https://github.com/ai-powered-search/scifi.git
! cd scifi && git pull
! cd scifi && mkdir -p '../../data/scifi/' && tar -xvf scifi.tgz -C '../../data/scifi/' 

#cooking
![ ! -d 'cooking' ] && git clone https://github.com/ai-powered-search/cooking.git
! cd cooking && git pull
! cd cooking && mkdir -p '../../data/cooking/' && tar -xvf cooking.tgz -C '../../data/cooking/'

#outdoors
![ ! -d 'outdoors' ] && git clone https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && mkdir -p '../../data/outdoors/' && tar -xvf outdoors.tgz -C '../../data/outdoors/'

#outdoors
![ ! -d 'travel' ] && git clone https://github.com/ai-powered-search/travel.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/travel/' && tar -xvf travel.tgz -C '../../data/travel/'

#devops
![ ! -d 'devops' ] && git clone https://github.com/ai-powered-search/devops.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/devops/' && tar -xvf travel.tgz -C '../../data/devops/'

## Index the Jobs Dataset into the Search Engine

In [10]:
# Set up the "jobs" collection in the search engine.
jobs_collection = "jobs"
create_collection(jobs_collection)

# Register the free-text columns as text fields so they are explicitly
# searchable by keyword.
for text_field in ("company_country", "job_description", "company_description"):
    upsert_text_field(jobs_collection, text_field)

print("Loading Jobs...")
csvFile = "../data/jobs/jobs.csv"

# Settings handed to the "solr" data source when writing the DataFrame.
jobs_update_opts = {
    "zkhost": "aips-zk",
    "collection": jobs_collection,
    "gen_uniq_key": "true",
    "commit_within": "5000",
}

# Read the CSV with a header row and inferred column types; quoted values may
# span multiple lines and use a doubled quote as the escape character.
csvDF = (
    spark.read.format("com.databricks.spark.csv")
    .options(
        header="true",
        inferSchema="true",
        charset="utf-8",
        quote="\"",
        escape="\"",
        multiLine="true",
        delimiter=",",
    )
    .load(csvFile)
)

# Send the rows to the search engine, then show the schema that was inferred.
jobs_writer = csvDF.write.format("solr").options(**jobs_update_opts).mode("overwrite")
jobs_writer.save()

print("Jobs Schema: ")
csvDF.printSchema()
print("Status: Success")

Wiping 'jobs' collection
Creating jobs' collection
Status: Success
Adding 'company_country' field to collection
Status: Success
Adding 'job_description' field to collection
Status: Success
Adding 'company_description' field to collection
Status: Success
Loading Jobs...
Jobs Schema: 
root
 |-- job_title: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- job_type: string (nullable = true)
 |-- category: string (nullable = true)
 |-- job_location: string (nullable = true)
 |-- job_city: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- job_country: string (nullable = true)
 |-- job_zip_code: string (nullable = true)
 |-- job_address: string (nullable = true)
 |-- min_salary: string (nullable = true)
 |-- max_salary: string (nullable = true)
 |-- salary_period: string (nullable = true)
 |-- apply_url: string (nullable = true)
 |-- apply_email: string (nullable = true)
 |-- num_employees: string (nullable = true)
 |-- industry: string (nullabl

## Index StackExchange datasets: health, scifi, cooking, outdoors, travel, devops

In [3]:
def index_stack_exchange_dataset(collection,dataset):
    """Load one StackExchange posts CSV and write it to a search collection.

    Reads ``../data/<dataset>/posts.csv`` through the module-level ``spark``
    session, adds a constant ``category`` column holding the dataset name, and
    writes the result through the "solr" data source.

    Args:
        collection: Name of the target collection.
        dataset: Dataset name; selects the input directory and is stored as
            the value of the ``category`` column on every row.
    """
    print("Loading '" + dataset + "' Dataset...")
    posts_path = "../data/" + dataset + "/posts.csv"

    # Settings for the "solr" writer.
    solr_options = {
        "zkhost": "aips-zk",
        "collection": collection,
        "gen_uniq_key": "true",
        "commit_within": "5000",
    }

    # CSV parsing: header row, inferred types, quoted values that may span
    # multiple lines with a doubled quote as the escape character.
    reader_options = {
        "header": "true",
        "inferSchema": "true",
        "charset": "utf-8",
        "quote": "\"",
        "escape": "\"",
        "multiLine": "true",
        "delimiter": ",",
    }

    posts_df = (
        spark.read.format("com.databricks.spark.csv")
        .options(**reader_options)
        .load(posts_path)
    )

    # Tag every row with its dataset of origin.
    tagged_posts_df = posts_df.withColumn("category", lit(dataset))

    writer = tagged_posts_df.write.format("solr").options(**solr_options).mode("overwrite")
    writer.save()
    print("Status: Success")

In [8]:
# Combined collection that receives the posts of every StackExchange dataset.
collection = "stackexchange"
create_collection(collection)

#Modify Schema to make some fields explicitly searchable by keyword
upsert_text_field(collection, "title")
upsert_text_field(collection, "body")

# Single source of truth for the datasets to index. Driving both passes from
# this list replaces six hand-copied stanzas and keeps each collection paired
# with its own dataset.
stackexchange_datasets = ["health", "cooking", "scifi", "outdoors", "travel", "devops"]

# Pass 1: load every dataset into the combined collection.
for dataset_name in stackexchange_datasets:
    index_stack_exchange_dataset(collection, dataset_name)

# Pass 2: give each dataset a collection of its own, named after the dataset,
# with the same searchable text fields.
for dataset_name in stackexchange_datasets:
    create_collection(dataset_name)
    upsert_text_field(dataset_name, "title")
    upsert_text_field(dataset_name, "body")
    index_stack_exchange_dataset(dataset_name, dataset_name)

Wiping 'stackexchange' collection
Creating stackexchange' collection
Status: Success
Adding 'title' field to collection
Status: Success
Adding 'body' field to collection
Status: Success
Loading 'health' Dataset...
Status: Success
Loading 'cooking' Dataset...
Status: Success
Loading 'scifi' Dataset...
Status: Success
Loading 'outdoors' Dataset...
Status: Success
Loading 'travel' Dataset...
Status: Success
Loading 'devops' Dataset...
Status: Success
Wiping 'health' collection
Creating health' collection
Status: Success
Adding 'title' field to collection
Status: Success
Adding 'body' field to collection
Status: Success
Loading 'health' Dataset...
Status: Success
Wiping 'cooking' collection
Creating cooking' collection
Status: Success
Adding 'title' field to collection
Status: Success
Adding 'body' field to collection
Status: Success
Loading 'cooking' Dataset...
Status: Success
Wiping 'scifi' collection
Creating scifi' collection
Status: Success
Adding 'title' field to collection
Status: S

## Success!

Now that you've indexed several large text datasets, in the next notebook we will explore the rich graph of semantic relationships embedded within those documents by leveraging Semantic Knowledge Graphs for real-time traversal and ranking of arbitrary relationships within the domains of our datasets.

Up next: [Working with Semantic Knowledge Graphs](3.semantic-knowledge-graph.ipynb)