# Setting up the Knowledge Graph Datasets

In [1]:
# Notebook bootstrap: make the parent directory importable, load the shared
# helpers, and create the Spark session plus the `engine` object that the
# indexing cells below call.
import sys
sys.path.append('..')  # lets Python resolve modules located one directory up
from aips import *  # star import; presumably where get_engine() comes from -- TODO confirm
import os
from IPython.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, concat
# Reuse the active Spark session if one exists, otherwise start one named "AIPS".
spark = SparkSession.builder.appName("AIPS").getOrCreate()
engine = get_engine()
from dataframe_functions import *  # star import; presumably where from_csv() comes from -- TODO confirm

## Download the Datasets

In [2]:
#jobs
![ ! -d 'jobs' ] && git clone --depth=1 https://github.com/ai-powered-search/jobs.git
! cd jobs && git pull
! cd jobs && mkdir -p '../../data/jobs/' && tar -xvf jobs.tgz -C '../../data/jobs/'    

#health
![ ! -d 'health' ] && git clone --depth=1 https://github.com/ai-powered-search/health.git
! cd health && git pull
! cd health && mkdir -p '../../data/health/' && tar -xvf health.tgz -C '../../data/health/'

#scifi
![ ! -d 'scifi' ] && git clone --depth=1 https://github.com/ai-powered-search/scifi.git
! cd scifi && git pull
! cd scifi && mkdir -p '../../data/scifi/' && tar -xvf scifi.tgz -C '../../data/scifi/' 

#cooking
![ ! -d 'cooking' ] && git clone --depth=1 https://github.com/ai-powered-search/cooking.git
! cd cooking && git pull
! cd cooking && mkdir -p '../../data/cooking/' && tar -xvf cooking.tgz -C '../../data/cooking/'

#travel
![ ! -d 'travel' ] && git clone --depth=1 https://github.com/ai-powered-search/travel.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/travel/' && tar -xvf travel.tgz -C '../../data/travel/'

#devops
![ ! -d 'devops' ] && git clone --depth=1 https://github.com/ai-powered-search/devops.git
! cd devops && git pull
! cd devops && mkdir -p '../../data/devops/' && tar -xvf devops.tgz -C '../../data/devops/'

Cloning into 'jobs'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 4 (delta 0), reused 4 (delta 0), pack-reused 0[K
Receiving objects: 100% (4/4), 47.24 MiB | 9.27 MiB/s, done.
Already up to date.
._jobs.csv
jobs.csv
Cloning into 'health'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 4 (delta 0), reused 3 (delta 0), pack-reused 0[K
Receiving objects: 100% (4/4), 6.57 MiB | 9.30 MiB/s, done.
Already up to date.
._posts.csv
posts.csv
Cloning into 'scifi'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 4 (delta 0), reused 4 (delta 0), pack-reused 0[K
Receiving objects: 100% (4/4), 85.09 MiB | 10.38 MiB/s, done.
Already up to date.
._posts.csv
posts.csv
Cloning into 'c

## Index the Jobs Dataset into the Search Engine

In [3]:
# (Re)create the "jobs" collection, then load the extracted CSV into it.
# Every row is tagged with a "category" equal to the collection's own name.
jobs_collection = engine.create_collection("jobs")
jobs_dataframe = from_csv(
    "../data/jobs/jobs.csv",
    {"category": jobs_collection.name},
)
jobs_collection.write(jobs_dataframe)

Wiping "jobs" collection
Creating "jobs" collection
Status: Success
Loading jobs
jobs Schema: 
root
 |-- job_title: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- job_type: string (nullable = true)
 |-- category: string (nullable = false)
 |-- job_location: string (nullable = true)
 |-- job_city: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- job_country: string (nullable = true)
 |-- job_zip_code: string (nullable = true)
 |-- job_address: string (nullable = true)
 |-- min_salary: string (nullable = true)
 |-- max_salary: string (nullable = true)
 |-- salary_period: string (nullable = true)
 |-- apply_url: string (nullable = true)
 |-- apply_email: string (nullable = true)
 |-- num_employees: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_email: string (nullable = true)
 |-- company_website: string (nullable = true)
 |-- company_phone: string (nullable = tru

## Index StackExchange datasets: health, cooking, scifi, travel, devops

In [4]:
# One combined "stackexchange" collection receives the posts of every site.
se_collection = engine.create_collection("stackexchange")

datasets = ["health", "cooking", "scifi", "travel", "devops"]
for dataset in datasets:
    posts_csv = f"../data/{dataset}/posts.csv"
    # First write: add this site's posts to the combined collection,
    # tagged with the site name as their "category".
    se_collection.write(from_csv(posts_csv, {"category": dataset}))
    # Second write: the same posts also go into a collection named after the site.
    engine.create_collection(dataset).write(from_csv(posts_csv, {"category": dataset}))

Wiping "stackexchange" collection
Creating "stackexchange" collection
Status: Success
Loading stackexchange
stackexchange Schema: 
root
 |-- post_type_id: integer (nullable = true)
 |-- accepted_answer_id: integer (nullable = true)
 |-- parent_id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- deletion_date: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- body: string (nullable = true)
 |-- owner_user_id: integer (nullable = true)
 |-- owner_display_name: string (nullable = true)
 |-- last_editor_user_id: integer (nullable = true)
 |-- last_editor_display_name: string (nullable = true)
 |-- last_edit_date: timestamp (nullable = true)
 |-- last_activity_date: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- answer_count: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)

## Success!

Now that you've indexed several large text datasets, in the next notebook we will explore the rich graph of semantic relationships embedded within those documents by leveraging Semantic Knowledge Graphs for real-time traversal and ranking of arbitrary relationships within the domains of our datasets.

Up next: [Working with Semantic Knowledge Graphs](3.semantic-knowledge-graph.ipynb)