# Setting up the Knowledge Graph Datasets

In [1]:
import sys
sys.path.append('..')
from aips import *
import os
from IPython.display import display,HTML
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, col, concat
# Obtain the Spark session for this notebook under the application name "AIPS";
# getOrCreate() reuses an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("AIPS")
    .getOrCreate()
)

# Handle used by the cells below to create and populate collections.
# NOTE(review): get_engine is not imported by name here, so it presumably
# comes from the `from aips import *` star import -- confirm.
engine = get_engine()

## Download the Datasets

In [2]:
#jobs
![ ! -d 'jobs' ] && git clone --depth=1 https://github.com/ai-powered-search/jobs.git
! cd jobs && git pull
! cd jobs && mkdir -p '../../data/jobs/' && tar -xvf jobs.tgz -C '../../data/jobs/'    

#health
![ ! -d 'health' ] && git clone --depth=1 https://github.com/ai-powered-search/health.git
! cd health && git pull
! cd health && mkdir -p '../../data/health/' && tar -xvf health.tgz -C '../../data/health/'

#scifi
![ ! -d 'scifi' ] && git clone --depth=1 https://github.com/ai-powered-search/scifi.git
! cd scifi && git pull
! cd scifi && mkdir -p '../../data/scifi/' && tar -xvf scifi.tgz -C '../../data/scifi/' 

#cooking
![ ! -d 'cooking' ] && git clone --depth=1 https://github.com/ai-powered-search/cooking.git
! cd cooking && git pull
! cd cooking && mkdir -p '../../data/cooking/' && tar -xvf cooking.tgz -C '../../data/cooking/'

#outdoors
![ ! -d 'outdoors' ] && git clone --depth=1 https://github.com/ai-powered-search/outdoors.git
! cd outdoors && git pull
! cd outdoors && cat outdoors.tgz.part* > outdoors.tgz
! cd outdoors && mkdir -p '../../data/outdoors/' && tar -xvf outdoors.tgz -C '../../data/outdoors/'

#outdoors
![ ! -d 'travel' ] && git clone --depth=1 https://github.com/ai-powered-search/travel.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/travel/' && tar -xvf travel.tgz -C '../../data/travel/'

#devops
![ ! -d 'devops' ] && git clone --depth=1 https://github.com/ai-powered-search/devops.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/devops/' && tar -xvf travel.tgz -C '../../data/devops/'

Already up to date.
._jobs.csv
jobs.csv
Already up to date.
._posts.csv
posts.csv
Already up to date.
._posts.csv
posts.csv
Already up to date.
._posts.csv
posts.csv
Already up to date.
README.md
concepts.pickle
._guesses.csv
guesses.csv
._guesses_all.json
guesses_all.json
outdoors_concepts.pickle
outdoors_embeddings.pickle
._outdoors_golden_answers.csv
outdoors_golden_answers.csv
._outdoors_golden_answers.xlsx
outdoors_golden_answers.xlsx
._outdoors_golden_answers_20210130.csv
outdoors_golden_answers_20210130.csv
outdoors_labels.pickle
outdoors_question_answering_contexts.json
outdoors_questionanswering_test_set.json
outdoors_questionanswering_train_set.json
._posts.csv
posts.csv
predicates.pickle
pull_aips_dependency.py
._question-answer-seed-contexts.csv
question-answer-seed-contexts.csv
question-answer-squad2-guesses.csv
._roberta-base-squad2-outdoors
roberta-base-squad2-outdoors/
roberta-base-squad2-outdoors/._tokenizer_config.json
roberta-base-squad2-outdoors/tokenizer_config.jso

## Index the Jobs Dataset into the Search Engine

In [3]:
# (Re)create the "jobs" collection and load the extracted jobs CSV into it.
jobs_collection = engine.create_collection("jobs")
# The second argument supplies an extra "category" value for the load. Pass the
# dataset name as a string rather than the collection object itself.
# NOTE(review): presumably write_from_csv adds this as a constant column on
# every row -- confirm against its implementation.
jobs_collection.write_from_csv("../data/jobs/jobs.csv", {"category": "jobs"})

Wiping 'jobs' collection
Status: Success
Creating 'jobs' collection
Status: Success
Loading jobs
jobs Schema: 
root
 |-- job_title: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- job_type: string (nullable = true)
 |-- category: string (nullable = false)
 |-- job_location: string (nullable = true)
 |-- job_city: string (nullable = true)
 |-- job_state: string (nullable = true)
 |-- job_country: string (nullable = true)
 |-- job_zip_code: string (nullable = true)
 |-- job_address: string (nullable = true)
 |-- min_salary: string (nullable = true)
 |-- max_salary: string (nullable = true)
 |-- salary_period: string (nullable = true)
 |-- apply_url: string (nullable = true)
 |-- apply_email: string (nullable = true)
 |-- num_employees: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_email: string (nullable = true)
 |-- company_website: string (nullable = true)
 |-- company_phone: string

## Index StackExchange datasets: health, scifi, cooking, outdoors, travel, devops

In [4]:
# Load every StackExchange dataset twice: once into the combined
# "stackexchange" collection and once into a collection named after the dataset.
se_collection = engine.create_collection("stackexchange")

collections = ["health", "cooking", "scifi", "outdoors", "travel", "devops"]
for collection_name in collections:
    file = f"../data/{collection_name}/posts.csv"
    # Use the dataset name as the "category" value; the loop variable is
    # `collection_name` (there is no `c` defined in this notebook).
    # NOTE(review): presumably write_from_csv adds this as a constant column on
    # every row -- confirm against its implementation.
    se_collection.write_from_csv(file, {"category": collection_name})
    collection = engine.create_collection(collection_name)
    collection.write_from_csv(file, {"category": collection_name})

Wiping 'stackexchange' collection
Status: Success
Creating 'stackexchange' collection
Status: Success
Loading stackexchange
stackexchange Schema: 
root
 |-- post_type_id: integer (nullable = true)
 |-- accepted_answer_id: integer (nullable = true)
 |-- parent_id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- deletion_date: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- body: string (nullable = true)
 |-- owner_user_id: integer (nullable = true)
 |-- owner_display_name: string (nullable = true)
 |-- last_editor_user_id: integer (nullable = true)
 |-- last_editor_display_name: string (nullable = true)
 |-- last_edit_date: timestamp (nullable = true)
 |-- last_activity_date: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- answer_count: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- favorite_count: integer (

## Success!

Now that you've indexed several large text datasets, in the next notebook we will explore the rich graph of semantic relationships embedded within those documents by leveraging Semantic Knowledge Graphs for real-time traversal and ranking of arbitrary relationships within the domains of our datasets.

Up next: [Working with Semantic Knowledge Graphs](3.semantic-knowledge-graph.ipynb)