# Setting up the Knowledge Graph Datasets

In [None]:
import sys

sys.path.append('..')

from aips import get_engine
from aips.spark.dataframe import from_csv
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook under the application name
# "AIPS"; getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("AIPS")
    .getOrCreate()
)

# Search-engine handle from the aips package; the indexing cells below call
# engine.create_collection(...) on it.
engine = get_engine()

## Download the Datasets

In [None]:
#jobs
![ ! -d 'jobs' ] && git clone --depth=1 https://github.com/ai-powered-search/jobs.git
! cd jobs && git pull
! cd jobs && mkdir -p '../../data/jobs/' && tar -xvf jobs.tgz -C '../../data/jobs/'    

#health
![ ! -d 'health' ] && git clone --depth=1 https://github.com/ai-powered-search/health.git
! cd health && git pull
! cd health && mkdir -p '../../data/health/' && tar -xvf health.tgz -C '../../data/health/'

#scifi
![ ! -d 'scifi' ] && git clone --depth=1 https://github.com/ai-powered-search/scifi.git
! cd scifi && git pull
! cd scifi && mkdir -p '../../data/scifi/' && tar -xvf scifi.tgz -C '../../data/scifi/' 

#cooking
![ ! -d 'cooking' ] && git clone --depth=1 https://github.com/ai-powered-search/cooking.git
! cd cooking && git pull
! cd cooking && mkdir -p '../../data/cooking/' && tar -xvf cooking.tgz -C '../../data/cooking/'

#travel
![ ! -d 'travel' ] && git clone --depth=1 https://github.com/ai-powered-search/travel.git
! cd travel && git pull
! cd travel && mkdir -p '../../data/travel/' && tar -xvf travel.tgz -C '../../data/travel/'

#devops
![ ! -d 'devops' ] && git clone --depth=1 https://github.com/ai-powered-search/devops.git
! cd devops && git pull
! cd devops && mkdir -p '../../data/devops/' && tar -xvf devops.tgz -C '../../data/devops/'

## Index the Jobs Dataset into the Search Engine

In [None]:
# Create the "jobs" collection in the search engine.
jobs_collection = engine.create_collection("jobs")

# Load the extracted jobs CSV, passing the collection's name as the
# "category" option, and write the resulting dataframe into the collection.
jobs_dataframe = from_csv(
    "../data/jobs/jobs.csv",
    {"category": jobs_collection.name},
)
jobs_collection.write(jobs_dataframe)

## Index StackExchange datasets: health, scifi, cooking, travel, devops

In [None]:
# One combined collection that receives the posts of every dataset.
se_collection = engine.create_collection("stackexchange")

datasets = ["health", "cooking", "scifi", "travel", "devops"]
for dataset in datasets:
    posts_csv = "../data/{}/posts.csv".format(dataset)

    # First write: into the shared "stackexchange" collection.
    combined_posts = from_csv(posts_csv, {"category": dataset})
    se_collection.write(combined_posts)

    # Second write: into a collection named after the dataset itself.
    collection = engine.create_collection(dataset)
    dataset_posts = from_csv(posts_csv, {"category": dataset})
    collection.write(dataset_posts)

## Success!

Now that you've indexed several large text datasets, in the next notebook we will explore the rich graph of semantic relationships embedded within those documents by leveraging Semantic Knowledge Graphs for real-time traversal and ranking of arbitrary relationships within the domains of our datasets.

Up next: [Working with Semantic Knowledge Graphs](3.semantic-knowledge-graph.ipynb)