### Environment setup

In [1]:
pip install graphframes ipywidgets pyarrow pyvis==0.3.1 networkx matplotlib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Cell 2 — make the project modules importable, then import what the notebook uses
import sys
from pathlib import Path

# Adjust this to wherever the project modules live.
MODULE_DIR = Path.home() / "Final Project" / "modules"

# Guarded so that re-running this cell does not keep appending the same
# directory to sys.path.
if str(MODULE_DIR) not in sys.path:
    sys.path.append(str(MODULE_DIR))

import keyword_search_module as ksm
import publication_search_module as psm

# Bring the entry points called by later cells into the notebook namespace.
from keyword_search_module import initialize_spark, search_papers_widget, build_graph_widget, build_graph_from_keywords
from publication_search_module import build_publication_graph_widget

In [2]:
# Cell 3 — start Spark, ship the project modules to the executors, and give
# each search module its own working directory (PROJECT_ROOT).
# Relies on `Path`, `ksm`, `psm` and `initialize_spark` from the import cell.

# Initialize Spark through the project helper.
spark, sc = initialize_spark(driver_memory="6g", shuffle_partitions=32)

# All project paths hang off one directory under the user's home, matching how
# MODULE_DIR is located, so the notebook does not depend on a specific account
# name such as /home/jovyan.
PROJECT_DIR = Path.home() / "Final Project"
MODULE_DIR = PROJECT_DIR / "modules"

# Ship the modules to the executors.
MODULE_FILE = MODULE_DIR / "keyword_search_module.py"
MODULE_FILE2 = MODULE_DIR / "publication_search_module.py"
for module_file in (MODULE_FILE, MODULE_FILE2):
    sc.addPyFile(str(module_file))

# Keyword search: working directory plus the Spark checkpoint directory.
PROJECT_ROOT_KEYWORD_SEARCH = PROJECT_DIR / "keyword_search"
PROJECT_ROOT_KEYWORD_SEARCH.mkdir(exist_ok=True, parents=True)
sc.setCheckpointDir(str(PROJECT_ROOT_KEYWORD_SEARCH / "checkpoints"))
# Expose the directory to the module through the alias that already exists
# (`ksm`) instead of importing the module again under another name.
ksm.PROJECT_ROOT = PROJECT_ROOT_KEYWORD_SEARCH

# Publication search: working directory only. A SparkContext has a single
# checkpoint directory, so no separate one is set for this module.
PROJECT_ROOT_PUBLICATION_SEARCH = PROJECT_DIR / "publication_search"
PROJECT_ROOT_PUBLICATION_SEARCH.mkdir(exist_ok=True, parents=True)
psm.PROJECT_ROOT = PROJECT_ROOT_PUBLICATION_SEARCH


:: loading settings :: url = jar:file:/opt/conda/envs/bigdata/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-562d7bdd-f5a5-4601-8956-821fba7d89f5;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.1-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 551ms :: artifacts dl 14ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.1-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	--------------------------------

In [3]:
# Import and distribute the paper-ID search module.
# Relies on `Path` and `sc` from the earlier setup cells.

# Working directory for paper-ID search, located under the user's home like
# MODULE_DIR so the notebook does not depend on a specific account name.
PROJECT_ROOT_PAPER_ID_SEARCH = Path.home() / "Final Project" / "paper_id_search"
PROJECT_ROOT_PAPER_ID_SEARCH.mkdir(exist_ok=True, parents=True)

# NOTE: the checkpoint directory is a single SparkContext-wide setting, so this
# call replaces the directory configured in the Spark setup cell; checkpoints
# written from here on go under paper_id_search/.
sc.setCheckpointDir(str(PROJECT_ROOT_PAPER_ID_SEARCH / "checkpoints"))

# Ship the module to the executors.
MODULE_DIR = Path.home() / "Final Project" / "modules"
MODULE_FILE_PAPER_ID_SEARCH = MODULE_DIR / "paper_id_search_module.py"
sc.addPyFile(str(MODULE_FILE_PAPER_ID_SEARCH))

import paper_id_search_module as pidm
from paper_id_search_module import build_id_graph_widget

# Tell the module where its working directory is.
pidm.PROJECT_ROOT = PROJECT_ROOT_PAPER_ID_SEARCH


### Widget for keyword-based search

In [4]:
# Build and visualize the keyword-search graph via the module's widget UI.
# Needs `spark` and `sc` from the Spark setup cell.
build_graph_widget(spark, sc)

VBox(children=(Text(value='', description='Keywords:'), IntSlider(value=50, description='Max Papers:', min=10,…

### Widget for paper ID search

In [5]:
# Show the graph-builder widget for paper-ID search.
# Needs `spark` and `sc` from the Spark setup cell.
build_id_graph_widget(spark, sc)

VBox(children=(Text(value='', description='IDs:', placeholder='e.g. 649def34f8be52c8b66281af98ae884c09aef38b, …

### Widget for publication- and year-based search

In [6]:
# Build and visualize the graph via publication_search_module's widget UI.
# Needs `spark` and `sc` from the Spark setup cell.
build_publication_graph_widget(spark, sc)

VBox(children=(Text(value='', description='Publication:', placeholder='e.g. Nature'), IntText(value=2020, desc…