# Inverted Index using Spark

In [1]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, split, col, explode
sc = pyspark.SparkContext(appName="InvertedIndex")

In [2]:
sc

In [3]:
# This allows us to load csvs and text files easily with spark.read.csv(path_to_file)

spark = SparkSession.builder.getOrCreate()

In [4]:
!hdfs dfs -mkdir /stack

mkdir: `/stack': File exists


In [5]:
!sudo hdfs dfs -copyFromLocal QueryResults.csv /stack

In [6]:
!hdfs dfs -ls /stack

Found 1 items
-rw-r--r--   1 Administrators None    6443164 2022-06-26 17:55 /stack/QueryResults.csv


2022-06-26 18:42:24,786 WARN nativeio.NativeIO: NativeIO.getStat error (3): The system cannot find the path specified.
 -- file path: stack/QueryResults.csv


In [7]:
!hdfs dfs -head /stack/QueryResults.csv

Id,Tags,Title,CreationDate
"70546323","<html><notepad++><right-to-left>","Why does Notepad++ display question marks for a language other than English?","2022-01-01 00:01:24"
"70546338","<javascript><arrays><react-native>","Object Push into Array within For Loop/For Each Loop is causing duplicates","2022-01-01 00:05:55"
"70546410","<python><contextmanager>","Should the ""opening work"" of a context manager happen in __init__ or __enter__?","2022-01-01 00:31:32"
"70546446","<arrays><c><struct><embedded><game-boy-advance>","error: expected '=', ',', ';', 'asm' or '__attribute__' before '.' token","2022-01-01 00:41:51"
"70546448","<python><pip>","I've tried to use this python package I installed but I get this error","2022-01-01 00:42:23"
"70546468","<python><tensorflow><tensor>","tensorflow: convert a list of tensor to ragged tensor with a fixed dim in a certain axis","2022-01-01 00:49:24"
"70546484","<node.js><graph-theory><depth-first-search><directed-acyclic-graphs>","List all ""unique

In [8]:
overflow_csv = spark.read.option("header",True).csv('/stack/QueryResults.csv')
overflow_csv

DataFrame[Id: string, Tags: string, Title: string, CreationDate: string]

In [9]:
overflow_csv.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- CreationDate: string (nullable = true)



In [10]:
overflow_csv.take(2)

[Row(Id='70546323', Tags='<html><notepad++><right-to-left>', Title='Why does Notepad++ display question marks for a language other than English?', CreationDate='2022-01-01 00:01:24'),
 Row(Id='70546338', Tags='<javascript><arrays><react-native>', Title='Object Push into Array within For Loop/For Each Loop is causing duplicates', CreationDate='2022-01-01 00:05:55')]

In [11]:
inv_idx = overflow_csv.withColumn('Tags', regexp_replace('Tags', '<', '')) \
    .withColumn('Tags', split('Tags', '>')) \
    .withColumn('Tags', explode("Tags")) \
    .select('Tags', 'Id', 'Title') \
    .where(col('Tags') != '')

In [12]:
inv_idx.show()

+----------------+--------+--------------------+
|            Tags|      Id|               Title|
+----------------+--------+--------------------+
|            html|70546323|Why does Notepad+...|
|       notepad++|70546323|Why does Notepad+...|
|   right-to-left|70546323|Why does Notepad+...|
|      javascript|70546338|Object Push into ...|
|          arrays|70546338|Object Push into ...|
|    react-native|70546338|Object Push into ...|
|          python|70546410|"Should the ""ope...|
|  contextmanager|70546410|"Should the ""ope...|
|          arrays|70546446|error: expected '...|
|               c|70546446|error: expected '...|
|          struct|70546446|error: expected '...|
|        embedded|70546446|error: expected '...|
|game-boy-advance|70546446|error: expected '...|
|          python|70546448|I've tried to use...|
|             pip|70546448|I've tried to use...|
|          python|70546468|tensorflow: conve...|
|      tensorflow|70546468|tensorflow: conve...|
|          tensor|70

In [13]:
inv_idx.where(col('Tags') == 'html').show()

+----+--------+--------------------+
|Tags|      Id|               Title|
+----+--------+--------------------+
|html|70546323|Why does Notepad+...|
|html|70546530|How to change hei...|
|html|70546828|parsing urls usin...|
|html|70546837|Why does a defaul...|
|html|70546856|Chess Table Spaci...|
|html|70548298|Fixed header but ...|
|html|70548766|Get html table da...|
|html|70549018|How can i center ...|
|html|70549553|select only text ...|
|html|70550312|How to reuse a fu...|
|html|70550336|How to prevent br...|
|html|70550608|how to make js di...|
|html|70551072|How to center-ali...|
|html|70551271|CSS animation onl...|
|html|70557074|Is there a way to...|
|html|70557516|"Javascript docum...|
|html|70557826|"How i can use ""...|
|html|70557905|Change div style ...|
|html|70557992|Struggling with f...|
|html|70551566|Empty responsive ...|
+----+--------+--------------------+
only showing top 20 rows



In [14]:
inv_idx.where(col('Tags') == 'directed-acyclic-graphs').show()

+--------------------+--------+--------------------+
|                Tags|      Id|               Title|
+--------------------+--------+--------------------+
|directed-acyclic-...|70546484|"List all ""uniqu...|
|directed-acyclic-...|71095957|Can a snakemake r...|
|directed-acyclic-...|71131285|How to address a ...|
|directed-acyclic-...|71142612|Array of values a...|
+--------------------+--------+--------------------+



In [15]:
inv_idx.write.option('header', True).csv('/stack/InvertedIndex')