In [1]:
# Doing an initial test of Spark to make sure it works.
# findspark.init() is deliberately called BEFORE `import pyspark`; presumably it
# locates the local Spark installation and makes pyspark importable -- confirm
# against the findspark documentation before reordering these lines.
import findspark
findspark.init()
import pyspark

In [2]:
# Showing the installed PySpark version as a quick sanity check of the import.
pyspark.__version__

'3.5.0'

In [3]:
#Getting all files in the directory
import os
import re

# Collecting the full path of every regular file in the data directory.
# - os.path.join does not depend on `path` ending with a separator.
# - The isfile filter keeps sub-directories out of the list, since the later
#   open() calls would fail on them.
# - os.listdir returns entries in arbitrary order, so the list is sorted to make
#   the processing order reproducible between runs.
path = 'data2/data/fulldata/'
files = sorted(
    os.path.join(path, name)
    for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name))
)

In [4]:
#Removing all \t from the files
# Every file is rewritten IN PLACE as a single "docId<TAB>text" line: the first
# whitespace-separated token is kept as the docId and all remaining tokens are
# joined with single spaces (so any tabs inside the text disappear).
for f in files:
    with open(f, 'r', encoding="utf8") as file:
        # Reading the whole file rather than only its first line: the file is
        # overwritten below, so anything not read here would be lost.
        tokens = file.read().split()
    if not tokens:
        # Empty or whitespace-only file: there is no docId to keep, leave it untouched.
        continue
    lines = tokens[0] + "\t" + ' '.join(tokens[1:])
    with open(f, 'w', encoding="utf8") as file:
        file.write(lines)

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode, col, concat_ws, collect_list
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

# Building the Spark session used by the rest of the notebook
# (getOrCreate hands back an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("InvertedIndex")
    .getOrCreate()
)

In [6]:
# Displaying the session object to confirm it was created.
spark

In [7]:
# Loading every input file as an RDD of raw text lines (one element per line).
# Spark evaluates lazily: nothing is actually read from disk until an action
# runs further down, so the message below only confirms the read was set up.
data = spark.read.text(files).rdd.map(lambda row: row.value)
print("Data read successfully!!!")

def clean_and_tokenize(text):
    """Split raw text into a list of lowercase, letters-only words.

    Every maximal run of ASCII letters (a-z, A-Z) is one word; tabs, digits,
    punctuation and any other character act as separators and are discarded.

    Args:
        text: The raw document text, or None (a Spark UDF receives None for
            null cells).

    Returns:
        A list of lowercase words in order of appearance; an empty list when
        `text` is None or contains no letters.
    """
    if text is None:
        # Null cell: return "no words" instead of raising TypeError inside the UDF.
        return []

    # Words are matched on the original text and lowercased afterwards, so only
    # characters that were ASCII letters to begin with can end up in the result.
    return [word.lower() for word in re.findall(r'[a-zA-Z]+', text)]

# Defining a UDF for the cleaning and tokenizing function
clean_and_tokenize_udf = udf(clean_and_tokenize, ArrayType(StringType()))

# Split the data into (docId, text) pairs.
# Splitting only on the FIRST tab keeps any further tabs inside the text instead
# of cutting the text off there, and lines without a tab (e.g. blank lines) are
# skipped instead of failing the whole job with an IndexError.
split_data = (
    data.filter(lambda line: '\t' in line)
        .map(lambda line: tuple(line.split('\t', 1)))
)

# Creating a Spark DataFrame
schema = StructType([
    StructField("docId", StringType(), True),
    StructField("text", StringType(), True),
])
df = spark.createDataFrame(split_data, schema=schema)

# Applying the cleaning and tokenizing UDF to the 'text' column
df = df.withColumn("words", clean_and_tokenize_udf(df["text"]))
df_exploded = df.select("docId", "words").withColumn("word", explode("words"))

# Group by word, docId, and count the occurrences
inverted_index = df_exploded.groupBy("word", "docId").count()
inverted_index = inverted_index.withColumn("count", col("count").cast("string"))
inverted_index = inverted_index.withColumn("output", concat_ws(":", "docId", "count"))

# Group by word and collect list of docId:count values
# NOTE(review): nothing here sorts the rows or the collected lists, so the order
# of the output is not guaranteed -- add an explicit orderBy("word") if the
# index file must be sorted.
result = inverted_index.groupBy("word").agg(collect_list("output").alias("output_list"))

# One text line per word: "word docId:count docId:count ..."
df_text = result.select(concat_ws(" ", *result.columns).alias("text"))

# Collecting the index to the driver once. The number of distinct words is taken
# from the collected frame instead of result.count(), which would run the whole
# pipeline a second time just to produce the same number.
tmp = df_text.toPandas()
print(len(tmp))
print(tmp.head())

Data read successfullly!!!
12623
                                                text
0  a 5722018444:28 5722018440:44 5722018443:33 57...
1                                    aa 5722018435:1
2                                  aacr 5722018445:1
3                                 aacsb 5722018483:2
4                                  aamc 5722018479:3


In [8]:
import csv
# Writing the index as plain text, one "word docId:count ..." entry per line.
# The previous DataFrame.to_csv call used sep=' ' with quoting=3 (csv.QUOTE_NONE)
# and escapechar=' ', which makes the csv writer escape every space inside an
# entry with another space -- i.e. all separators in the file were doubled.
# Writing the lines directly keeps single spaces.
with open('unigram_index.txt', 'w', encoding='utf-8') as index_file:
    index_file.writelines(entry + '\n' for entry in tmp['text'])

In [9]:
# Shutting down the Spark session now that the index has been written.
spark.stop()