### Large Data Ingestion

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("BigDataAnalyticsPipeline")
    .getOrCreate()
)

# Define the base directory for the project
base_dir = "./Project"


def collect_json_text_files(root_dir):
    """Return the sorted paths of candidate JSON data files under ``root_dir``.

    The data files are assumed (per the original notebook comment) to be JSON
    documents stored without a ``.json`` extension, so any file whose name
    ends in ``.json`` is skipped. Hidden files and hidden directories (names
    starting with ".", e.g. ``.DS_Store`` or ``.ipynb_checkpoints``) are
    skipped as well, because they are never data files and would otherwise be
    handed to the JSON reader.

    Args:
        root_dir: Directory to scan recursively.

    Returns:
        A sorted list of file paths. The list is empty when ``root_dir`` does
        not exist or contains no matching files.
    """
    collected = []
    for root, dirs, files in os.walk(root_dir):
        # Prune hidden directories in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if not d.startswith(".")]
        for file in files:
            if file.startswith(".") or file.endswith(".json"):
                continue
            collected.append(os.path.join(root, file))
    # os.walk order depends on the filesystem; sort for a reproducible list.
    return sorted(collected)


# Gather paths to all text files that are actually JSON
file_paths = collect_json_text_files(base_dir)

# Load all JSON text files into a single Spark DataFrame.
# Fail fast with a clear message when the directory scan found nothing
# (e.g. a mistyped base_dir) instead of handing Spark an empty path list.
if not file_paths:
    raise FileNotFoundError(f"No JSON text files found under {base_dir!r}")

# multiLine=True makes Spark parse each file as a JSON document that may span
# several lines, rather than expecting one JSON record per line.
df = spark.read.json(file_paths, multiLine=True)

# Display schema and sample records to verify

df.printSchema()
df.show(5)


24/11/09 16:20:17 WARN Utils: Your hostname, Thanadetchs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.204.73.83 instead (on interface en0)
24/11/09 16:20:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/09 16:20:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/09 16:20:34 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/11/09 16:21:34 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


root
 |-- abstracts-retrieval-response: struct (nullable = true)
 |    |-- affiliation: string (nullable = true)
 |    |-- authkeywords: struct (nullable = true)
 |    |    |-- author-keyword: string (nullable = true)
 |    |-- authors: struct (nullable = true)
 |    |    |-- author: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- @_fa: string (nullable = true)
 |    |    |    |    |-- @auid: string (nullable = true)
 |    |    |    |    |-- @seq: string (nullable = true)
 |    |    |    |    |-- affiliation: string (nullable = true)
 |    |    |    |    |-- author-url: string (nullable = true)
 |    |    |    |    |-- ce:alias: string (nullable = true)
 |    |    |    |    |-- ce:alt-name: string (nullable = true)
 |    |    |    |    |-- ce:degrees: string (nullable = true)
 |    |    |    |    |-- ce:given-name: string (nullable = true)
 |    |    |    |    |-- ce:indexed-name: string (nullable = true)
 |    |    |    |    |

                                                                                

+----------------------------+
|abstracts-retrieval-response|
+----------------------------+
|        {[{"affiliation-c...|
|        {[{"affiliation-c...|
|        {[{"affiliation-c...|
|        {[{"affiliation-c...|
|        {[{"affiliation-c...|
+----------------------------+
only showing top 5 rows



### Data Transformation

In [5]:

# Intended pipeline (from the original note): Pandas -> Spark -> export csv -> power BI
# Project the nested citation title out of the response struct into a flat,
# single-column DataFrame.
citation_title_path = "abstracts-retrieval-response.item.bibrecord.head.citation-title"
citation_title = col(citation_title_path).alias("citation-title")
selected_df = df.select(citation_title)
selected_df.show(5)

+--------------------+
|      citation-title|
+--------------------+
|Guidelines for th...|
|A multi-dimension...|
|Measurement of el...|
|Evidence for WW p...|
|Search for a heav...|
+--------------------+
only showing top 5 rows

