In [None]:
# Import necessary libraries for Spark and SQL functions
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Create a Spark session that reaches the S3-compatible store at http://minio:9000
# through the S3A connector.
# Credentials come from the environment so that real secrets never need to be saved
# in the notebook; the fallbacks are the values that used to be hardcoded here, so
# an environment without these variables behaves exactly as before.
s3a_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3a_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("Data Cleaning")
    # hadoop-aws supplies the s3a:// filesystem used by the read/write cells below
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.access.key", s3a_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    # address buckets as <endpoint>/<bucket>/... instead of <bucket>.<endpoint>/...
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

In [None]:
# Load the cleaned news dataset (Parquet) from the bucket's silver/ prefix
news_clean_path = 's3a://fnf-bucket/silver/news_data_clean'
df = spark.read.parquet(news_clean_path)

In [None]:
# Print the DataFrame's schema (column names, types and nullability) as a tree
df.printSchema()

root
 |-- No: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Article_title: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Article: string (nullable = true)
 |-- Lsa_summary: string (nullable = true)
 |-- Stock_symbol: string (nullable = true)



In [None]:
# Keep only the `No` and `Lsa_summary` columns for further processing
df_finbert = df.select("No", "Lsa_summary")

In [None]:
# Persist the selected columns to S3 as Snappy-compressed Parquet,
# replacing whatever is already stored at the target path
(
    df_finbert.write
    .mode('overwrite')
    .option('compression', 'snappy')
    .parquet('s3a://fnf-bucket/silver/data_for_finbert')
)

In [None]:
# Count the rows in df_finbert; the result is displayed as the cell output
df_finbert.count()

729144