In [None]:
# Import necessary libraries for Spark and DataFrame operations
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Create a Spark session configured for the S3-compatible object store.
# The access key, secret key and endpoint are read from the environment
# (MINIO_ACCESS_KEY, MINIO_SECRET_KEY, MINIO_ENDPOINT) so real secrets never
# have to be committed with the notebook. The fallbacks are the local
# development values this notebook previously hardcoded, so an unconfigured
# environment behaves exactly as before.
s3_access_key = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
s3_secret_key = os.environ.get("MINIO_SECRET_KEY", "minioadmin")
s3_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")

spark = (
    SparkSession.builder
    .appName("Data Cleaning")
    # hadoop-aws supplies the s3a:// filesystem used by every read/write below.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", s3_endpoint)
    # Path-style URLs (endpoint/bucket/key) instead of virtual-hosted buckets.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

In [None]:
# Show the application name followed by the session object, one per line
print(spark.sparkContext.appName, spark, sep="\n")

Data Cleaning
<pyspark.sql.session.SparkSession object at 0x79b02598cbd0>


In [None]:
# Keep a handle on the session's SparkContext; the next cells read its
# version, Python version and master URL.
sc = spark.sparkContext

In [None]:
# Display the Spark version reported by the SparkContext
# (the saved output of this cell shows '3.5.0').
sc.version

'3.5.0'

In [None]:
# Display the Python version string reported by the SparkContext
# (the saved output of this cell shows '3.11').
sc.pythonVer

'3.11'

In [None]:
# Display the master URL the SparkContext is connected to
# (the saved output shows 'local[*]').
sc.master

'local[*]'

In [None]:
# Import necessary types for DataFrame schema definition
from pyspark.sql.types import *

### News Data

In [None]:
# Schema for the news CSV: every column is read as a nullable string, in the
# order listed here. Typed casts are applied later in the notebook.
news_column_names = [
    "No",
    "Date",
    "Article_title",
    "Stock_symbol",
    "Url",
    "Publisher",
    "Author",
    "Article",
    "Lsa_summary",
    "Luhn_summary",
    "Textrank_summary",
    "Lexrank_summary",
]
data_schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in news_column_names]
)

In [None]:
# Location of the raw news CSV: bucket name, object key, and the s3a:// URI
# assembled from the two
bucket = "fnf-bucket"
object_path = "bronze/stock_news/nasdaq_exteral_data.csv"
s3a_path = "s3a://" + "/".join((bucket, object_path))

In [None]:
# Read the news CSV from S3 with the explicit all-string schema.
# - header: the first line holds column names
# - escape: a double quote is the escape character inside quoted fields
# - multiline: a quoted field may span several physical lines
news_reader = spark.read.schema(data_schema).options(
    header="true",
    escape="\"",
    multiline="true",
)
df = news_reader.csv(s3a_path)

In [None]:
# Print the schema of the loaded news DataFrame; every column should be a
# nullable string, as declared in data_schema.
df.printSchema()

root
 |-- No: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Article_title: string (nullable = true)
 |-- Stock_symbol: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- Author: string (nullable = true)
 |-- Article: string (nullable = true)
 |-- Lsa_summary: string (nullable = true)
 |-- Luhn_summary: string (nullable = true)
 |-- Textrank_summary: string (nullable = true)
 |-- Lexrank_summary: string (nullable = true)



In [None]:
# Drop the columns that are not needed downstream.
# Columns are passed by name rather than as Column objects: older PySpark
# releases only accept a list of names when dropping more than one column,
# while names work on every version.
unused_columns = [
    "Publisher",
    "Author",
    "Luhn_summary",
    "Textrank_summary",
    "Lexrank_summary",
]
df = df.drop(*unused_columns)

In [None]:
# Print the schema again to confirm that only the retained columns are left
df.printSchema()

root
 |-- No: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Article_title: string (nullable = true)
 |-- Stock_symbol: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Article: string (nullable = true)
 |-- Lsa_summary: string (nullable = true)



In [None]:
# Import additional libraries for data manipulation
import pandas as pd
import requests
from io import StringIO

In [None]:
# Wikipedia page listing the S&P 500 constituents, plus the request headers.
# A browser-like User-Agent is sent with the request
# (presumably because the default client User-Agent gets rejected — confirm).
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
}

In [None]:
# Install the lxml library for parsing HTML
! pip install lxml



In [None]:
# Fetch the S&P 500 constituents page and parse every HTML table on it.
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection, and raise_for_status() reports HTTP errors (e.g. 403/429) here
# instead of letting pandas fail later on an error page with no tables.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
tables = pd.read_html(StringIO(response.text))

In [None]:
# Take the first table parsed from the page.
# NOTE(review): assumes the constituents list is the first table on the page —
# confirm if the page layout changes.
sp500_table = tables[0]

In [None]:
# Collect the values of the 'Symbol' column into a plain Python list.
# NOTE(review): the symbols are used as-is by the isin() filter in the next
# cell; confirm they match the Stock_symbol format used in the news data.
stock_list = sp500_table['Symbol'].to_list()

In [None]:
# Keep only the news rows whose Stock_symbol appears in the S&P 500 list
sp500_df = df.filter(F.col('Stock_symbol').isin(stock_list))

In [None]:
# Convert the string columns to typed ones: "No" becomes an integer and
# "Date" is parsed into a date using the timestamp-with-zone pattern below.
no_as_integer = F.col("No").cast("integer")
date_parsed = F.to_date(F.col("Date"), "yyyy-MM-dd HH:mm:ss z")

sp500_df = sp500_df.withColumn("No", no_as_integer).withColumn("Date", date_parsed)

In [None]:
# Remove every row that has a null in any column
# (presumably this also removes rows whose No/Date could not be converted in
# the previous cell — confirm that is intended).
df_clean = sp500_df.na.drop()

In [None]:
# Print the schema of the cleaned DataFrame; No should now be an integer and
# Date a date.
df_clean.printSchema()

root
 |-- No: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Article_title: string (nullable = true)
 |-- Stock_symbol: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Article: string (nullable = true)
 |-- Lsa_summary: string (nullable = true)



In [None]:
# Persist the cleaned news to the silver layer as snappy-compressed Parquet,
# one partition directory per stock symbol; existing output is overwritten.
df_clean.write.parquet(
    's3a://fnf-bucket/silver/news_data_clean',
    mode='overwrite',
    partitionBy='Stock_symbol',
    compression='snappy',
)

### Stock Price Data

In [None]:
# Glob matching every CSV under the bronze full-history stock price prefix.
# The stock symbol is later derived from each file's name.
input_path = "s3a://fnf-bucket/bronze/stock_price/full_history/*.csv"

In [None]:
# Read all stock price CSVs matched by input_path; the first line of each file
# is the header and column types are inferred from the data.
df = spark.read.csv(input_path, header="true", inferSchema="true")

In [6]:
# Strip leading/trailing whitespace from every column name.
# Letter case is left unchanged; only surrounding whitespace is removed.
df = df.toDF(*(column_name.strip() for column_name in df.columns))

In [None]:
# Keep only the price columns and give each an explicit type:
# Date is parsed as a date, the five price columns become doubles and Volume a long.
price_columns = ["Open", "High", "Low", "Close", "Adj Close"]

df = df.select(
    F.to_date(F.col("Date"), "yyyy-MM-dd").alias("Date"),
    *[F.col(name).cast(DoubleType()).alias(name) for name in price_columns],
    F.col("Volume").cast(LongType()).alias("Volume"),
)

In [None]:
# Derive Stock_symbol from the source file of each row: take the last path
# segment without its ".csv" suffix and upper-case it.
file_stem = F.regexp_extract(F.input_file_name(), r"([^/]+)\.csv$", 1)
df = df.withColumn("Stock_symbol", F.upper(file_stem))

In [None]:
# Persist the stock prices to the silver layer as Parquet, one partition
# directory per stock symbol; existing output is overwritten.
df.write.parquet(
    "s3a://fnf-bucket/silver/stock_price_data",
    mode="overwrite",
    partitionBy="Stock_symbol",
)

In [None]:
# Stop the Spark session now that both datasets have been written
spark.stop()