**INSTALL PYSPARK**

In [1]:
! pip install pyspark
! pip install requests
! pip install confluent_kafka



**START SPARK SESSION**

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("StockDataProject") \
    .getOrCreate()

**API REQUEST - BATCH PROCESSING**

In [3]:
import requests
import json

API_KEY = "C0YYA5Q74WFXPC45"
SYMBOL = "IBM"
INTERVAL = "30min"
url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={SYMBOL}&interval={INTERVAL}&apikey={API_KEY}'

response = requests.get(url)
data = response.json()

print(data)
time_series_key = f"Time Series ({INTERVAL})"
if time_series_key not in data:
    print("Error: Invalid response or requests limit exceeded.")
else:
    print("Success!")

records = []
for timestamp, values in data[time_series_key].items():
    records.append((
        SYMBOL,
        timestamp,
        float(values["1. open"]),
        float(values["2. high"]),
        float(values["3. low"]),
        float(values["4. close"]),
        int(values["5. volume"])
    ))

columns = ["symbol", "timestamp", "open", "high", "low", "close", "volume"]
df = spark.createDataFrame(records, schema=columns)
df.show()

{'Meta Data': {'1. Information': 'Intraday (30min) open, high, low, close prices and volume', '2. Symbol': 'IBM', '3. Last Refreshed': '2025-02-05 19:30:00', '4. Interval': '30min', '5. Output Size': 'Compact', '6. Time Zone': 'US/Eastern'}, 'Time Series (30min)': {'2025-02-05 19:30:00': {'1. open': '263.5000', '2. high': '265.0000', '3. low': '262.7500', '4. close': '264.2000', '5. volume': '635'}, '2025-02-05 19:00:00': {'1. open': '263.3000', '2. high': '265.0000', '3. low': '263.1300', '4. close': '263.1300', '5. volume': '1846788'}, '2025-02-05 18:30:00': {'1. open': '263.3000', '2. high': '263.7900', '3. low': '263.3000', '4. close': '263.3000', '5. volume': '1846601'}, '2025-02-05 18:00:00': {'1. open': '263.5700', '2. high': '263.8000', '3. low': '262.7500', '4. close': '263.8000', '5. volume': '262'}, '2025-02-05 17:30:00': {'1. open': '263.8000', '2. high': '263.8000', '3. low': '263.0100', '4. close': '263.2600', '5. volume': '750'}, '2025-02-05 17:00:00': {'1. open': '262.9

**SEND DATA TO KAFKA**

In [4]:
from confluent_kafka import Producer
import json

producer_config = {
    'bootstrap.servers': 'localhost:9092',
    'client.id': 'api-producer'
}
producer = Producer(producer_config)

topic_name = 'stock_data_topic'

for row in df.collect():
    record = {
        "symbol": row["symbol"],
        "timestamp": row["timestamp"],
        "open": row["open"],
        "high": row["high"],
        "low": row["low"],
        "close": row["close"],
        "volume": row["volume"]
    }
    producer.produce(topic_name, value=json.dumps(record))

producer.flush()
print("Data sent to Kafka!")

Data sent to Kafka!


**CONSUME DATA FROM KAKFA USING PYSPARK**

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType

schema = StructType([
    StructField("symbol", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("open", DoubleType()),
    StructField("high", DoubleType()),
    StructField("low", DoubleType()),
    StructField("close", DoubleType()),
    StructField("volume", IntegerType())
])

kafka_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "stock_data_topic") \
    .load() \
    .selectExpr("CAST(value AS STRING)") \
    .select(from_json("value", schema).alias("data")) \
    .select("data.*")

def write_to_postgres(df, epoch_id):
    df.write \
        .format("jdbc") \
        .option("url", "jdbc:postgresql://localhost:5432/mydatabase") \
        .option("dbtable", "stock_data") \
        .option("user", "root") \
        .option("password", "root") \
        .mode("append") \
        .save()

query = kafka_df.writeStream \
    .foreachBatch(write_to_postgres) \
    .start()

query.awaitTermination()

AnalysisException: Failed to find data source: kafka. Please deploy the application as per the deployment section of Structured Streaming + Kafka Integration Guide.

**QUERY DATA**

In [48]:
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/mydatabase") \
    .option("dbtable", "stock_data") \
    .option("user", "root") \
    .option("password", "root") \
    .load()

df.show(10)

Inserido: 2025-01-28 19:30
Inserido: 2025-01-28 19:00
Inserido: 2025-01-28 18:30
Inserido: 2025-01-28 18:00
Inserido: 2025-01-28 17:30
Inserido: 2025-01-28 17:00
Inserido: 2025-01-28 16:30
Inserido: 2025-01-28 16:00
Inserido: 2025-01-28 15:30
Inserido: 2025-01-28 15:00
Inserido: 2025-01-28 14:30
Inserido: 2025-01-28 14:00
Inserido: 2025-01-28 13:30
Inserido: 2025-01-28 13:00
Inserido: 2025-01-28 12:30
Inserido: 2025-01-28 12:00
Inserido: 2025-01-28 11:30
Inserido: 2025-01-28 11:00
Inserido: 2025-01-28 10:30
Inserido: 2025-01-28 10:00
Inserido: 2025-01-28 09:30
Inserido: 2025-01-28 09:00
Inserido: 2025-01-28 08:30
Inserido: 2025-01-28 08:00
Inserido: 2025-01-28 07:30
Inserido: 2025-01-28 07:00
Inserido: 2025-01-28 06:30
Inserido: 2025-01-28 06:00
Inserido: 2025-01-28 05:30
Inserido: 2025-01-28 05:00
Inserido: 2025-01-28 04:30
Inserido: 2025-01-28 04:00
Inserido: 2025-01-27 19:30
Inserido: 2025-01-27 19:00
Inserido: 2025-01-27 18:30
Inserido: 2025-01-27 18:00
Inserido: 2025-01-27 17:30
I

**FINISH SPARK SESSION**

In [55]:
spark.stop()

shape: (5, 8)
┌─────┬────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬────────┐
│ id  ┆ symbol ┆ timestamp    ┆ open         ┆ high         ┆ low          ┆ close        ┆ volume │
│ --- ┆ ---    ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ ---    │
│ i64 ┆ str    ┆ datetime[μs] ┆ decimal[*,4] ┆ decimal[*,4] ┆ decimal[*,4] ┆ decimal[*,4] ┆ i64    │
╞═════╪════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════════╪════════╡
│ 800 ┆ IBM    ┆ 2025-01-23   ┆ 225.7500     ┆ 225.9800     ┆ 225.7500     ┆ 225.9800     ┆ 53     │
│     ┆        ┆ 18:00:00     ┆              ┆              ┆              ┆              ┆        │
│ 799 ┆ IBM    ┆ 2025-01-23   ┆ 226.0400     ┆ 226.0400     ┆ 225.7700     ┆ 225.7700     ┆ 690546 │
│     ┆        ┆ 18:30:00     ┆              ┆              ┆              ┆              ┆        │
│ 798 ┆ IBM    ┆ 2025-01-23   ┆ 226.0400     ┆ 226.0400     ┆ 225.8300     ┆ 

  df = pl.DataFrame(rows, schema=columns)
