## Data Ingestion

In [6]:
!pip install pyspark requests



In [12]:
import os
from datetime import datetime, timedelta

import pandas as pd
import requests
from pyspark.sql import SparkSession

# Spark session shared by every cell in this notebook.
spark = SparkSession.builder.appName("IEXCloudIngestion").getOrCreate()

# Ticker symbols to ingest.
tickers = ["AAPL", "MSFT", "AMZN", "GOOGL", "META", "NVDA", "TSLA", "INTC", "CRM", "ADBE"]

# IEX Cloud API token. Prefer supplying it through the IEX_API_KEY environment
# variable so the credential does not live in the notebook.
# NOTE(review): the literal fallback is kept only so existing runs keep
# working; it has been committed in plain text and should be rotated and
# removed.
API_KEY = os.environ.get("IEX_API_KEY", 'pk_e274e451b38f4d64b1c19dd3b1c0314c')

# Accumulates one PySpark DataFrame per (ticker, day) that returned data.
dfs = []

# URL template for intra-day data, filled positionally with
# (ticker, date as YYYYMMDD, token).
# NOTE: this assumes the API can serve minute data for a given date; the exact
# endpoint is hypothetical and may differ.
BASE_URL = "https://cloud.iexapis.com/stable/stock/{}/chart/date/{}?token={}"

# Upper bound, in seconds, on how long a single HTTP request may block.
# Without a timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30


def _get_json(session, url):
    """Return the decoded JSON body of a GET to *url*, or None on failure.

    Failure covers a network error or timeout, a non-200 status code, and a
    response body that is not valid JSON. The caller decides how to report it.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:  # body was not valid JSON
        return None


# A single Session is used for the whole run so connections to the API host
# can be reused across the many per-day requests.
with requests.Session() as session:
    for ticker in tickers:
        # First, fetch max history to get the date range for this ticker.
        max_data = _get_json(
            session,
            f"https://cloud.iexapis.com/stable/stock/{ticker}/chart/max?token={API_KEY}",
        )
        if max_data is None:
            print(f"Failed to fetch max history for {ticker}")
            continue
        if not max_data:
            # An empty history would make max_data[0] raise IndexError.
            print(f"No price history returned for {ticker}")
            continue

        # Extract the date range from the first and last records.
        # NOTE: this assumes each record carries a 'date' field formatted as
        # YYYY-MM-DD and that records are in chronological order; adjust to
        # the actual API response structure.
        start_date = datetime.strptime(max_data[0]['date'], '%Y-%m-%d')
        end_date = datetime.strptime(max_data[-1]['date'], '%Y-%m-%d')

        # Walk every calendar day from start_date to end_date inclusive.
        day_count = (end_date - start_date).days + 1
        for day_offset in range(day_count):
            current_date = start_date + timedelta(days=day_offset)

            # For each date, fetch minute-by-minute data.
            data_json = _get_json(
                session,
                BASE_URL.format(ticker, current_date.strftime('%Y%m%d'), API_KEY),
            )
            if data_json is None:
                print(f"Failed to fetch minute data for {ticker} on {current_date.strftime('%Y-%m-%d')}")
                continue
            if not data_json:
                # Successful response with no rows: nothing to add.
                continue

            # Convert to a pandas DataFrame and tag each row with its ticker.
            data_pd = pd.DataFrame(data_json)
            data_pd['Ticker'] = ticker

            # Convert to a PySpark DataFrame and keep it for the final union.
            data_spark = spark.createDataFrame(data_pd)
            dfs.append(data_spark)

# Combine all per-day DataFrames into a single PySpark DataFrame.
if not dfs:
    # Fail with an explicit message instead of an IndexError on dfs[0].
    raise RuntimeError("No data was fetched for any ticker; there is nothing to combine.")

# unionByName aligns columns by name, so frames whose JSON keys arrived in a
# different order are not silently misaligned as they would be with the
# positional union().
final_df = dfs[0]
for df in dfs[1:]:
    final_df = final_df.unionByName(df)

final_df.show()


KeyboardInterrupt: 

In [10]:
# Count the rows in the combined DataFrame and print the total.
num_rows = final_df.count()
print(num_rows)




40158


                                                                                

In [None]:
# Persist the combined data as CSV with a header row, replacing any output
# already present at "stock_data.csv".
csv_writer = final_df.write.mode("overwrite").option("header", True)
csv_writer.csv("stock_data.csv")
