In [0]:
from datetime import datetime as dt
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

class SimpleDataSource(DataSource):
    """
    A simple data source for PySpark that serves a small, fixed set of synthetic data.

    The source is addressed by the short name "simple" and exposes a nine-column
    schema of price bars (symbol, time, open/high/low/close, volume, trade_count,
    vwap). The rows themselves come from SimpleDataSourceReader, which yields
    five hard-coded AAPL rows.
    """

    @classmethod
    def name(cls) -> str:
        """Return the short format name used to refer to this data source."""
        return "simple"

    def schema(self) -> str:
        """Return the schema of the produced rows as a DDL-style column list."""
        return """
            symbol STRING,
            time TIMESTAMP,
            open FLOAT,
            high FLOAT,
            low FLOAT,
            close FLOAT,
            volume INT,
            trade_count INT,
            vwap FLOAT
        """

    def reader(self, schema: StructType) -> DataSourceReader:
        """
        Return the reader that produces this source's rows.

        The ``schema`` argument is not used: the reader always emits the same
        fixed tuples regardless of the schema passed in.
        """
        return SimpleDataSourceReader()

class SimpleDataSourceReader(DataSourceReader):
    """
    Reader that yields five hard-coded AAPL price bars (2021-01-04 .. 2021-01-08).

    Each row is a 9-tuple in the order
    (symbol, time, open, high, low, close, volume, trade_count, vwap).
    """

    def read(self, partition):
        """
        Yield the fixed sample rows.

        Args:
            partition: Ignored; the same five rows are produced for whatever
                partition object is passed in.

        Yields:
            tuple: (symbol: str, time: timezone-aware datetime in UTC,
            open: float, high: float, low: float, close: float,
            volume: int, trade_count: int, vwap: float).
        """
        # The timestamps use an explicit "+00:00" offset instead of a trailing
        # "Z": datetime.fromisoformat() only accepts the "Z" suffix from
        # Python 3.11 onward and raises ValueError on earlier interpreters.
        # Both spellings parse to the same UTC-aware datetime.
        test_bars = [
            ('AAPL', dt.fromisoformat('2021-01-04T05:00:00+00:00'), 133.52, 133.6116, 126.76, 129.41, 158211374, 1310229, 129.717982),
            ('AAPL', dt.fromisoformat('2021-01-05T05:00:00+00:00'), 128.89, 131.74, 128.43, 131.01, 105863439, 707583, 130.738233),
            ('AAPL', dt.fromisoformat('2021-01-06T05:00:00+00:00'), 127.72, 131.0499, 126.382, 126.66, 165568781, 1202579, 128.249403),
            ('AAPL', dt.fromisoformat('2021-01-07T05:00:00+00:00'), 128.36, 131.63, 127.86, 130.92, 118743769, 718362, 130.185457),
            ('AAPL', dt.fromisoformat('2021-01-08T05:00:00+00:00'), 132.43, 132.63, 130.23, 132.05, 112696090, 798393, 131.580087),
        ]
        yield from test_bars


In [0]:
from pyspark.sql import SparkSession

# Obtain a SparkSession for this notebook (getOrCreate reuses an existing
# session if one is available instead of building a second one).
spark = SparkSession.builder.getOrCreate()

# Register the custom data source class with this session so it can be
# selected by its short name, "simple", in spark.read.format(...).
spark.dataSource.register(SimpleDataSource)

In [0]:
# Load the registered "simple" data source into a DataFrame and print its rows.
spark.read.format("simple").load().show()