# [TEST] **sa_lr_pyspark_preprocessing:**

This test set focuses on functions related to data preprocessing in the sa_lr_pyspark_preprocessing module.  

The primary purpose of these tests is to verify that data cleaning and transformation operations are performed correctly using the functions provided in this module.  

Mocks are used to simulate Spark operations and ensure they are called correctly.  

Additionally, the test confirms that the result returned by `pre_process` is not `None`.

In [None]:
import pytest
from pyspark.sql import SparkSession
from mock import Mock

from preprocessing.sa_lr_pyspark_preprocessing import pre_process

In [None]:
@pytest.fixture
def spark_session():
    """Provide a local SparkSession for a test and stop it afterwards.

    The session is obtained with ``getOrCreate`` on a ``local[2]`` master
    under the application name ``"test"``.

    Yields:
        The SparkSession the test should use.
    """
    session = SparkSession.builder.master("local[2]").appName("test").getOrCreate()
    yield session
    # Teardown: release the session instead of leaving it running after the test.
    session.stop()

In [None]:
def test_pre_process(spark_session):
    """Check that ``pre_process`` calls the mocked stages and returns a result.

    Builds a two-row DataFrame with ``label`` and ``text`` columns, installs
    four mocks as attributes of ``pre_process`` for the duration of the call,
    and then asserts each mock was called exactly once with the expected
    ``inputCol``/``outputCol`` keyword arguments and that the result is not
    ``None``.

    Args:
        spark_session: SparkSession supplied by the ``spark_session`` fixture.
    """
    # Imported locally so the block does not rely on a new file-level import.
    from unittest.mock import patch

    # Create a test DataFrame
    rows = [
        (1, "This is a sample tweet."),
        (0, "Another tweet with numbers 123."),
    ]
    df = spark_session.createDataFrame(rows, ["label", "text"])

    # Mocks for the Spark functions used in pre_process
    mock_tokenizer = Mock()
    mock_stop_words_remover = Mock()
    mock_count_vectorizer = Mock()
    mock_idf = Mock()

    # Install the mocks only while pre_process runs; patch.multiple removes
    # them again on exit, so they cannot leak into other tests.
    # create=True is needed because the attributes may not exist beforehand.
    # NOTE(review): these attributes are set on the imported `pre_process`
    # object itself, so they only intercept the Spark stages if `pre_process`
    # looks them up there -- confirm against the module under test.
    with patch.multiple(
        pre_process,
        create=True,
        tokenizer=mock_tokenizer,
        remover=mock_stop_words_remover,
        count=mock_count_vectorizer,
        idf=mock_idf,
    ):
        # Call the pre_process function with the test DataFrame
        result = pre_process(df)

    # Verify that the Spark functions were called correctly (call records
    # remain available on the mocks after the patch has been undone).
    mock_tokenizer.assert_called_once_with(inputCol="text", outputCol="words")
    mock_stop_words_remover.assert_called_once_with(
        inputCol="words", outputCol="word_clean"
    )
    mock_count_vectorizer.assert_called_once_with(
        inputCol="word_clean", outputCol="rawFeatures"
    )
    mock_idf.assert_called_once_with(inputCol="rawFeatures", outputCol="features")

    # Verify that the result is not None
    assert result is not None

In [None]:
if __name__ == '__main__':
    # Propagate pytest's exit code so a failing run exits with a non-zero
    # status instead of always reporting success to the caller.
    raise SystemExit(pytest.main())