# Tutorial

**Let's first create an example PySpark DataFrame**

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Start (or reuse) the Spark session used throughout this tutorial
spark = SparkSession.builder.appName("CreateDataFrame").getOrCreate()

# Column layout of the example DataFrame; every column is nullable
schema = (
    StructType()
    .add("primary_key", IntegerType(), nullable=True)
    .add("email", StringType(), nullable=True)
    .add("number", StringType(), nullable=True)
)

# Example rows: (primary_key, email, number)
data = [
    (1, "info@woonstadrotterdam.nl", "123"),
    (2, "infowoonstadrotterdam.nl", "01"),  # email has no "@"
    (3, "@woonstadrotterdam.nl", "-45"),  # email has nothing before "@"
    (4, "dev@woonstadrotterdam.nl", "1.0"),  # number in decimal notation
    (5, None, None),  # missing email and number
]

df = spark.createDataFrame(data=data, schema=schema)

# truncate=False prints full cell values instead of cutting them off
df.show(truncate=False)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/06/18 11:59:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
+-----------+-------------------------+------+
|primary_key|email                    |number|
+-----------+-------------------------+------+
|1          |info@woonstadrotterdam.nl|123   |
|2          |infowoonstadrotterdam.nl |01    |
|3          |@woonstadrotterdam.nl    |-45   |
|4          |dev@woonstadrotterdam.nl |1.0   |
|5          |null                     |null  |
+-----------+-------------------------+------+



                                                                                

**Import and initialize the `DataFrameTester`**

In [2]:
from testframework.dataquality import DataFrameTester

In [3]:
# Wrap the example DataFrame in a tester; rows are identified by the
# "primary_key" column.
# `spark` is optional. If not provided, a new Spark session will be created.
df_tester = DataFrameTester(df=df, primary_key="primary_key", spark=spark)

**Import configurable tests**

In [4]:
from testframework.dataquality.tests import IntegerString, RegexTest

**Run the `IntegerString` test on the _number_ column**

In [5]:
# Run the IntegerString test on the "number" column; nullable=True means
# null values count as passing.
number_check = df_tester.test(
    col="number",
    test=IntegerString(),
    nullable=True,
    description="Value in 'number' column could be converted to integer",
)
number_check.show()

+-----------+------+---------------------+
|primary_key|number|number__IntegerString|
+-----------+------+---------------------+
|          1|   123|                 true|
|          2|    01|                false|
|          3|   -45|                 true|
|          4|   1.0|                 true|
|          5|  null|                 true|
+-----------+------+---------------------+



**Initialize the `RegexTest` to test for valid email addresses**

In [6]:
# Pattern shape: local part, "@", first domain label, ".", rest of the domain
EMAIL_PATTERN = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"

# The name becomes the suffix of the result column (email__ValidEmail)
valid_email_test = RegexTest(name="ValidEmail", pattern=EMAIL_PATTERN)

**Run the `valid_email_test` on the _email_ column**

In [9]:
# description is optional, let's not define it for illustration purposes
email_check = df_tester.test(col="email", test=valid_email_test, nullable=True)
email_check.show(truncate=False)

+-----------+-------------------------+-----------------+
|primary_key|email                    |email__ValidEmail|
+-----------+-------------------------+-----------------+
|1          |info@woonstadrotterdam.nl|true             |
|2          |infowoonstadrotterdam.nl |false            |
|3          |@woonstadrotterdam.nl    |false            |
|4          |dev@woonstadrotterdam.nl |true             |
|5          |null                     |true             |
+-----------+-------------------------+-----------------+



**Let's take a look at the test results of the DataFrame using the `.results` attribute.**

In [10]:
# Show the outcome of every test run on df_tester so far: one boolean column
# per test next to the primary key. truncate=False prints full cell values.
df_tester.results.show(truncate=False)

+-----------+---------------------+-----------------+
|primary_key|number__IntegerString|email__ValidEmail|
+-----------+---------------------+-----------------+
|1          |true                 |true             |
|2          |false                |false            |
|3          |true                 |false            |
|4          |true                 |true             |
|5          |true                 |true             |
+-----------+---------------------+-----------------+



**We can use `.descriptions` or `.description_df` to get the descriptions of the tests.**

<br>
This can be useful for reporting purposes.   
For example, the descriptions can be used to create reports for the business with more detailed information than just the column name and the test name.

In [14]:
# Dict mapping each test result column to its description. The email test was
# run without a description, so its entry is just the result column name.
df_tester.descriptions

{'number__IntegerString': "Value in 'number' column could be converted to integer",
 'email__ValidEmail': 'email__ValidEmail'}

In [11]:
# The same descriptions as a Spark DataFrame with columns "test" and
# "description"; truncate=False prints the full description text.
df_tester.description_df.show(truncate=False)

+---------------------+------------------------------------------------------+
|test                 |description                                           |
+---------------------+------------------------------------------------------+
|number__IntegerString|Value in 'number' column could be converted to integer|
|email__ValidEmail    |email__ValidEmail                                     |
+---------------------+------------------------------------------------------+

