> Create a virtual environment.
> Example:
>
> 1. Create a new environment
>
>    ```
>    pyenv virtualenv 3.10.6 .envPyspark
>    pyenv activate .envPyspark
>    ```
>
> 2. Install the requirements
>    ```
>    pip install --upgrade pip 
>
>    pip install ipykernel pyspark==3.4.0
>    ```
>
> 3. Set it as the kernel for your notebook

# Create the spark session

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as F

# Create (or reuse) the Spark session first: `spark` must exist before its
# runtime configuration can be changed. Calling `spark.conf.set` ahead of
# this assignment raises NameError on a fresh kernel.
spark: SparkSession = (
    SparkSession.builder
        .appName('Visualizations')
        .getOrCreate()
    )

# Enable eager evaluation so that a DataFrame left as the last expression of
# a notebook cell is rendered as a table without an explicit `.show()`.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# Single-column DataFrame (`id`) holding the integers 0..9999.
df: DataFrame = (
    spark.range(10000)
)

# Basic visualization

In [45]:
# Summary statistics of `df` (count, mean, stddev, min, quartiles, max).
# The returned DataFrame is displayed as the cell output because eager
# evaluation is enabled on the session.
df.summary()

                                                                                

summary,id
count,10000.0
mean,4999.5
stddev,2886.8956799071675
min,0.0
25%,2499.0
50%,4999.0
75%,7499.0
max,9999.0


# Sample

## Sample unique

In [46]:
# Count how many times each id occurs in `df` and show only the ids that
# occur more than once; an empty table means every id is unique.
id_counts = df.groupBy('id').agg(F.count(F.col('id')).alias('qt'))
id_counts.filter("qt > 1").show()


+---+---+
| id| qt|
+---+---+
+---+---+



## Sample with duplicates

In [44]:
# Sample about 10% of `df` *with replacement* (fixed seed so the result is
# reproducible); with replacement the same id can be drawn more than once.
df_sample: DataFrame = df.sample(withReplacement=True, fraction=0.1, seed=42)

# Count how often each id was drawn and show the ids drawn more than once.
sample_counts = df_sample.groupBy('id').agg(F.count(F.col('id')).alias('qt'))
sample_counts.filter("qt > 1").show()

+----+---+
|  id| qt|
+----+---+
| 164|  2|
| 516|  2|
| 653|  2|
| 821|  2|
| 883|  2|
|1289|  2|
|1324|  2|
|1798|  2|
|1993|  2|
|2034|  2|
|2139|  2|
|2912|  2|
|3290|  2|
|3847|  2|
|3883|  2|
|4038|  2|
|4068|  2|
|4122|  3|
|4134|  2|
|4275|  2|
+----+---+
only showing top 20 rows



# References

1. https://spark.apache.org/docs/3.4.0/api/python/reference/index.html