## Setting Environment Variables

In [1]:
import os
import sys
# Make PySpark use the same Python interpreter that is running this kernel,
# so the interpreter PySpark launches matches the notebook's environment.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

## DataFrame

In [2]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start a new one
# registered under the application name "Sample".
spark = (
    SparkSession.builder
    .appName("Sample")
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version in use.
spark.version

'3.4.2'

### Sample

In [81]:
# Single-column DataFrame ("id") holding the integers 100 through 299
# (the end of the range is exclusive).
df = spark.range(100, 300)

# Draw roughly 10% of the rows without replacement. The fraction is a
# per-row probability, so the sample size is approximate; the fixed seed
# makes the draw repeatable. show() prints at most 20 rows by default.
df.sample(fraction=0.1, seed=123).show()

+---+
| id|
+---+
|144|
|160|
|164|
|165|
|166|
|176|
|179|
|183|
|193|
|199|
|213|
|219|
|234|
|259|
|263|
|268|
|272|
|275|
|287|
|289|
+---+



In [None]:
# No seed is passed here, so each run may draw a different sample. The
# fraction is not an exact quota either, so the printed count is only
# expected to be close to 10% of the rows in df.
sampled_df = df.sample(fraction=0.1)
count = sampled_df.count()
print(count)

#### With Replacement (Duplicates Possible)

In [54]:
# withReplacement=True lets the same row be drawn more than once, so the
# result may contain duplicate ids.
# NOTE(review): the saved output lists ids below 100 although df is defined
# as spark.range(100, 300); it looks stale - re-run after a kernel restart.
df.sample(withReplacement=True, fraction=0.1, seed=123).collect()

[Row(id=6),
 Row(id=27),
 Row(id=37),
 Row(id=46),
 Row(id=50),
 Row(id=60),
 Row(id=65),
 Row(id=72),
 Row(id=81),
 Row(id=91),
 Row(id=92),
 Row(id=92)]

#### Without Replacement (Each Row Drawn at Most Once)

In [56]:
# withReplacement=False draws each row at most once; because the ids in df
# are unique, the sampled ids are unique as well.
# NOTE(review): the saved output lists ids below 100 although df is defined
# as spark.range(100, 300); it looks stale - re-run after a kernel restart.
df.sample(withReplacement=False, fraction=0.1, seed=123).collect()

[Row(id=35),
 Row(id=38),
 Row(id=41),
 Row(id=45),
 Row(id=71),
 Row(id=84),
 Row(id=87),
 Row(id=99)]

### sampleBy()

In [71]:
# Derive a stratum label from each id: key = id mod 3, i.e. 0, 1 or 2.
df2 = df.select((df["id"] % 3).alias("key"))

# Stratified sample: keep about 10% of the key=0 rows and 20% of the key=1
# rows. key=2 is absent from the fractions dict, so it is sampled at
# fraction 0 and never appears in the result.
print(
    df2.sampleBy(col="key", fractions={0: 0.1, 1: 0.2}, seed=0).collect()
)

[Row(key=0), Row(key=0), Row(key=1), Row(key=1), Row(key=0), Row(key=1), Row(key=0), Row(key=1), Row(key=0), Row(key=0), Row(key=1), Row(key=1), Row(key=0)]


## RDD

In [74]:
# Build an RDD of the integers 100 through 199 (end is exclusive) from the
# SparkContext behind the existing session.
rdd = spark.sparkContext.range(start=100, end=200)

### Sample

In [76]:
# Sample with replacement: fraction=0.3 is the expected number of times each
# element is picked, so the result size is approximate and values can repeat.
rdd.sample(withReplacement=True, fraction=0.3, seed=123).collect()

[100,
 111,
 116,
 118,
 119,
 123,
 123,
 124,
 126,
 126,
 127,
 129,
 135,
 138,
 147,
 149,
 154,
 154,
 155,
 161,
 161,
 166,
 168,
 181,
 181,
 182,
 185,
 197,
 199]

### takeSample()

In [80]:
# takeSample returns a plain Python list of exactly `num` elements to the
# driver (unlike sample(), which returns another RDD of approximate size).
print(rdd.takeSample(withReplacement=True, num=10, seed=123))

[183, 137, 146, 135, 181, 148, 129, 127, 126, 166]
