In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [3]:
# Obtain the notebook's SparkSession. `getOrCreate()` hands back an already
# active session if there is one, otherwise it builds a new one whose
# application name is "tmp".
spark = SparkSession.builder.appName("tmp").getOrCreate()

In [22]:
sc = spark.sparkContext

In [23]:
# Configuration of the running SparkContext.
# NOTE: `conf` is rebound further down (the "SparkConf" section) to a
# separately constructed SparkConf, so the meaning of this name depends on
# which cell ran last.
conf = sc.getConf()

In [24]:
conf.getAll()

[('spark.driver.host', '192.168.0.114'),
 ('spark.driver.extraClassPath',
  '/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/jars/sqlite-jdbc-3.27.2.1.jar'),
 ('spark.executor.id', 'driver'),
 ('spark.sql.warehouse.dir', '/tmp/hive/spark-warehouse'),
 ('spark.executor.extraClassPath',
  '/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/jars/sqlite-jdbc-3.27.2.1.jar'),
 ('spark.app.id', 'local-1621823740311'),
 ('spark.app.name', 'tmp'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '37147'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
# Demo frame: spark.range(100) yields a single-column DataFrame with 100 rows
# (values start at 0); toDF("cnt") renames that one column to "cnt".
df = spark.range(100).toDF("cnt")

In [5]:
from pyspark.sql.functions import udf
def _age_bucket(age):
    """Return the age-bracket label for a single numeric ``age``.

    Brackets are half-open on the right: ``< 20``, ``20-40``, ``40-60``,
    ``60-80`` and ``80+``.

    Args:
        age: A number, or ``None`` when the source column value is NULL.

    Returns:
        The bracket label as a string; ``None`` when ``age`` is ``None`` so a
        NULL input stays NULL instead of raising ``TypeError`` on the
        ``<`` comparison; ``''`` when no bracket matches (e.g. NaN, for which
        every comparison is false).
    """
    if age is None:
        # Spark passes SQL NULL to Python UDFs as None; comparing None with
        # an int would raise TypeError and fail the whole job.
        return None
    if age < 20:
        return '< 20'
    if age < 40:
        return '20-40'
    if age < 60:
        return '40-60'
    if age < 80:
        return '60-80'
    if age >= 80:
        return '80+'
    # Only reachable when every comparison above is false (NaN).
    return ''


# Column-level wrapper around _age_bucket. No returnType is given, so the UDF
# uses pyspark's default return type (string).
age_range = udf(_age_bucket)

In [6]:
# Attach the bracket label for every `cnt` value as a new string column
# `cnt_bin`. `df` is rebound to the widened frame; withColumn replaces a
# column of the same name, so re-running this cell does not stack columns.
df = df.withColumn('cnt_bin', age_range(df.cnt))

In [7]:
df.show()

+---+-------+
|cnt|cnt_bin|
+---+-------+
|  0|   < 20|
|  1|   < 20|
|  2|   < 20|
|  3|   < 20|
|  4|   < 20|
|  5|   < 20|
|  6|   < 20|
|  7|   < 20|
|  8|   < 20|
|  9|   < 20|
| 10|   < 20|
| 11|   < 20|
| 12|   < 20|
| 13|   < 20|
| 14|   < 20|
| 15|   < 20|
| 16|   < 20|
| 17|   < 20|
| 18|   < 20|
| 19|   < 20|
+---+-------+
only showing top 20 rows



### SparkConf

In [8]:
from pyspark import SparkConf, SparkContext

In [9]:
# Build a standalone SparkConf (this rebinds `conf`, which earlier held the
# running context's configuration). With its default arguments SparkConf()
# loads spark.* values from the JVM's system properties, which is why keys
# such as spark.master are already populated in the outputs below.
conf = SparkConf()

In [12]:
print(conf.get("spark.executor.memory"))

None


In [18]:
# Sets the key on this SparkConf object only and returns the same object
# (hence the SparkConf repr shown as the cell output).
# NOTE(review): nothing in this notebook passes `conf` to a SparkContext or
# SparkSession, so this presumably does not change the executor memory of the
# session that is already running - confirm if that was the intent.
# NOTE: the execution count shows this cell ran after the lookups displayed
# below it, so the notebook order is not the execution order.
conf.set("spark.executor.memory", "4g")

<pyspark.conf.SparkConf at 0x7fb0e746d940>

In [13]:
print(conf.get("spark.master"))

local[*]


In [14]:
print(conf.get("spark.eventLog.enabled"))

None


In [15]:
print(conf.get("spark.serializer"))

None


In [20]:
print(conf.get("spark.driver.cores"))

None


In [21]:
print(conf.get("spark.driver.memory"))

None


In [19]:
conf.getAll()

[('spark.app.name', 'tmp'),
 ('spark.executor.memory', '4g'),
 ('spark.driver.extraClassPath',
  '/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/jars/sqlite-jdbc-3.27.2.1.jar'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.sql.warehouse.dir', '/tmp/hive/spark-warehouse'),
 ('spark.executor.extraClassPath',
  '/home/wengong/spark/spark-3.0.1-bin-hadoop2.7/jars/sqlite-jdbc-3.27.2.1.jar'),
 ('spark.ui.showConsoleProgress', 'true')]

### instr() function

In [14]:
# One-row frame with a single string column `s` (rebinds `df`, which earlier
# held the cnt / cnt_bin frame).
df = spark.createDataFrame([('abcd',)], ['s',])
# instr() reports the position of the first occurrence of the substring,
# counting from 1 ('c' in 'abcd' gives 3, as the output further down shows).
# collect() brings the result back to the driver as a Python list of Rows.
res = df.select(F.instr(df.s, 'c').alias('ipos')).collect()

In [15]:
type(res)

list

In [16]:
res[0].ipos

3

In [25]:
data = """
hello spark
unified analytics
distributed processing engine
"""

In [28]:
# Break `data` into lines and each line into space-separated tokens.
# `data` starts and ends with a newline, so the first and last entries of the
# result are [''] (one empty token each).
x = [line.split(" ") for line in data.split("\n")]

In [29]:
x

[[''],
 ['hello', 'spark'],
 ['unified', 'analytics'],
 ['distributed', 'processing', 'engine'],
 ['']]

In [30]:
x = [21.0, 6.0, 160.0, 110.0, 3.9, 2.62, 16.46, 0.0, 1.0, 4.0, 4.0]

In [31]:
import numpy as np

In [32]:
np.mean(x)

29.90727272727273

In [33]:
def avg(x):
    """Return the arithmetic mean of the values in ``x``.

    The values are accumulated one by one, left to right, starting from the
    integer 0, and the total is divided by ``len(x)``.

    Args:
        x: A sized iterable of numbers (it must support ``len()``).

    Returns:
        The total of the elements divided by the number of elements.

    Raises:
        ZeroDivisionError: If ``x`` is empty.
    """
    running_total = 0
    for value in x:
        running_total += value
    n_items = len(x)
    return running_total / n_items

In [34]:
avg(x)

29.907272727272723