In [1]:
import os
import sys

SPARK_HOME = "/usr/hdp/current/spark2-client"
PYSPARK_PYTHON = "/opt/conda/envs/dsenv/bin/python"
os.environ["PYSPARK_PYTHON"]= PYSPARK_PYTHON
os.environ["SPARK_HOME"] = SPARK_HOME

PYSPARK_HOME = os.path.join(SPARK_HOME, "python/lib")
sys.path.insert(0, os.path.join(PYSPARK_HOME, "py4j-0.10.7-src.zip"))
sys.path.insert(0, os.path.join(PYSPARK_HOME, "pyspark.zip"))

In [2]:
import random
SPARK_UI_PORT = random.choice(range(10000, 11000))
print(f"Spark UI port: {SPARK_UI_PORT}")

Spark UI port: 10756


In [3]:
%pylab inline

Matplotlib is building the font cache; this may take a moment.


Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [4]:
MOVIELENS_USERS_FILE = "/datasets/spark/ml-100k/u.user"
LOG_FILE = "/datasets/spark/logsM.txt"
IPS_FILE = "/datasets/spark/ipDataM.txt"

## Dataframe API

### Dataframe:
+ структурированная колоночная структура данных
+ может быть создана на основе:
  - локальной коллекции
  - файла (файлов)
  - базы данных
+ в Python работает значительно быстрее, чем RDD
+ под капотом использует RDD
+ позволяет выполнять произвольные SQL операции с данными
+ аналогично RDD являются ленивыми и неизменяеыми

### Из чего состоит Dataframe?
+ схема [pyspsark.sql.StructType](https://spark.apache.org/docs/2.4.7/api/python/pyspark.sql.html#pyspark.sql.types.StructType)
+ колонки [pyspark.sql.Column](https://spark.apache.org/docs/2.4.7/api/python/pyspark.sql.html#pyspark.sql.Column)
+ данные [pyspark.sql.Row](https://spark.apache.org/docs/2.4.7/api/python/pyspark.sql.html#pyspark.sql.Row)

## Основной управляющий объект в Spark SQL - [SparkSession](https://spark.apache.org/docs/2.4.7/api/python/pyspark.sql.html#pyspark.sql.SparkSession)

In [5]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.ui.port", SPARK_UI_PORT)

spark = SparkSession.builder.config(conf=conf).appName("Spark SQL").getOrCreate()

In [6]:
spark

In [7]:
spark.sparkContext

## Как создать DataFrame?

### Прочитать данные из файла

In [8]:
spark.read

<pyspark.sql.readwriter.DataFrameReader at 0x7fde2bd5e110>

### Конвейер чтения данных
```python
spark.read\
     .format(...)\
     .option(key, value)\
     .option(key, value)\
     .load(path)
```

In [9]:
%%time
df = spark.read\
          .format("csv")\
          .option("sep", "|")\
          .load(MOVIELENS_USERS_FILE)

CPU times: user 2.19 ms, sys: 3.62 ms, total: 5.81 ms
Wall time: 11.7 s


In [10]:
%%time
rdd = spark.sparkContext.textFile(MOVIELENS_USERS_FILE)

CPU times: user 3.05 ms, sys: 0 ns, total: 3.05 ms
Wall time: 64 ms


### Откуда такая разница?

In [12]:
df

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string]

In [13]:
df.show(1)

+---+---+---+----------+-----+
|_c0|_c1|_c2|       _c3|  _c4|
+---+---+---+----------+-----+
|  1| 24|  M|technician|85711|
+---+---+---+----------+-----+
only showing top 1 row



In [14]:
rdd.take(1)

['1|24|M|technician|85711']

### Схема!

In [15]:
from pyspark.sql.types import *

In [16]:
schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("age", IntegerType()),
    StructField("gender", StringType()),
    StructField("occupation", StringType()),
    StructField("zip", IntegerType())
])

In [17]:
%%time
df = spark.read\
          .schema(schema)\
          .format("csv")\
          .option("sep", "|")\
          .load(MOVIELENS_USERS_FILE)

CPU times: user 2.89 ms, sys: 4 ms, total: 6.89 ms
Wall time: 35.1 ms


In [18]:
df

DataFrame[user_id: int, age: int, gender: string, occupation: string, zip: int]

In [19]:
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip: integer (nullable = true)



In [20]:
df.show(5)

+-------+---+------+----------+-----+
|user_id|age|gender|occupation|  zip|
+-------+---+------+----------+-----+
|      1| 24|     M|technician|85711|
|      2| 53|     F|     other|94043|
|      3| 23|     M|    writer|32067|
|      4| 24|     M|technician|43537|
|      5| 33|     F|     other|15213|
+-------+---+------+----------+-----+
only showing top 5 rows



### Схему также можно задавать в виде SQL DDL

In [21]:
ddl_schema = """
    user_id INT,
    age INT,
    gender STRING,
    occupation STRING,
    zip INT
"""

In [22]:
%%time
df = spark.read\
          .schema(ddl_schema)\
          .format("csv")\
          .option("sep", "|")\
          .load(MOVIELENS_USERS_FILE)

CPU times: user 5.21 ms, sys: 95 µs, total: 5.31 ms
Wall time: 220 ms


In [23]:
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- zip: integer (nullable = true)



### Наличие схемы оборачивается еще одним приятным бонусом - правильно работает описательная статистика.

In [24]:
df.summary().show()

+-------+-----------------+-----------------+------+-------------+------------------+
|summary|          user_id|              age|gender|   occupation|               zip|
+-------+-----------------+-----------------+------+-------------+------------------+
|  count|              925|              925|   925|          925|               925|
|   mean|470.2908108108108|34.06054054054054|  null|         null| 50868.78810810811|
| stddev|272.1030147185632|12.25807489536592|  null|         null|30891.373254138176|
|    min|                1|                7|     F|administrator|                 0|
|    25%|              236|               25|  null|         null|             21227|
|    50%|              469|               31|  null|         null|             53711|
|    75%|              705|               43|  null|         null|             78741|
|    max|              943|               73|     M|       writer|             99835|
+-------+-----------------+-----------------+------+--

### Что значит `nullable`?

In [25]:
tiny_schema = StructType(fields=[
    StructField("id", IntegerType()),
    StructField("value", StringType(), nullable=False)
])

In [26]:
df = spark.createDataFrame([[1, None], [3, "Hello"]], schema=tiny_schema)

ValueError: field value: This field is not nullable, but got None

In [27]:
tiny_schema = StructType(fields=[
    StructField("id", IntegerType()),
    StructField("value", StringType(), nullable=True)
])

In [28]:
df = spark.createDataFrame([[1, None], [3, "Hello"]], schema=tiny_schema)

In [29]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- value: string (nullable = true)



### На самом деле конвейер чтения выглядит так
```python
spark.read\
     .schema(schema)\
     .format(...)\
     .option(key, value)\
     .option(key, value)\
     .load(path)
```

### Для популярных источников есть удобные обертки

In [30]:
df = spark.read.csv(MOVIELENS_USERS_FILE, schema=schema, sep="|")

In [31]:
df

DataFrame[user_id: int, age: int, gender: string, occupation: string, zip: int]

In [32]:
df.show(5)

+-------+---+------+----------+-----+
|user_id|age|gender|occupation|  zip|
+-------+---+------+----------+-----+
|      1| 24|     M|technician|85711|
|      2| 53|     F|     other|94043|
|      3| 23|     M|    writer|32067|
|      4| 24|     M|technician|43537|
|      5| 33|     F|     other|15213|
+-------+---+------+----------+-----+
only showing top 5 rows



### А самое классное, что через один API можно работать с множеством источников!
+ CSV
+ JSON
+ Hive
+ HBase
+ Cassandra
+ MySQL
+ PostgreSQL
+ Parquet
+ ORC
+ Kafka
+ ElasticSearch
+ Amazon S3
+ ...and more through custom connectors

### DataFrame можно создать из RDD, pandas.DataFrame или iterable

In [33]:
rdd = spark.sparkContext.textFile(MOVIELENS_USERS_FILE).map(lambda x: x.split("|"))

In [34]:
rdd.take(5)

[['1', '24', 'M', 'technician', '85711'],
 ['2', '53', 'F', 'other', '94043'],
 ['3', '23', 'M', 'writer', '32067'],
 ['4', '24', 'M', 'technician', '43537'],
 ['5', '33', 'F', 'other', '15213']]

In [35]:
df = spark.createDataFrame(rdd)

### RDD нетипизирован и никакой схемы не имеет

In [36]:
df

DataFrame[_1: string, _2: string, _3: string, _4: string, _5: string]

In [37]:
df = spark.createDataFrame(rdd, schema=schema)

In [38]:
df

DataFrame[user_id: int, age: int, gender: string, occupation: string, zip: int]

In [39]:
df.show(5)

Py4JJavaError: An error occurred while calling o221.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 9.0 failed 4 times, most recent failure: Lost task 0.3 in stage 9.0 (TID 12, name2.ru-central1.internal, executor 7): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/serializers.py", line 400, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/session.py", line 730, in prepare
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1389, in verify
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1370, in verify_struct
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1389, in verify
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1315, in verify_integer
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1278, in verify_acceptable_types
TypeError: field user_id: IntegerType can not accept object '1' in type <class 'str'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1925)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1913)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1912)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1912)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:948)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:948)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2146)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2095)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2084)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:759)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/serializers.py", line 400, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/hadoop/yarn/local/usercache/vova-cmc/appcache/application_1615841176380_1289/container_e16_1615841176380_1289_01_000008/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/session.py", line 730, in prepare
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1389, in verify
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1370, in verify_struct
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1389, in verify
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1315, in verify_integer
  File "/usr/hdp/current/spark2-client/python/lib/pyspark.zip/pyspark/sql/types.py", line 1278, in verify_acceptable_types
TypeError: field user_id: IntegerType can not accept object '1' in type <class 'str'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


### Игнорировать верификацию схемы - плохая идея

In [40]:
df = spark.createDataFrame(rdd, schema=schema, verifySchema=False)

In [41]:
df.show(5)

+-------+----+------+----------+----+
|user_id| age|gender|occupation| zip|
+-------+----+------+----------+----+
|   null|null|     M|technician|null|
|   null|null|     F|     other|null|
|   null|null|     M|    writer|null|
|   null|null|     M|technician|null|
|   null|null|     F|     other|null|
+-------+----+------+----------+----+
only showing top 5 rows



### Нужно явно привести типы

In [42]:
rdd = rdd.map(lambda x: (int(x[0]), int(x[1]), x[2], x[3], int(x[4])))

In [43]:
%%time
df = spark.createDataFrame(rdd, schema=schema)

CPU times: user 14.3 ms, sys: 17 µs, total: 14.3 ms
Wall time: 30.3 ms


In [44]:
df.show(5)

+-------+---+------+----------+-----+
|user_id|age|gender|occupation|  zip|
+-------+---+------+----------+-----+
|      1| 24|     M|technician|85711|
|      2| 53|     F|     other|94043|
|      3| 23|     M|    writer|32067|
|      4| 24|     M|technician|43537|
|      5| 33|     F|     other|15213|
+-------+---+------+----------+-----+
only showing top 5 rows



## Работать будем со сгенерированным логом доступа

In [45]:
!hdfs dfs -tail $LOG_FILE

 x64; Trident/5.0; .NET CLR 3.5.30729;)
247.182.249.253	20140426165946	http://news.yandex.ru/7686791	1560	202	Opera/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729;)
197.72.248.141	20140426170846	http://news.yandex.ru/1949655	1175	404	Opera/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; chromeframe/12.0.742.112)
168.146.187.80	20140426171807	http://news.mail.ru/7107147	1020	434	Opera/5.0 compatible; MSIE 9.0; Windows NT 7.0; Trident/5.0; .NET CLR 2.2.50767;)
75.208.40.166	20140426180003	http://news.yandex.ru/4696319	526	449	Safari/5.0 compatible; MSIE 9.0; Windows NT 7.0; Trident/5.0; .NET CLR 2.2.50767;)
33.49.147.163	20140426182902	http://news.mail.ru/2829289	82	510	Opera/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729)
33.49.147.163	20140426191049	http://news.rambler.ru/4707594	1043	206	Opera/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Tride

In [46]:
log_schema = StructType(fields=[
    StructField("ip", StringType()),
    StructField("timestamp", LongType()),
    StructField("url", StringType()),
    StructField("size", IntegerType()),
    StructField("code", IntegerType()),
    StructField("ua", StringType())
])

In [47]:
log = spark.read.csv(LOG_FILE, sep="\t", schema=log_schema)

In [48]:
log

DataFrame[ip: string, timestamp: bigint, url: string, size: int, code: int, ua: string]

In [49]:
log.rdd.getNumPartitions()

1

In [50]:
log = log.repartition(4).cache()

In [51]:
log.show(5, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------
 ip        | 49.105.15.79                                                                                                   
 timestamp | 20140127041332                                                                                                 
 url       | http://lenta.ru/5567208                                                                                        
 size      | 184                                                                                                            
 code      | 509                                                                                                            
 ua        | Chrome/5.0 compatible; MSIE 9.0; Windows NT 7.0; Trident/5.0; .NET CLR 2.2.50767;)                             
-RECORD 1-------------------------------------------------------------------------------------------------------------------


## Проекции и фильтры
**Проекция** возвращает подмножество столбцов

**Фильтр** возвращает подмножество строк

In [52]:
log.select(["ip", "timestamp", "url"])

DataFrame[ip: string, timestamp: bigint, url: string]

In [53]:
log.select("ip", "code").show(5)

+--------------+----+
|            ip|code|
+--------------+----+
|  49.105.15.79| 509|
|222.131.187.37| 504|
|222.131.187.37| 502|
| 33.49.147.163| 409|
| 33.49.147.163| 409|
+--------------+----+
only showing top 5 rows



In [57]:
log.select(log.ip, log.code).show(5)

+--------------+----+
|            ip|code|
+--------------+----+
|  49.105.15.79| 509|
|222.131.187.37| 504|
|222.131.187.37| 502|
| 33.49.147.163| 409|
| 33.49.147.163| 409|
+--------------+----+
only showing top 5 rows



In [54]:
log.ip

Column<b'ip'>

## Зачем нужны столбцы-объекты? Потому что у них есть методы!

In [55]:
log.select(log.ip,
           log.code.alias("response")).show(5)

+--------------+--------+
|            ip|response|
+--------------+--------+
|  49.105.15.79|     509|
|222.131.187.37|     504|
|222.131.187.37|     502|
| 33.49.147.163|     409|
| 33.49.147.163|     409|
+--------------+--------+
only showing top 5 rows



In [56]:
import pyspark.sql.functions as f

In [57]:
log.select("ip", 
           f.col("code").alias("response")).show(5)

+--------------+--------+
|            ip|response|
+--------------+--------+
|  49.105.15.79|     509|
|222.131.187.37|     504|
|222.131.187.37|     502|
| 33.49.147.163|     409|
| 33.49.147.163|     409|
+--------------+--------+
only showing top 5 rows



## Pandas-like

In [58]:
log[["ip", "code"]].show(5)

+--------------+----+
|            ip|code|
+--------------+----+
|  49.105.15.79| 509|
|222.131.187.37| 504|
|222.131.187.37| 502|
| 33.49.147.163| 409|
| 33.49.147.163| 409|
+--------------+----+
only showing top 5 rows



In [59]:
log[[log.ip, log.code.alias("response")]].show(5)

+--------------+--------+
|            ip|response|
+--------------+--------+
|  49.105.15.79|     509|
|222.131.187.37|     504|
|222.131.187.37|     502|
| 33.49.147.163|     409|
| 33.49.147.163|     409|
+--------------+--------+
only showing top 5 rows



## Фильтрация

In [60]:
log.where("code = 200").show(5)

+--------------+--------------+--------------------+----+----+--------------------+
|            ip|     timestamp|                 url|size|code|                  ua|
+--------------+--------------+--------------------+----+----+--------------------+
| 75.208.40.166|20140326084231|http://news.yande...| 955| 200|Opera/5.0 (Window...|
| 75.208.40.166|20140408161249|http://news.mail....| 169| 200|Opera/5.0 (Window...|
|197.72.248.141|20140404122749|http://newsru.com...| 884| 200|Opera/5.0 (compat...|
|56.167.169.126|20140422220256|http://lenta.ru/3...| 563| 200|Chrome/5.0 (compa...|
|168.255.93.197|20140410032254|http://news.mail....| 931| 200|Safari/5.0 (compa...|
+--------------+--------------+--------------------+----+----+--------------------+
only showing top 5 rows



In [61]:
log.filter(log.code == 200).show(5, truncate=False)

+--------------+--------------+-----------------------------+----+----+-------------------------------------------------------------------------------------------------------------+
|ip            |timestamp     |url                          |size|code|ua                                                                                                           |
+--------------+--------------+-----------------------------+----+----+-------------------------------------------------------------------------------------------------------------+
|75.208.40.166 |20140326084231|http://news.yandex.ru/2002966|955 |200 |Opera/5.0 (Windows; U; MSIE 9.0; Windows NT 8.1; Trident/5.0; .NET4.0E; en-AU)                               |
|75.208.40.166 |20140408161249|http://news.mail.ru/7147672  |169 |200 |Opera/5.0 (Windows; U; MSIE 9.0; Windows NT 8.0; Win64; x64; Trident/5.0; .NET4.0E; en)                      |
|197.72.248.141|20140404122749|http://newsru.com/7833710    |884 |200 |Opera/5.0 (compatib

In [62]:
log.filter("code == 200 AND url LIKE '%rambler%'").show(5, truncate=False, vertical=True)

-RECORD 0-----------------------------------------------------------------------------------------------------
 ip        | 75.208.40.166                                                                                    
 timestamp | 20140216075757                                                                                   
 url       | http://news.rambler.ru/8637466                                                                   
 size      | 1337                                                                                             
 code      | 200                                                                                              
 ua        | Firefox/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729;) 
-RECORD 1-----------------------------------------------------------------------------------------------------
 ip        | 56.167.169.126                                                                                   
 

In [63]:
log.filter((log.code.isin([200, 404])) & (log.url.like("%rambler%"))).show(5)

+---------------+--------------+--------------------+----+----+--------------------+
|             ip|     timestamp|                 url|size|code|                  ua|
+---------------+--------------+--------------------+----+----+--------------------+
|   3.183.113.77|20140416052008|http://news.rambl...| 363| 404|Safari/5.0 (Windo...|
|  75.208.40.166|20140216075757|http://news.rambl...|1337| 200|Firefox/5.0 (comp...|
| 56.167.169.126|20140419115529|http://news.rambl...|1495| 200|Opera/5.0 (compat...|
| 110.91.102.196|20140314120938|http://news.rambl...| 816| 404|Opera/5.0 (compat...|
|247.182.249.253|20140411154111|http://news.rambl...|1418| 200|Chrome/5.0 (compa...|
+---------------+--------------+--------------------+----+----+--------------------+
only showing top 5 rows



## Pandas

In [64]:
log[(log.code == 200) & (log.url.like("%rambler%"))].show(5)

+---------------+--------------+--------------------+----+----+--------------------+
|             ip|     timestamp|                 url|size|code|                  ua|
+---------------+--------------+--------------------+----+----+--------------------+
|  75.208.40.166|20140216075757|http://news.rambl...|1337| 200|Firefox/5.0 (comp...|
| 56.167.169.126|20140419115529|http://news.rambl...|1495| 200|Opera/5.0 (compat...|
|247.182.249.253|20140411154111|http://news.rambl...|1418| 200|Chrome/5.0 (compa...|
|  33.49.147.163|20140131095711|http://news.rambl...|1908| 200|Safari/5.0 (Windo...|
|   25.62.10.220|20140107010805|http://news.rambl...|1284| 200|Chrome/5.0 compat...|
+---------------+--------------+--------------------+----+----+--------------------+
only showing top 5 rows



## И все вместе

In [65]:
log[(log.code == 200) & (log.url.like("%rambler%"))][["ip", "code"]].show(5)

+---------------+----+
|             ip|code|
+---------------+----+
|  75.208.40.166| 200|
| 56.167.169.126| 200|
|247.182.249.253| 200|
|  33.49.147.163| 200|
|   25.62.10.220| 200|
+---------------+----+
only showing top 5 rows



## А SQL-то можно писать?

In [66]:
query = """
SELECT ip, code FROM log_table 
WHERE code == 200 AND url LIKE '%rambler%'
"""

In [67]:
spark.sql(query).show(5)

AnalysisException: 'Table or view not found: log_table; line 2 pos 21'

### При запуске на YARN `SparkSession` автоматически запускается с поддержкой HIVE. Поэтому надо зарегистрировать `DataFrame` как таблицу во внутреннем каталоге Spark SQL

In [68]:
log.createOrReplaceTempView("log_table")

In [69]:
spark.catalog.listTables()

[Table(name='log_table', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [70]:
spark.sql(query).show(5)

+---------------+----+
|             ip|code|
+---------------+----+
|  75.208.40.166| 200|
| 56.167.169.126| 200|
|247.182.249.253| 200|
|  33.49.147.163| 200|
|   25.62.10.220| 200|
+---------------+----+
only showing top 5 rows



### Какая разница между Temp view и Global Temp View?
`SparkSession` - не singleton в отличие от `SparkContext`

In [71]:
spark2 = spark.newSession()

In [72]:
spark2 is spark

False

In [73]:
spark2.sparkContext is spark.sparkContext

True

## Функции
Не все вычисления можно реализовать стандартным SQL. Здесь на помощь приходят функции (встроенные или пользовательские). Встроенные функции находятся в модуле [`pyspark.sql.functions`](https://spark.apache.org/docs/2.4.7/api/python/pyspark.sql.html#module-pyspark.sql.functions) 

In [74]:
log.select("ua", f.length("ua")).show(5)

+--------------------+----------+
|                  ua|length(ua)|
+--------------------+----------+
|Chrome/5.0 compat...|        82|
|Chrome/5.0 (compa...|        95|
|Safari/5.0 (Windo...|        88|
|Firefox/5.0 (comp...|       110|
|Chrome/5.0 compat...|        82|
+--------------------+----------+
only showing top 5 rows



### Функции возвращают объект типа `Column`

In [75]:
f.length("ua")

Column<b'length(ua)'>

In [76]:
log.select("ua", f.length("ua").alias("length")).show(5)

+--------------------+------+
|                  ua|length|
+--------------------+------+
|Chrome/5.0 compat...|    82|
|Chrome/5.0 (compa...|    95|
|Safari/5.0 (Windo...|    88|
|Firefox/5.0 (comp...|   110|
|Chrome/5.0 compat...|    82|
+--------------------+------+
only showing top 5 rows



### Довольно часто возникает ошибка с неверными именами столбцов

In [77]:
log.select(f.concat("url", "?utm_medium=email")).show(5)

AnalysisException: "cannot resolve '`?utm_medium=email`' given input columns: [timestamp, size, code, ua, ip, url];;\n'Project [concat(url#583, '?utm_medium=email) AS concat(url, ?utm_medium=email)#1299]\n+- Repartition 4, true\n   +- Relation[ip#581,timestamp#582L,url#583,size#584,code#585,ua#586] csv\n"

### Нужно убедиться в правильности имени или типа

In [78]:
log.select(f.concat("url", f.lit("?utm_medium=email")).alias("newurl")).show(5, False)

+----------------------------------------------+
|newurl                                        |
+----------------------------------------------+
|http://lenta.ru/5567208?utm_medium=email      |
|http://news.mail.ru/7703130?utm_medium=email  |
|http://lenta.ru/1035910?utm_medium=email      |
|http://news.yandex.ru/3336146?utm_medium=email|
|http://newsru.com/9238321?utm_medium=email    |
+----------------------------------------------+
only showing top 5 rows



## Взрывы!
Посчитаем word count

In [79]:
log.select("ua", f.split("ua", " ").alias("word_list")).show(5, False, True)

-RECORD 0-----------------------------------------------------------------------------------------------------------------------------------
 ua        | Chrome/5.0 compatible; MSIE 9.0; Windows NT 7.0; Trident/5.0; .NET CLR 2.2.50767;)                                             
 word_list | [Chrome/5.0, compatible;, MSIE, 9.0;, Windows, NT, 7.0;, Trident/5.0;, .NET, CLR, 2.2.50767;)]                                 
-RECORD 1-----------------------------------------------------------------------------------------------------------------------------------
 ua        | Chrome/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729;)                                
 word_list | [Chrome/5.0, (compatible;, MSIE, 9.0;, Windows, NT, 6.1;, Win64;, x64;, Trident/5.0;, .NET, CLR, 3.5.30729;)]                  
-RECORD 2-----------------------------------------------------------------------------------------------------------------------------------
 ua        | 

### Прелесть ленивых вычислений и строгой типизации

In [80]:
log.select("ua", f.split("ua", " ").alias("word_list")).printSchema()

root
 |-- ua: string (nullable = true)
 |-- word_list: array (nullable = true)
 |    |-- element: string (containsNull = true)



### К элементам сложных типов можно получить доступ

In [82]:
log.select("ua", f.split("ua", " ").alias("word_list"))\
   .select(f.col("word_list")[0].alias("first_word"), f.col("word_list")[1].alias("second_word"))\
   .show(5)

+-----------+------------+
| first_word| second_word|
+-----------+------------+
| Chrome/5.0| compatible;|
| Chrome/5.0|(compatible;|
| Safari/5.0|   (Windows;|
|Firefox/5.0|(compatible;|
| Chrome/5.0| compatible;|
+-----------+------------+
only showing top 5 rows



In [83]:
log.select("ua", f.split("ua", " ").alias("word_list"))\
   .select(f.explode("word_list").alias("word"))\
   .groupby("word").count()\
   .orderBy("count", ascending=False)\
   .show(5)

+------------+-----+
|        word|count|
+------------+-----+
|        MSIE|10092|
|        9.0;|10092|
|          NT|10092|
|     Windows|10092|
|Trident/5.0;| 9121|
+------------+-----+
only showing top 5 rows



### Самое время посмотреть в Spark UI!

## Joins

In [84]:
!hdfs dfs -tail $IPS_FILE

96.67	Khanty–Mansi
197.72.248.141	Chechnya
33.49.147.163	Nizhny Novgorod Oblast
56.167.169.126	Voronezh Oblast
3.183.113.77	Astrakhan Oblast
56.167.169.126	Tver Oblast
56.167.169.126	Kabardino-Balkaria
56.167.169.126	Nenets Autonomous Okrug
33.49.147.163	Omsk Oblast
14.8.59.211	Khabarovsk Krai
75.208.40.166	Sakha
135.124.143.193	Samara Oblast
75.208.40.166	Novosibirsk Oblast
75.208.40.166	Amur Oblast
75.208.40.166	Karelia
75.208.40.166	Saint Petersburg
181.217.177.35	Samara Oblast
33.49.147.163	Irkutsk Oblast
56.167.169.126	Lipetsk Oblast
181.217.177.35	Kalmykia
168.255.93.197	Volgograd Oblast
168.255.93.197	Oryol Oblast
168.255.93.197	Kurgan Oblast
168.146.187.80	Primorsky Krai
49.105.15.79	North Ossetia–Alania
197.72.248.141	Stavropol Krai
14.8.59.211	Kemerovo Oblast
49.203.96.67	Ulyanovsk Oblast
222.131.187.37	Yamalo-Nenets
197.72.248.141	Zabaykalsky Krai
222.131.187.37	Kaluga Oblast
3.183.113.77	Saratov Oblast
168.255.93.197	Astrakhan Oblast
75.208.

In [85]:
ip_schema = StructType(fields=[
    StructField("ip", StringType()),
    StructField("region", StringType())
])

In [86]:
ips = spark.read.csv(IPS_FILE, schema=ip_schema, sep="\t").cache()

In [87]:
ips.show(5)

+--------------+------------------+
|            ip|            region|
+--------------+------------------+
|  49.105.15.79|              Komi|
|110.91.102.196|Chelyabinsk Oblast|
|56.167.169.126|  Saint Petersburg|
| 75.208.40.166|  Ulyanovsk Oblast|
|168.255.93.197|    Irkutsk Oblast|
+--------------+------------------+
only showing top 5 rows



### Трюк для отключения автоматической оптимизации

In [88]:
spark.sql("SET spark.sql.autoBroadcastJoinThreshold = -1")

DataFrame[key: string, value: string]

In [89]:
log_with_regions = log.join(ips, on="ip", how="inner")

In [90]:
log_with_regions

DataFrame[ip: string, timestamp: bigint, url: string, size: int, code: int, ua: string, region: string]

In [91]:
log_with_regions.show(5)

+------------+--------------+--------------------+----+----+--------------------+-------------------+
|          ip|     timestamp|                 url|size|code|                  ua|             region|
+------------+--------------+--------------------+----+----+--------------------+-------------------+
|3.183.113.77|20140416052008|http://news.rambl...| 363| 404|Safari/5.0 (Windo...|           Chukotka|
|3.183.113.77|20140416052008|http://news.rambl...| 363| 404|Safari/5.0 (Windo...|     Ivanovo Oblast|
|3.183.113.77|20140416052008|http://news.rambl...| 363| 404|Safari/5.0 (Windo...|          Tatarstan|
|3.183.113.77|20140416052008|http://news.rambl...| 363| 404|Safari/5.0 (Windo...|Karachay–Cherkessia|
|3.183.113.77|20140416052008|http://news.rambl...| 363| 404|Safari/5.0 (Windo...|   Yaroslavl Oblast|
+------------+--------------+--------------------+----+----+--------------------+-------------------+
only showing top 5 rows



### Можно делать не только equi-join, но и по произвольному выражению

In [92]:
ips_with_ts = ips.withColumn("timestamp", f.lit(20140127041332).cast("bigint"))

In [93]:
log.join(ips_with_ts, on=((log.ip == ips_with_ts.ip) & (log.timestamp > ips_with_ts.timestamp))).count()

6314928

In [94]:
log_with_regions.count()

8214603

### А что там с партициями?

In [97]:
(log.rdd.getNumPartitions(),
 ips.rdd.getNumPartitions(),
 log_with_regions.rdd.getNumPartitions())

(4, 1, 4)

In [99]:
spark.conf.get("spark.sql.shuffle.partitions")

'200'

In [96]:
log_with_regions = log_with_regions.coalesce(4).cache()

### По-умолчанию Spark SQL использует алгоритм SortMergeJoin

In [98]:
log_with_regions.explain(extended=True)

== Parsed Logical Plan ==
Repartition 4, false
+- Project [ip#581, timestamp#582L, url#583, size#584, code#585, ua#586, region#1518]
   +- Join Inner, (ip#581 = ip#1517)
      :- Repartition 4, true
      :  +- Relation[ip#581,timestamp#582L,url#583,size#584,code#585,ua#586] csv
      +- Relation[ip#1517,region#1518] csv

== Analyzed Logical Plan ==
ip: string, timestamp: bigint, url: string, size: int, code: int, ua: string, region: string
Repartition 4, false
+- Project [ip#581, timestamp#582L, url#583, size#584, code#585, ua#586, region#1518]
   +- Join Inner, (ip#581 = ip#1517)
      :- Repartition 4, true
      :  +- Relation[ip#581,timestamp#582L,url#583,size#584,code#585,ua#586] csv
      +- Relation[ip#1517,region#1518] csv

== Optimized Logical Plan ==
InMemoryRelation [ip#581, timestamp#582L, url#583, size#584, code#585, ua#586, region#1518], StorageLevel(disk, memory, deserialized, 1 replicas)
   +- Coalesce 4
      +- *(5) Project [ip#581, timestamp#582L, url#583, size#584,

### Если одна из таблиц мала, то можно реализовать map-side join через broadcast

In [100]:
log_with_regions = log.join(f.broadcast(ips), on="ip", how="inner")

In [101]:
log_with_regions.explain()

== Physical Plan ==
*(2) Project [ip#581, timestamp#582L, url#583, size#584, code#585, ua#586, region#1518]
+- *(2) BroadcastHashJoin [ip#581], [ip#1517], Inner, BuildRight
   :- *(2) Filter isnotnull(ip#581)
   :  +- InMemoryTableScan [ip#581, timestamp#582L, url#583, size#584, code#585, ua#586], [isnotnull(ip#581)]
   :        +- InMemoryRelation [ip#581, timestamp#582L, url#583, size#584, code#585, ua#586], StorageLevel(disk, memory, deserialized, 1 replicas)
   :              +- Exchange RoundRobinPartitioning(4)
   :                 +- *(1) FileScan csv [ip#581,timestamp#582L,url#583,size#584,code#585,ua#586] Batched: false, Format: CSV, Location: InMemoryFileIndex[hdfs://name1.ru-central1.internal:8020/datasets/spark/logsM.txt], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ip:string,timestamp:bigint,url:string,size:int,code:int,ua:string>
   +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, false]))
      +- *(1) Filter isnotnull(ip#1517)
     

## Аггрегация
```python
df.groupBy(*cols)\
  .agg(*expressions)
```

In [102]:
log_with_regions.groupBy("region")\
                .agg(f.count("ip").alias("count"))\
                .orderBy("count", ascending=False)\
                .show(10)

+------------------+------+
|            region| count|
+------------------+------+
|  Ulyanovsk Oblast|204275|
|            Jewish|134523|
|  Saint Petersburg|129362|
|Arkhangelsk Oblast|124937|
|    Vologda Oblast|122363|
|   Novgorod Oblast|122306|
|     Moscow Oblast|120336|
|  Krasnoyarsk Krai|119285|
|              Komi|117659|
|          Kalmykia|117172|
+------------------+------+
only showing top 10 rows



In [103]:
log_with_regions.groupBy("region")\
                .count()\
                .withColumnRenamed("count", "row_count")\
                .orderBy("row_count", ascending=False)\
                .show(10)

+------------------+---------+
|            region|row_count|
+------------------+---------+
|  Ulyanovsk Oblast|   204275|
|            Jewish|   134523|
|  Saint Petersburg|   129362|
|Arkhangelsk Oblast|   124937|
|    Vologda Oblast|   122363|
|   Novgorod Oblast|   122306|
|     Moscow Oblast|   120336|
|  Krasnoyarsk Krai|   119285|
|              Komi|   117659|
|          Kalmykia|   117172|
+------------------+---------+
only showing top 10 rows



In [105]:
length_stat = log_with_regions.groupBy(f.length("url").alias("url_length"))\
                              .agg(f.count("*").alias("row_count"))\
                              .orderBy("row_count", ascending=False)\
                              .toPandas()

In [108]:
length_stat

Unnamed: 0,url_length,row_count
0,23,1676363
1,27,1644000
2,25,1639043
3,29,1638174
4,30,1617023


## UDF (User Defined Functions)
The function type of the UDF can be one of the following:
+ **SCALAR**. A scalar UDF defines a transformation: One or more `pandas.Series` -> A `pandas.Series`. calar UDFs are used with `pyspark.sql.DataFrame.withColumn()` and `pyspark.sql.DataFrame.select()`
+ **GROUPED_MAP**. A grouped map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame`. Grouped map UDFs are used with `pyspark.sql.GroupedData.apply()`

In [109]:
@f.pandas_udf(StringType())
def encode_http_status(codes):
    mapping = {
        1: "info",
        2: "success",
        3: "redirect",
        4: "client error",
        5: "server error"
    }
    return (codes // 100).replace(mapping)

In [110]:
log.withColumn("http_status", encode_http_status("code"))\
   .groupBy("http_status").count().show()

+------------+-----+
| http_status|count|
+------------+-----+
|     success| 1422|
|    redirect| 1360|
|client error| 4806|
|server error| 1897|
|        info|  607|
+------------+-----+



In [111]:
spark.stop()