### Установка и настройка PySpark

In [None]:
# Установим PySpark, если он не установлен
!pip install pyspark

### Импорт необходимых библиотек

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, lit
from pyspark.sql.types import StringType, IntegerType

### Создание SparkSession

In [3]:
spark = SparkSession.builder \
    .appName("PySpark Tutorial") \
    .getOrCreate()

### Пример данных и создание DataFrame

In [4]:
data = [
    ("John", "Sales", 5000),
    ("Mike", "Sales", 4600),
    ("Sara", "HR", 4000),
    ("Jen", "HR", 4500),
    ("Jeff", "Engineering", 7800),
    ("Tom", "Engineering", 7300)
]
columns = ["Name", "Department", "Salary"]

df = spark.createDataFrame(data, schema=columns)
df.show()

+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|John|      Sales|  5000|
|Mike|      Sales|  4600|
|Sara|         HR|  4000|
| Jen|         HR|  4500|
|Jeff|Engineering|  7800|
| Tom|Engineering|  7300|
+----+-----------+------+



### Выбор и переименование колонок

In [5]:
# Выбор нескольких колонок
df.select("Name", "Salary").show()

# Переименование колонки
df_renamed = df.withColumnRenamed("Department", "Dept")
df_renamed.show()

+----+------+
|Name|Salary|
+----+------+
|John|  5000|
|Mike|  4600|
|Sara|  4000|
| Jen|  4500|
|Jeff|  7800|
| Tom|  7300|
+----+------+

+----+-----------+------+
|Name|       Dept|Salary|
+----+-----------+------+
|John|      Sales|  5000|
|Mike|      Sales|  4600|
|Sara|         HR|  4000|
| Jen|         HR|  4500|
|Jeff|Engineering|  7800|
| Tom|Engineering|  7300|
+----+-----------+------+



### Добавление и удаление колонок

In [8]:
# Добавление новой колонки с фиксированным значением
df_with_constant = df.withColumn("Country", lit("USA"))
df_with_constant.show()

# Удаление колонки
df_dropped = df.drop("Department")
df_dropped.show()

+----+-----------+------+-------+
|Name| Department|Salary|Country|
+----+-----------+------+-------+
|John|      Sales|  5000|    USA|
|Mike|      Sales|  4600|    USA|
|Sara|         HR|  4000|    USA|
| Jen|         HR|  4500|    USA|
|Jeff|Engineering|  7800|    USA|
| Tom|Engineering|  7300|    USA|
+----+-----------+------+-------+

+----+------+
|Name|Salary|
+----+------+
|John|  5000|
|Mike|  4600|
|Sara|  4000|
| Jen|  4500|
|Jeff|  7800|
| Tom|  7300|
+----+------+



### Фильтрация данных

In [9]:
# Фильтрация строк с условием
filtered_df = df.filter(df.Salary > 5000)
filtered_df.show()

# Фильтрация с использованием SQL выражения
df.filter("Salary > 5000").show()

+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|Jeff|Engineering|  7800|
| Tom|Engineering|  7300|
+----+-----------+------+

+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|Jeff|Engineering|  7800|
| Tom|Engineering|  7300|
+----+-----------+------+



### Сортировка данных

In [10]:
# Сортировка по возрастанию
df_sorted_asc = df.sort("Salary")
df_sorted_asc.show()

# Сортировка по убыванию
df_sorted_desc = df.sort(df.Salary.desc())
df_sorted_desc.show()

+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|Sara|         HR|  4000|
| Jen|         HR|  4500|
|Mike|      Sales|  4600|
|John|      Sales|  5000|
| Tom|Engineering|  7300|
|Jeff|Engineering|  7800|
+----+-----------+------+

+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|Jeff|Engineering|  7800|
| Tom|Engineering|  7300|
|John|      Sales|  5000|
|Mike|      Sales|  4600|
| Jen|         HR|  4500|
|Sara|         HR|  4000|
+----+-----------+------+



### Агрегация данных

In [11]:
# Посчитаем максимальную зарплату по каждому отделу
df_max_salary = df.groupBy("Department").max("Salary")
df_max_salary.show()

# Подсчёт количества сотрудников в каждом отделе
df_count = df.groupBy("Department").count()
df_count.show()

+-----------+-----------+
| Department|max(Salary)|
+-----------+-----------+
|      Sales|       5000|
|         HR|       4500|
|Engineering|       7800|
+-----------+-----------+

+-----------+-----+
| Department|count|
+-----------+-----+
|      Sales|    2|
|         HR|    2|
|Engineering|    2|
+-----------+-----+



### Работа с NULL значениями

In [12]:
# Заполнение NULL значений в колонке 'Salary' значением 0
df_filled = df.fillna({'Salary': 0})
df_filled.show()

# Удаление строк с NULL значениями в любых колонках
df_no_nulls = df.dropna()
df_no_nulls.show()

+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|John|      Sales|  5000|
|Mike|      Sales|  4600|
|Sara|         HR|  4000|
| Jen|         HR|  4500|
|Jeff|Engineering|  7800|
| Tom|Engineering|  7300|
+----+-----------+------+

+----+-----------+------+
|Name| Department|Salary|
+----+-----------+------+
|John|      Sales|  5000|
|Mike|      Sales|  4600|
|Sara|         HR|  4000|
| Jen|         HR|  4500|
|Jeff|Engineering|  7800|
| Tom|Engineering|  7300|
+----+-----------+------+



### Объединение DataFrame

In [13]:
# Создание второго DataFrame для примера
data2 = [("Jake", "Finance", 6100), ("Linda", "Finance", 6400)]
columns2 = ["Name", "Department", "Salary"]

df2 = spark.createDataFrame(data2, schema=columns2)

# Объединение двух DataFrame
df_union = df.union(df2)
df_union.show()

+-----+-----------+------+
| Name| Department|Salary|
+-----+-----------+------+
| John|      Sales|  5000|
| Mike|      Sales|  4600|
| Sara|         HR|  4000|
|  Jen|         HR|  4500|
| Jeff|Engineering|  7800|
|  Tom|Engineering|  7300|
| Jake|    Finance|  6100|
|Linda|    Finance|  6400|
+-----+-----------+------+



### Соединение (Join) DataFrame

In [14]:
# Создание второго DataFrame для примера соединения
data3 = [("John", "New York"), ("Mike", "Los Angeles"), ("Sara", "Chicago")]
columns3 = ["Name", "City"]

df3 = spark.createDataFrame(data3, schema=columns3)

# Соединение по колонке 'Name'
df_joined = df.join(df3, on="Name", how="inner")
df_joined.show()

+----+----------+------+-----------+
|Name|Department|Salary|       City|
+----+----------+------+-----------+
|John|     Sales|  5000|   New York|
|Mike|     Sales|  4600|Los Angeles|
|Sara|        HR|  4000|    Chicago|
+----+----------+------+-----------+



### Группировка

In [17]:
# Посчитаем среднюю зарплату по каждому отделу
df_grouped = df.groupBy("Department").avg("Salary")
df_grouped.show()

+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|      Sales|     4800.0|
|         HR|     4250.0|
|Engineering|     7550.0|
+-----------+-----------+



### Группировка с несколькими агрегатами

In [15]:
# Одновременное применение нескольких агрегаций
df_agg = df.groupBy("Department").agg(
    {'Salary': 'avg', 'Salary': 'max', 'Salary': 'min'}
)
df_agg.show()

+-----------+-----------+
| Department|min(Salary)|
+-----------+-----------+
|      Sales|       4600|
|         HR|       4000|
|Engineering|       7300|
+-----------+-----------+



### Использование SQL запросов

In [16]:
df.createOrReplaceTempView("employees")

# Выполнение SQL запроса
df_sql = spark.sql("SELECT Department, AVG(Salary) as Avg_Salary FROM employees GROUP BY Department")
df_sql.show()

+-----------+----------+
| Department|Avg_Salary|
+-----------+----------+
|      Sales|    4800.0|
|         HR|    4250.0|
|Engineering|    7550.0|
+-----------+----------+



### Создание и использование UDF

In [18]:
# Создадим функцию, которая будет добавлять к зарплате бонус в зависимости от отдела
def calculate_bonus(department):
    if department == "Sales":
        return 500
    elif department == "HR":
        return 300
    elif department == "Engineering":
        return 700
    return 0

In [19]:
# Регистрируем UDF в PySpark
bonus_udf = udf(calculate_bonus, IntegerType())

In [20]:
# Применяем UDF к колонке 'Department' и добавляем новую колонку 'Bonus'
df_with_bonus = df.withColumn("Bonus", bonus_udf(col("Department")))
df_with_bonus.show()

+----+-----------+------+-----+
|Name| Department|Salary|Bonus|
+----+-----------+------+-----+
|John|      Sales|  5000|  500|
|Mike|      Sales|  4600|  500|
|Sara|         HR|  4000|  300|
| Jen|         HR|  4500|  300|
|Jeff|Engineering|  7800|  700|
| Tom|Engineering|  7300|  700|
+----+-----------+------+-----+

