In [None]:
import os
import socket
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# S3
aws_access_key = "aws_access_key"
aws_secret_key = "aws_secret_key"
s3_bucket = "datasets"
s3_endpoint_url = "private"

APACHE_MASTER_IP = socket.gethostbyname("apache-spark-master-0.apache-spark-headless.apache-spark.svc.cluster.local")
APACHE_MASTER_URL = f"spark://{APACHE_MASTER_IP}:7077"
POD_IP = os.environ["MY_POD_IP"]

SPARK_APP_NAME = f"spark-{os.environ['HOSTNAME']}"  
JARS = """/nfs/env/lib/python3.8/site-packages/pyspark/jars/clickhouse-native-jdbc-shaded-2.6.5.jar, 
/nfs/env/lib/python3.8/site-packages/pyspark/jars/hadoop-aws-3.3.4.jar,
/nfs/env/lib/python3.8/site-packages/pyspark/jars/aws-java-sdk-bundle-1.12.433.jar,
/nfs/env/lib/python3.8/site-packages/pyspark/jars/postgresql-42.7.4.jar
"""

MEM = "512m"  
CORES = 1  

spark = SparkSession. \
    builder. \
    appName(SPARK_APP_NAME). \
    master("local"). \
    config("spark.executor.memory", MEM). \
    config("spark.jars", JARS). \
    config("spark.executor.cores", CORES). \
    config("spark.hadoop.fs.s3a.endpoint", s3_endpoint_url). \
    config("spark.hadoop.fs.s3a.access.key", aws_access_key). \
    config("spark.hadoop.fs.s3a.secret.key", aws_secret_key). \
    config("fs.s3a.endpoint", s3_endpoint_url). \
    config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"). \
    config("spark.hadoop.fs.s3a.path.style.access", True). \
    config("spark.hadoop.fs.s3a.committer.name", "directory"). \
    config("spark.hadoop.fs.s3a.aws.credentials.provider",
           "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"). \
    config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"). \
    getOrCreate()



25/08/05 13:37:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
# Чтение данных из S3
users_df = spark.read.csv(
    "s3a://datasets/users.csv",
    header=True,
    inferSchema=True
)
cleaned_df = users_df.select(
    "user_id",
    "email",
    "phone_number",
    "registration_date",
    "last_login_date"
).dropDuplicates(["user_id"])

cleaned_df.show(truncate=False)

25/08/05 13:37:52 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


[Stage 2:>                                                          (0 + 1) / 1]

+-------+------------------------------+------------------+--------------------------+--------------------------+
|user_id|email                         |phone_number      |registration_date         |last_login_date           |
+-------+------------------------------+------------------+--------------------------+--------------------------+
|1      |johnsonjoshua@example.org     |+7 (939) 556-87-64|2023-10-10 18:40:32.720932|2024-03-11 03:37:48.321355|
|2      |jillrhodes@.miller.com        |947-57-66         |2023-05-22 10:09:50.223086|2024-12-08 17:18:27.988359|
|3      |williamjohnson@baldwin.net    |9312903351        |2023-01-20 13:39:12.144614|2023-03-07 18:44:03.436104|
|4      |lrobinson@example.com         |79573896693       |2024-09-23 18:59:08.298061|2024-11-19 15:48:39.948513|
|5      |blakeerik@example.com         |+79383642499      |2024-07-26 18:12:56.923251|2024-12-28 02:26:11.991295|
|6      |joshua35@example.org          |+79782446455      |2023-06-30 07:45:25.755943|20

                                                                                

In [3]:
# Очистка email
cleaned_df = cleaned_df.withColumn(
    "clean_email",
    lower(col("email"))
)

cleaned_df.select("email", "clean_email").show(truncate=False)

+------------------------------+------------------------------+
|email                         |clean_email                   |
+------------------------------+------------------------------+
|johnsonjoshua@example.org     |johnsonjoshua@example.org     |
|jillrhodes@.miller.com        |jillrhodes@.miller.com        |
|williamjohnson@baldwin.net    |williamjohnson@baldwin.net    |
|lrobinson@example.com         |lrobinson@example.com         |
|blakeerik@example.com         |blakeerik@example.com         |
|joshua35@example.org          |joshua35@example.org          |
|jamesmichael@MUNOZ-ROMAN.INFO |jamesmichael@munoz-roman.info |
|@martinez.com                 |@martinez.com                 |
| barbara10@.COM               | barbara10@.com               |
|kendragalloway@example.org    |kendragalloway@example.org    |
|jamesshawn@example.com        |jamesshawn@example.com        |
|mitchellclark@example.com     |mitchellclark@example.com     |
|lynchgeorge                   |lynchgeo

In [4]:
# Добавление is_valid_email
email_regex = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
cleaned_df = cleaned_df.withColumn(
    "is_valid_email",
    col("clean_email").rlike(email_regex)
)

cleaned_df.select("clean_email", "is_valid_email").show(truncate=False)

+------------------------------+--------------+
|clean_email                   |is_valid_email|
+------------------------------+--------------+
|johnsonjoshua@example.org     |true          |
|jillrhodes@.miller.com        |true          |
|williamjohnson@baldwin.net    |false         |
|lrobinson@example.com         |true          |
|blakeerik@example.com         |true          |
|joshua35@example.org          |true          |
|jamesmichael@munoz-roman.info |true          |
|@martinez.com                 |false         |
| barbara10@.com               |false         |
|kendragalloway@example.org    |true          |
|jamesshawn@example.com        |true          |
|mitchellclark@example.com     |true          |
|lynchgeorge                   |false         |
|frankgray@example.net         |true          |
|gabriellecameron@example.org  |true          |
|lydiatrujillo@adams-clark.org |false         |
|jason76@example.net           |true          |
|ithomas@mcgee.com             |false   

In [5]:
# Удаление всех нецифровых символов
cleaned_df = cleaned_df.withColumn(
    "clean_phone",
    regexp_replace(col("phone_number"), "[^0-9]", "")
)

cleaned_df.select("phone_number", "clean_phone").show(truncate=False)

+------------------+-----------+
|phone_number      |clean_phone|
+------------------+-----------+
|+7 (939) 556-87-64|79395568764|
|947-57-66         |9475766    |
|9312903351        |9312903351 |
|79573896693       |79573896693|
|+79383642499      |79383642499|
|+79782446455      |79782446455|
|+79814052834      |79814052834|
|8-943-962-1674    |89439621674|
|79635859883       |79635859883|
|9936794356        |9936794356 |
|+79348542969      |79348542969|
|968-11-23         |9681123    |
|8 (946) 666-46-19 |89466664619|
|null              |null       |
|+79199648751      |79199648751|
|+79422838859      |79422838859|
|+7 (924) 119-66-50|79241196650|
|89589944387       |89589944387|
|+7 (988) 821-72-50|79888217250|
|null              |null       |
+------------------+-----------+
only showing top 20 rows



In [6]:
# Преобразование к международному формату
cleaned_df = cleaned_df.withColumn(
    "clean_phone",
    when(col("clean_phone").like("8%"), concat(lit("+7"), substring(col("clean_phone"), 2, 10)))
    .when(col("clean_phone").like("7%"), concat(lit("+"), col("clean_phone")))
    .when(col("clean_phone").like("9%"), concat(lit("+7"), col("clean_phone")))
    .otherwise(None)
)

cleaned_df.select("clean_phone").show(truncate=False)

+------------+
|clean_phone |
+------------+
|+79395568764|
|+79475766   |
|+79312903351|
|+79573896693|
|+79383642499|
|+79782446455|
|+79814052834|
|+79439621674|
|+79635859883|
|+79936794356|
|+79348542969|
|+79681123   |
|+79466664619|
|null        |
|+79199648751|
|+79422838859|
|+79241196650|
|+79589944387|
|+79888217250|
|null        |
+------------+
only showing top 20 rows



In [7]:
# Проверка корректности телефона
# - должен начинаться с +7
# - длина должна быть 12 символов (включая +7)
cleaned_df = cleaned_df.withColumn(
    "is_valid_phone",
    when(
        (col("clean_phone").like("+7%")) & (length(col("clean_phone")) == 12),
        True
    ).otherwise(False)
)

cleaned_df.select("clean_phone","is_valid_phone").show(truncate=False)

+------------+--------------+
|clean_phone |is_valid_phone|
+------------+--------------+
|+79395568764|true          |
|+79475766   |false         |
|+79312903351|true          |
|+79573896693|true          |
|+79383642499|true          |
|+79782446455|true          |
|+79814052834|true          |
|+79439621674|true          |
|+79635859883|true          |
|+79936794356|true          |
|+79348542969|true          |
|+79681123   |false         |
|+79466664619|true          |
|null        |false         |
|+79199648751|true          |
|+79422838859|true          |
|+79241196650|true          |
|+79589944387|true          |
|+79888217250|true          |
|null        |false         |
+------------+--------------+
only showing top 20 rows



In [8]:
# Добавить итоговый флаг валидности контактных данных
cleaned_df = cleaned_df.withColumn(
    "is_valid_contact",
    when(
        (col("is_valid_email") == True) & (col("is_valid_phone") == True),
        True
    ).otherwise(False)
)

cleaned_df.select("is_valid_email","is_valid_phone","is_valid_contact").show(truncate=False)

+--------------+--------------+----------------+
|is_valid_email|is_valid_phone|is_valid_contact|
+--------------+--------------+----------------+
|true          |true          |true            |
|true          |false         |false           |
|false         |true          |false           |
|true          |true          |true            |
|true          |true          |true            |
|true          |true          |true            |
|true          |true          |true            |
|false         |true          |false           |
|false         |true          |false           |
|true          |true          |true            |
|true          |true          |true            |
|true          |false         |false           |
|false         |true          |false           |
|true          |false         |false           |
|true          |true          |true            |
|false         |true          |false           |
|true          |true          |true            |
|false         |true

In [9]:
# Проверки корректности данных
wrong_email_format = cleaned_df.filter(
    col("is_valid_email") &
    (~col("clean_email").rlike("^[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$"))
).count()

wrong_phone_format = cleaned_df.filter(
    col("is_valid_phone") &
    (~col("clean_phone").startswith("+7") | (length(col("clean_phone")) != 12))
).count()

invalid_contact_flag = cleaned_df.filter(
    col("is_valid_contact") &
    (~col("is_valid_email") | ~col("is_valid_phone"))
).count()

In [10]:
# Проверяем результаты
assert wrong_email_format == 0, "Найдены некорректно очищенные email"
assert wrong_phone_format == 0, "Найдены некорректно очищенные телефоны"
assert invalid_contact_flag == 0, "Найдены ошибки в is_valid_contact"

In [11]:
users_df.select("email","phone_number").show(truncate=False)
cleaned_df.select("clean_email","clean_phone","is_valid_email","is_valid_phone","is_valid_contact").show(truncate=False)
cleaned_df.printSchema()
cleaned_df.select("clean_email", "clean_phone").where(col("is_valid_contact") == True).show(truncate=False)

+------------------------------+------------------+
|email                         |phone_number      |
+------------------------------+------------------+
|johnsonjoshua@example.org     |+7 (939) 556-87-64|
|jillrhodes@.miller.com        |947-57-66         |
|williamjohnson@baldwin.net    |9312903351        |
|lrobinson@example.com         |79573896693       |
|blakeerik@example.com         |+79383642499      |
|joshua35@example.org          |+79782446455      |
|jamesmichael@MUNOZ-ROMAN.INFO |+79814052834      |
|@martinez.com                 |8-943-962-1674    |
| barbara10@.COM               |79635859883       |
|kendragalloway@example.org    |9936794356        |
|jamesshawn@example.com        |+79348542969      |
|mitchellclark@example.com     |968-11-23         |
|lynchgeorge                   |8 (946) 666-46-19 |
|frankgray@example.net         |null              |
|gabriellecameron@example.org  |+79199648751      |
|lydiatrujillo@adams-clark.org |+79422838859      |
|jason76@exa

In [12]:
spark.stop()