In [3]:
import os

from pyspark.sql import SparkSession

# Connection settings for the MinIO (S3-compatible) store and the local JAR
# directory. Each value can be overridden through an environment variable so
# that real credentials and machine-specific paths do not have to be saved in
# the notebook; the defaults reproduce the values previously hard-coded here.
SPARK_JARS_DIR = os.environ.get("SPARK_JARS_DIR", "/home/permyakoff/pyspark_env/spark-jars")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://host:9100")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "login")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password")

# Keep the SSL flag in step with the endpoint scheme, so pointing
# MINIO_ENDPOINT at an https:// URL does not leave the two contradicting
# each other. With the default http:// endpoint this is "false", as before.
MINIO_USE_SSL = "true" if MINIO_ENDPOINT.lower().startswith("https://") else "false"

# Comma-separated list of JAR paths passed to Spark through `spark.jars`.
SPARK_JARS = ",".join(
    os.path.join(SPARK_JARS_DIR, jar_name)
    for jar_name in (
        "hadoop-aws-3.3.4.jar",
        "aws-java-sdk-bundle-1.12.262.jar",
    )
)

spark = (
    SparkSession.builder
    .appName("MinIO")
    .master("local[*]")

    # paths to the JARs
    .config("spark.jars", SPARK_JARS)

    # MinIO (S3A)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", MINIO_USE_SSL)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
    )

    # Reuses an already running session if one exists in this kernel.
    .getOrCreate()
)

In [4]:
# Ask the JVM behind the Spark session which Hadoop version it runs with.
hadoop_version = spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()
print(hadoop_version)


3.3.4


In [5]:
# Read the contents of the `wordstat` bucket as plain text lines and print
# the first rows (at most 5) to check that the S3A connection works.
(
    spark.read
    .text("s3a://wordstat/")
    .show(5)
)

26/01/19 11:42:28 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+--------------------+
|               value|
+--------------------+
|{"requestPhrase":...|
|{"requestPhrase":...|
+--------------------+



In [17]:
# Parse the JSON object from the `wordstat` bucket into a DataFrame, then
# print its rows without truncating long column values.
df = spark.read.json(
    "s3a://wordstat/mos_ru_dynamics__2026_01_05.json"
)
df.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+
|dynamics                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [26]:
from pyspark.sql.functions import explode, col

# Unpack the `dynamics` list into separate rows: every element of the list
# becomes its own row, exposed in the new column "d".
df_exploded = df.withColumn(
    "d",
    explode("dynamics"),
)

In [27]:
# Map each field of the exploded struct column "d" to the column title used
# in the resulting table; insertion order defines the column order.
column_titles = {
    "d.id": "количество",
    "d.date": "дата",
    "d.value": "доля",
}

# Flatten the struct into a plain three-column table and print it in full.
df_table = df_exploded.select(
    *[col(field).alias(title) for field, title in column_titles.items()]
)

df_table.show(truncate=False)

+----------+----------+--------------------+
|количество|дата      |доля                |
+----------+----------+--------------------+
|2167201   |2025-01-01|0.018262828060562616|
|2409113   |2025-02-01|0.02111202969283592 |
|2505235   |2025-03-01|0.02090259738612807 |
|2255646   |2025-04-01|0.019796875974182088|
|2032644   |2025-05-01|0.019040361608368556|
|2586177   |2025-06-01|0.02496476296189113 |
|2013103   |2025-07-01|0.019013607368042106|
|2174759   |2025-08-01|0.020894658032161985|
|3395272   |2025-09-01|0.02973035952361877 |
|2904798   |2025-10-01|0.024608724099023132|
|2439294   |2025-11-01|0.021029692754908083|
|2525126   |2025-12-01|0.022126720182650526|
+----------+----------+--------------------+

