In [None]:
import os
import sys
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

Настройка переменных окружения

In [None]:
# Use the Python interpreter that runs this notebook for PySpark.
os.environ['PYSPARK_PYTHON'] = sys.executable
# Use the same interpreter for the Spark driver.
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
# Attach the spark-xml package (XML support in Spark) and launch PySpark in interactive (shell) mode.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.17.0 pyspark-shell'

In [None]:
# Obtain the Spark session for this notebook (reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName("L2")
    .getOrCreate()
)

In [None]:
if not os.path.exists('posts_sample.xml'):
    !wget https://git.ai.ssau.ru/tk/big_data/src/branch/bachelor/data/posts_sample.xml

posts_sample_data = spark.read.format('xml') \
    .option('rowTag', 'row') \
    .option("timestampFormat", 'yyyy/MM/dd HH:mm:ss') \
    .load('posts_sample.xml')

In [None]:
# Print the schema and a five-row preview, or report that nothing was loaded.
if not posts_sample_data.isEmpty():
    posts_sample_data.printSchema()
    posts_sample_data.show(5)
else:
    print("DataFrame не содержит данных")

root
 |-- _AcceptedAnswerId: long (nullable = true)
 |-- _AnswerCount: long (nullable = true)
 |-- _Body: string (nullable = true)
 |-- _ClosedDate: timestamp (nullable = true)
 |-- _CommentCount: long (nullable = true)
 |-- _CommunityOwnedDate: timestamp (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _FavoriteCount: long (nullable = true)
 |-- _Id: long (nullable = true)
 |-- _LastActivityDate: timestamp (nullable = true)
 |-- _LastEditDate: timestamp (nullable = true)
 |-- _LastEditorDisplayName: string (nullable = true)
 |-- _LastEditorUserId: long (nullable = true)
 |-- _OwnerDisplayName: string (nullable = true)
 |-- _OwnerUserId: long (nullable = true)
 |-- _ParentId: long (nullable = true)
 |-- _PostTypeId: long (nullable = true)
 |-- _Score: long (nullable = true)
 |-- _Tags: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: long (nullable = true)

+-----------------+------------+--------------------+-----------+------------

In [None]:
if not os.path.exists('programming-languages.csv'):
  !wget https://git.ai.ssau.ru/tk/big_data/src/branch/bachelor/data/programming-languages.csv

languages_data = spark.read.format('csv') \
    .option('header', 'true') \
    .option("inferSchema", True) \
    .load('programming-languages.csv') \
    .dropna()

In [None]:
# Print the schema and a five-row preview, or report that nothing was loaded.
if not languages_data.isEmpty():
    languages_data.printSchema()
    languages_data.show(5)
else:
    print("DataFrame не содержит данных")

root
 |-- name: string (nullable = true)
 |-- wikipedia_url: string (nullable = true)

+----------+--------------------+
|      name|       wikipedia_url|
+----------+--------------------+
|   A# .NET|https://en.wikipe...|
|A# (Axiom)|https://en.wikipe...|
|A-0 System|https://en.wikipe...|
|        A+|https://en.wikipe...|
|       A++|https://en.wikipe...|
+----------+--------------------+
only showing top 5 rows



In [None]:
# Inclusive reporting window, expressed as calendar dates.
start_date, end_date = "2010-01-01", "2020-12-31"

# Compare on the calendar date, not on the raw timestamp. When a timestamp
# column is compared with the string "2020-12-31", the bound is coerced to
# 2020-12-31 00:00:00, so posts created later on the final day would be
# silently excluded. Casting to date keeps the whole last day in the window.
# Rows with a null _CreationDate are still filtered out (null comparison).
posts_filtered = posts_sample_data.filter(
    col("_CreationDate").cast("date").between(start_date, end_date)
)

posts_filtered.show(10)

+-----------------+------------+--------------------+-----------+-------------+--------------------+--------------------+--------------+-------+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+-----+------+----------+
|_AcceptedAnswerId|_AnswerCount|               _Body|_ClosedDate|_CommentCount| _CommunityOwnedDate|       _CreationDate|_FavoriteCount|    _Id|   _LastActivityDate|       _LastEditDate|_LastEditorDisplayName|_LastEditorUserId|_OwnerDisplayName|_OwnerUserId|_ParentId|_PostTypeId|_Score|_Tags|_Title|_ViewCount|
+-----------------+------------+--------------------+-----------+-------------+--------------------+--------------------+--------------+-------+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+-----+------+----------+
|             NULL|        NULL|<p>No. (And more ...|       NULL

In [None]:
# Bring the language names to the driver as a plain Python list;
# includes_name reads this list when matching post tags.
language_names = [
    language_row['name']
    for language_row in languages_data.collect()
]
print(language_names)

def includes_name(row):
    """Pair a post's creation date with the first known language found in its tags.

    Args:
        row: object indexable by "_CreationDate" and "_Tags".

    Returns:
        A tuple (creation_date, language_name). The language is the first entry
        of the global ``language_names`` list whose lower-cased form appears in
        the lower-cased tags as ``<name>``. When nothing matches (or the tags
        are empty/missing) the second element is the string 'None', not the
        None object.
    """
    created_at = row["_CreationDate"]
    raw_tags = row["_Tags"]

    if raw_tags:
        normalized_tags = str(raw_tags).lower()
    else:
        normalized_tags = ""

    # Order of language_names decides which language wins for multi-tag posts.
    matched_language = next(
        (
            candidate
            for candidate in language_names
            if f"<{candidate.lower()}>" in normalized_tags
        ),
        'None',
    )
    return (created_at, matched_language)

['A# .NET', 'A# (Axiom)', 'A-0 System', 'A+', 'A++', 'ABAP', 'ABC', 'ABC ALGOL', 'ABSET', 'ABSYS', 'ACC', 'Accent', 'Ace DASL', 'ACL2', 'ACT-III', 'Action!', 'ActionScript', 'Ada', 'Adenine', 'Agda', 'Agilent VEE', 'Agora', 'AIMMS', 'Alef', 'ALF', 'ALGOL 58', 'ALGOL 60', 'ALGOL 68', 'ALGOL W', 'Alice', 'Alma-0', 'AmbientTalk', 'Amiga E', 'AMOS', 'AMPL', 'Apex (Salesforce.com)', 'APL', "App Inventor for Android's visual block language", 'AppleScript', 'Arc', 'ARexx', 'Argus', 'AspectJ', 'Assembly language', 'ATS', 'Ateji PX', 'AutoHotkey', 'Autocoder', 'AutoIt', 'AutoLISP / Visual LISP', 'Averest', 'AWK', 'Axum', 'B', 'Babbage', 'Bash', 'BASIC', 'bc', 'BCPL', 'BeanShell', 'Batch (Windows/Dos)', 'Bertrand', 'BETA', 'Bigwig', 'Bistro', 'BitC', 'BLISS', 'Blockly', 'BlooP', 'Blue', 'Boo', 'Boomerang', 'Bourne shell (including', 'bash and', 'ksh )', 'BREW', 'BPEL', 'C', 'C--', 'C++ – ISO/IEC 14882', 'C# – ISO/IEC 23270', 'C/AL', 'Caché ObjectScript', 'C Shell', 'Caml', 'Cayenne', 'CDuce', 'C

In [None]:
# Number of languages kept for each year.
TOP_N = 10

# Tag every post with (creation timestamp, language) and drop the posts
# whose tags matched no known language ('None' is includes_name's sentinel).
posts_filtered_rdd = posts_filtered.rdd \
    .map(includes_name) \
    .filter(lambda x: x[1] != 'None')

# Count posts per (year, language). The sort key is (-count, (year, language)):
# highest counts first, with ties broken by the key so that the ranking, and
# therefore the top-N cut-off, is the same on every run.
posts_aggregate = posts_filtered_rdd \
    .keyBy(lambda row: (row[0].year, row[1])) \
    .aggregateByKey(0, lambda x, _: x + 1, lambda x1, x2: x1 + x2) \
    .sortBy(lambda x: (-x[1], x[0])) \
    .collect()

years = list(range(2010, 2021))
years_df = []

# posts_aggregate is already ranked, so the first TOP_N entries of a year
# are that year's most frequent languages.
for year in years:
    first_languages = [row for row in posts_aggregate if row[0][0] == year][:TOP_N]
    years_df.extend(first_languages)

row_name = Row('Year', 'Language', 'Count')
# An explicit schema (same types Spark would infer from Python ints/strs)
# keeps createDataFrame working when no post matched any language.
result_df = spark.createDataFrame(
    [row_name(year, lang, count) for ((year, lang), count) in years_df],
    schema="Year bigint, Language string, Count bigint",
)

# Up to len(years) * TOP_N rows (110 here) can be produced, so size the
# display to the actual result instead of a fixed 100 that hides the tail.
result_df.show(max(len(years_df), 1), truncate=False)
result_df.write.mode("overwrite").parquet("result_lang.parquet")

+----+-----------+-----+
|Year|Language   |Count|
+----+-----------+-----+
|2010|Java       |52   |
|2010|JavaScript |44   |
|2010|PHP        |42   |
|2010|Python     |25   |
|2010|Objective-C|23   |
|2010|C          |20   |
|2010|Ruby       |11   |
|2010|Delphi     |7    |
|2010|R          |3    |
|2010|AppleScript|3    |
|2011|PHP        |97   |
|2011|Java       |92   |
|2011|JavaScript |82   |
|2011|Python     |35   |
|2011|Objective-C|33   |
|2011|C          |24   |
|2011|Ruby       |17   |
|2011|Delphi     |8    |
|2011|Perl       |8    |
|2011|Bash       |7    |
|2012|PHP        |136  |
|2012|JavaScript |129  |
|2012|Java       |124  |
|2012|Python     |65   |
|2012|Objective-C|45   |
|2012|C          |27   |
|2012|Ruby       |25   |
|2012|R          |9    |
|2012|Bash       |9    |
|2012|MATLAB     |6    |
|2013|JavaScript |196  |
|2013|Java       |191  |
|2013|PHP        |173  |
|2013|Python     |87   |
|2013|Objective-C|40   |
|2013|C          |36   |
|2013|Ruby       |30   |
