In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pyspark.sql as sql
import os
# Ask PySpark to launch with the spark-xml package (the 'xml' format is used when loading the posts).
# NOTE(review): presumably this must be set before the SparkContext is first created — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.14.0 pyspark-shell'
# Application name and a local master for this lab.
conf = SparkConf().setAppName("lab2_2").setMaster('local')
# Reuse an already running SparkContext if there is one, otherwise create it from conf.
sc = SparkContext.getOrCreate(conf=conf)
# Wrap the context in a SparkSession to get access to the DataFrame readers.
spark = SparkSession(sc)

In [3]:
# Language list: CSV read with default reader options, so the columns get generated names.
languages = spark.read.format('csv').load("programming-languages.csv")
# Posts sample: every <row> element of the XML file becomes one DataFrame row.
xml_reader = spark.read.format('xml').options(rowTag='row')
posts_sample = xml_reader.load("posts_sample.xml")

In [5]:
# Show the column names and types of the languages table.
languages.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



In [6]:
# Show the column names and types inferred for the posts sample.
posts_sample.printSchema()

root
 |-- _AcceptedAnswerId: long (nullable = true)
 |-- _AnswerCount: long (nullable = true)
 |-- _Body: string (nullable = true)
 |-- _ClosedDate: timestamp (nullable = true)
 |-- _CommentCount: long (nullable = true)
 |-- _CommunityOwnedDate: timestamp (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _FavoriteCount: long (nullable = true)
 |-- _Id: long (nullable = true)
 |-- _LastActivityDate: timestamp (nullable = true)
 |-- _LastEditDate: timestamp (nullable = true)
 |-- _LastEditorDisplayName: string (nullable = true)
 |-- _LastEditorUserId: long (nullable = true)
 |-- _OwnerDisplayName: string (nullable = true)
 |-- _OwnerUserId: long (nullable = true)
 |-- _ParentId: long (nullable = true)
 |-- _PostTypeId: long (nullable = true)
 |-- _Score: long (nullable = true)
 |-- _Tags: string (nullable = true)
 |-- _Title: string (nullable = true)
 |-- _ViewCount: long (nullable = true)



In [7]:
# Keep only the post columns the analysis uses: tags, creation date and view count.
posts_sample_need = posts_sample.select('_Tags', '_CreationDate', '_ViewCount')
# Bring the first CSV column to the driver as a list of Row objects.
lang_need = languages.select('_c0').collect()

In [8]:
# Preview the first 10 rows without truncating long cell values.
posts_sample_need.show(10, False)

+------------------------------------------------------+-----------------------+----------+
|_Tags                                                 |_CreationDate          |_ViewCount|
+------------------------------------------------------+-----------------------+----------+
|<c#><floating-point><type-conversion><double><decimal>|2008-08-01 02:42:52.667|42817     |
|<html><css><internet-explorer-7>                      |2008-08-01 03:08:08.62 |18214     |
|null                                                  |2008-08-01 03:17:57.883|null      |
|<c#><.net><datetime>                                  |2008-08-01 04:40:59.743|555183    |
|<c#><datetime><time><datediff><relative-time-span>    |2008-08-01 04:55:37.967|149445    |
|null                                                  |2008-08-01 04:56:41.303|null      |
|<html><browser><timezone><user-agent><timezone-offset>|2008-08-01 05:42:38.903|176405    |
|<.net><math>                                          |2008-08-01 05:59:11.177|

In [9]:
# Peek at the first field of the second collected row.
lang_need[1][0]

'A# .NET'

Функция поиска языков программирования в строке тэгов

In [10]:
def find_language(x):
    """Return the first language from ``lang_need`` that occurs as a tag of the post.

    The match is case-insensitive and requires the whole tag, i.e. the
    lower-cased language name wrapped in ``<`` and ``>`` must appear in the
    lower-cased tag string.

    Args:
        x: A row-like object exposing a ``_Tags`` attribute (tag string such
            as ``"<c#><.net>"`` or ``None``).

    Returns:
        The language name exactly as stored in ``lang_need`` (first match in
        list order), or ``None`` when the post has no tags or none of the
        languages matches.
    """
    if x._Tags is None:
        return None
    # Lower-case the tag string once instead of once per candidate language.
    tags = str(x._Tags).lower()
    for lang_row in lang_need:
        name = lang_row[0]
        if "<" + str(name).lower() + ">" in tags:
            return name
    return None

Функция вычленения года из полной даты

In [11]:
def get_year(x):
    """Return the year part of a post's creation date as a string.

    Args:
        x: A row-like object exposing a ``_CreationDate`` attribute.

    Returns:
        The text before the first ``-`` of ``str(x._CreationDate)`` (the year
        for a ``YYYY-MM-DD ...`` style value), or ``None`` when the creation
        date is missing.
    """
    created = x._CreationDate
    if created is None:
        # Without this guard str(None) would produce the bogus year 'None'.
        return None
    return str(created).split('-')[0]

Для каждой записи проводим поиск по тэгам на предмет наличия тэга с языками программирования и для каждой записи форматируем дату

In [12]:
# Turn every post into a (language, year, view count) triple, then name the columns.
language_year_views = posts_sample_need.rdd.map(
    lambda row: (find_language(row), get_year(row), row[2])
)
posts = language_year_views.toDF(["Language", "Year", "Count"])

In [13]:
# Preview up to 100 (Language, Year, Count) rows without truncation.
posts.show(100, False)

+--------+----+------+
|Language|Year|Count |
+--------+----+------+
|null    |2008|42817 |
|null    |2008|18214 |
|null    |2008|null  |
|null    |2008|555183|
|null    |2008|149445|
|null    |2008|null  |
|null    |2008|176405|
|null    |2008|123231|
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|3650  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|617   |
|null    |2010|1315  |
|null    |2010|973   |
|Java    |2010|132   |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|null  |
|null    |2010|419   |
|null    |2010|null  |
|null    |2

Нужны только записи, где в тегах был язык программирования и count определен

In [21]:
# Keep only posts that have both a recognised language and a view count.
keep_condition = 'Language is not null and Count is not null'
posts_filter = posts.where(keep_condition)

In [22]:
# Preview up to 100 of the filtered rows without truncation.
posts_filter.show(100, False)

+------------+----+-----+
|Language    |Year|Count|
+------------+----+-----+
|Java        |2010|132  |
|PHP         |2010|1258 |
|Ruby        |2010|9649 |
|C           |2010|2384 |
|PHP         |2010|1987 |
|Python      |2010|3321 |
|JavaScript  |2010|128  |
|AppleScript |2010|477  |
|PHP         |2010|1748 |
|PHP         |2010|998  |
|JavaScript  |2010|2095 |
|Sed         |2010|447  |
|Python      |2010|6558 |
|Java        |2010|214  |
|Objective-C |2010|852  |
|JavaScript  |2010|179  |
|R           |2010|6709 |
|PHP         |2010|78   |
|JavaScript  |2010|1280 |
|PHP         |2010|205  |
|C           |2010|261  |
|C           |2010|270  |
|C           |2011|168  |
|Objective-C |2011|899  |
|C           |2011|1611 |
|JavaScript  |2011|1216 |
|Java        |2011|1713 |
|C           |2011|245  |
|PHP         |2011|352  |
|PHP         |2011|85   |
|PHP         |2011|133  |
|Ruby        |2011|57   |
|JavaScript  |2011|1198 |
|ColdFusion  |2011|47   |
|Java        |2011|2130 |
|Java       

Складываем count записей по тэгу (ЯП) и сортируем записи от самых многочисленных до самых малочисленных

In [42]:
# NOTE(review): datetime is not used anywhere in this cell.
from datetime import datetime

# Maps each year to a DataFrame of (language, summed count), largest sum first.
final_result = {}
for year in range(2010, 2020):
    # Rows that belong to the current year only.
    year_rows = posts_filter.filter('Year = ' + str(year))
    # (language, count) pairs with the counts added up per language.
    language_totals = year_rows.rdd.map(lambda row: (row[0], row[2]))
    language_totals = language_totals.reduceByKey(lambda left, right: left + right)
    # Order by the summed count, descending; toDF() yields the default columns _1 and _2.
    ranked = language_totals.sortBy(lambda pair: pair[1], ascending=False)
    final_result[year] = ranked.toDF()

In [50]:
# Show the first five rows of the 2010 aggregate (sorted by summed count, descending).
final_result[2010].show(5)

+-----------+-------+
|         _1|     _2|
+-----------+-------+
|        PHP|1184584|
|       Java| 563211|
| JavaScript| 316131|
|Objective-C|  97009|
|       Ruby|  76001|
+-----------+-------+
only showing top 5 rows



Делаем отчет для каждого года из диапазона

In [54]:
from pyspark.sql.functions import col

for year in range(2010, 2020):
    # Build the report as a separate frame instead of overwriting final_result[year]:
    # the cell can then be re-run safely, because the _1/_2 source columns are kept.
    report = final_result[year].select(
        col("_1").alias("Language"),
        col("_2").alias(f"Count_{year}"),
    ).limit(10)  # top 10 languages of the year
    report.show()
    # Overwrite so a re-run does not fail on output directories that already exist.
    report.write.format("parquet").mode("overwrite").save(f"{year}")

+-----------+----------+
|   Language|Count_2010|
+-----------+----------+
|        PHP|   1184584|
|       Java|    563211|
| JavaScript|    316131|
|Objective-C|     97009|
|       Ruby|     76001|
|          C|     66587|
|     Python|     59392|
|     MATLAB|     51865|
|AppleScript|     32305|
|     Delphi|     11817|
+-----------+----------+

+-----------+----------+
|   Language|Count_2011|
+-----------+----------+
| JavaScript|    806948|
|       Java|    388524|
|        PHP|    243646|
|          C|    238277|
|Objective-C|    218762|
|     Python|    195016|
|       Bash|     60805|
|       Ruby|     33037|
|       Perl|     24465|
|     MATLAB|     18816|
+-----------+----------+

+-----------+----------+
|   Language|Count_2012|
+-----------+----------+
|       Java|    661770|
| JavaScript|    571343|
|        PHP|    414479|
|     Python|    266658|
|       Ruby|    101824|
|Objective-C|     94438|
|          C|     69276|
|          R|     43917|
|      Scala|     24592

In [58]:
# List the default Hadoop FS directory to check that the per-year output directories exist.
!hadoop fs -ls

Found 32 items
drwxr-xr-x   - vylerinna vylerinna          0 2023-12-17 22:15 .sparkStaging
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:08 2010
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:08 2011
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:08 2012
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:09 2013
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:09 2014
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:09 2015
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:09 2016
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:09 2017
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:09 2018
drwxr-xr-x   - vylerinna vylerinna          2 2023-12-17 23:09 2019
drwxr-xr-x   - vylerinna vylerinna          2 2023-11-20 19:23 data
drwxr-xr-x   - vylerinna vylerinna          0 2023-11-20 19:26 labs
-rwxr-xr-x   3 vylerinna vylerinna   79500408 2023-11-20 19:56 nyctaxi.csv
-rwxr-xr-x   3 vy

In [69]:
# Copy the "2019" output directory from the Hadoop file system into the local working directory.
!hadoop fs -get "2019"  .