1. Напишите программу, которая находит самое длинное слово

In [73]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import re
from collections import defaultdict
from operator import add

conf = SparkConf().setAppName('LongestWord')
sc = SparkContext.getOrCreate(conf)
spark = SparkSession(sc)

wiki_rdd = sc.textFile("wiki.txt")

def longest_word_in_line(line):
    try:
        fields = line.split('\t')  
        if len(fields) >= 3: 
            text = fields[2]  
            words = re.split(r'\W+', text)  
            longest_word = max((word for word in words if '.' not in word and '/' not in word), key=len)
            return (len(longest_word), longest_word)
        return (0, '') 
    except Exception as e:
        print(f"Error processing line: {line}")
        print(f"Error was: {e}")
        return (0, '')

longest_word = wiki_rdd.map(longest_word_in_line).reduce(lambda a, b: a if a[0] > b[0] else b)
print(f'The longest word: {longest_word[1]}')

The longest word: Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz


2. Напишите программу, которая находит среднюю длину слов.

In [74]:
def word_lengths(line):
    fields = line.split('\t')
    if len(fields) >= 3:
        text = fields[2]
        words = re.split(r'\W+', text)
        return [(len(word), 1) for word in words if '.' not in word and '/' not in word]
    return [(0, 0)]

word_lengths_rdd = wiki_rdd.flatMap(word_lengths)
total_length, total_words = word_lengths_rdd.reduce(lambda a, b: (a[0] + b[0], a[1] + b[1]))
average_length = total_length / total_words
print(f'The average length of words: {average_length}')

The average length of words: 6.065173717290537


3. Напишите программу, которая находит самое частоупотребляемое слово, состоящее из латинских букв.

In [75]:
def is_latin(word):
    return bool(re.match('^[a-zA-Z]+$', word))

def word_counts(line):
    fields = line.split('\t')
    if len(fields) >= 3:
        text = fields[2]
        words = re.split(r'\W+', text)
        return [(word.lower(), 1) for word in words if is_latin(word) and len(word) > 0 and not any(char in word for char in ['I', 'V', 'X', 'L', 'C', 'D', 'M'])]
    return []

word_counts_rdd = wiki_rdd.flatMap(word_counts)
word_counts_reduced = word_counts_rdd.reduceByKey(lambda a, b: a + b)
most_common_word = word_counts_reduced.max(lambda x: x[1])
print(f'The most common Latin word: {most_common_word[0]}')


The most common Latin word: the


4. Все слова, которые более чем в половине случаев начинаются с большой буквы и встречаются больше 10 раз.

In [76]:
def find_words(line):
    word_counts = defaultdict(int)
    word_starts_with_uppercase = defaultdict(int)

    fields = line.split('\t')
    if len(fields) >= 3:
        text = fields[2]
        words = re.findall(r'\b\w+\b', text)
        for word in words:
            word_counts[word.lower()] += 1
            if word[0].isupper():
                word_starts_with_uppercase[word.lower()] += 1

    result = []
    for word, count in word_counts.items():
        if count > 10 and word_starts_with_uppercase[word] / count > 0.5:
            result.append((word, word_starts_with_uppercase[word], count))

    return result

matching_words = wiki_rdd.flatMap(find_words).map(lambda x: (x[0], (x[1], x[2])))
matching_words = matching_words.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

for word, (starts_with_uppercase_count, total_count) in matching_words.collect():
    print(f'{word}, Uppercase: {starts_with_uppercase_count}, Total: {total_count}')


после, Uppercase: 584, Total: 985
русь, Uppercase: 94, Total: 100
олег, Uppercase: 38, Total: 38
пётр, Uppercase: 329, Total: 329
i, Uppercase: 1345, Total: 1357
эти, Uppercase: 6, Total: 11
нэпа, Uppercase: 17, Total: 17
временное, Uppercase: 29, Total: 30
п, Uppercase: 441, Total: 457
московского, Uppercase: 39, Total: 51
а, Uppercase: 1810, Total: 2668
советов, Uppercase: 140, Total: 147
сибири, Uppercase: 112, Total: 112
столыпина, Uppercase: 81, Total: 81
столыпин, Uppercase: 81, Total: 81
союз, Uppercase: 254, Total: 254
сср, Uppercase: 237, Total: 237
совета, Uppercase: 249, Total: 264
содружества, Uppercase: 64, Total: 67
германией, Uppercase: 16, Total: 16
великобритании, Uppercase: 166, Total: 166
гитлер, Uppercase: 110, Total: 110
горбачёва, Uppercase: 46, Total: 46
горбачёв, Uppercase: 79, Total: 79
азербайджанской, Uppercase: 39, Total: 49
нкао, Uppercase: 33, Total: 33
санкт, Uppercase: 328, Total: 328
петербурге, Uppercase: 49, Total: 49
тэц, Uppercase: 36, Total: 36
дво

5. Напишите программу, которая с помощью статистики определяет устойчивые сокращения вида пр., др., ...

In [77]:
def find_abbreviations(line):
    abbreviations = defaultdict(int)

    abbreviation_pattern = r'\b\w+\.'

    found_abbreviations = re.findall(abbreviation_pattern, line)

    for abbreviation in found_abbreviations:
       
        if not any(char.isdigit() for char in abbreviation):
            abbreviations[abbreviation] += 1

    return abbreviations.items()

abbreviation_counts = wiki_rdd.flatMap(find_abbreviations).reduceByKey(lambda a, b: a + b)
merged_abbreviations = abbreviation_counts.collectAsMap()
filtered_abbreviations = [(abbreviation, count) for abbreviation, count in merged_abbreviations.items() if len(abbreviation) <= 3]

for abbreviation, count in filtered_abbreviations:
    print(f"{abbreviation}: {count} times")

ru.: 12644 times
г.: 9937 times
см.: 1650 times
т.: 3258 times
п.: 1021 times
н.: 4704 times
э.: 4141 times
др.: 2430 times
юг.: 52 times
гг.: 2350 times
И.: 3489 times
А.: 5643 times
Г.: 1554 times
Б.: 1016 times
ЦК.: 25 times
пр.: 230 times
и.: 88 times
З.: 83 times
ТВ.: 16 times
Од.: 4 times
я.: 14 times
её.: 127 times
По.: 11 times
кн.: 51 times
fr.: 2 times
ЕП.: 1 times
Oc.: 1 times
K.: 94 times
Св.: 199 times
у.: 22 times
га.: 213 times
A.: 188 times
ю.: 82 times
е.: 386 times
P.: 71 times
O.: 23 times
Вл.: 16 times
Гг.: 1 times
ж.: 26 times
co.: 3 times
нм.: 45 times
уд.: 1 times
R.: 100 times
эВ.: 11 times
n.: 24 times
мч.: 1 times
рт.: 34 times
СИ.: 35 times
Zn.: 2 times
рН.: 3 times
эп.: 3 times
Ис.: 3 times
μ.: 4 times
AB.: 6 times
v.: 24 times
p.: 21 times
Ц.: 13 times
pd.: 1 times
Эй.: 2 times
ua.: 3 times
f.: 12 times
Gr.: 1 times
Rs.: 1 times
Љ.: 2 times
os.: 3 times
c.: 30 times
CD.: 12 times
Си.: 23 times
x.: 19 times
ІУ.: 1 times
Ae.: 1 times
BD.: 3 times
El.: 1 times

6. Напишите программу, которая с помощью статистики определяет устойчивые сокращения вида т.п., н.э., ...

In [78]:
pattern = re.compile(r'\b[а-яА-Яa-zA-Z]{1,3}\.[а-яА-Яa-zA-Z]{1,3}\.')

def find_abbreviations(text):
    abbreviations = pattern.findall(text)
    return [abbr for abbr in abbreviations if not abbr.startswith('www.')]

def abbreviations_in_line(line):
    fields = line.split('\t')
    if len(fields) >= 3:
        text = fields[2]
        return find_abbreviations(text)
    else:
        return []

abbreviations_rdd = wiki_rdd.flatMap(abbreviations_in_line)
abbreviation_counts = abbreviations_rdd.map(lambda x: (x, 1)).reduceByKey(add)
results = abbreviation_counts.collect()
for abbreviation, count in results:
    print(f'{abbreviation}: {count} times')

н.э.: 82 times
у.и.: 1 times
к.вв.: 1 times
ф.ст.: 1 times
м.д.: 3 times
а.е.: 5 times
М.К.: 4 times
Г.Г.: 1 times
М.Г.: 2 times
Ф.Б.: 1 times
D.R.: 4 times
ср.лат.: 1 times
ед.ч.: 14 times
г.г.: 20 times
т.е.: 88 times
т.к.: 27 times
ст.ст.: 10 times
нв.ст.: 4 times
I.Ae.: 1 times
Б.М.: 1 times
д.н.: 6 times
Д.И.: 5 times
Е.В.: 2 times
d.d.: 1 times
S.T.: 5 times
J.F.: 1 times
Н.А.: 8 times
л.с.: 35 times
В.А.: 3 times
R.U.: 2 times
млн.чел.: 4 times
M.B.: 1 times
Т.е.: 4 times
т.ч.: 17 times
Н.И.: 3 times
Л.Л.: 2 times
н.у.: 2 times
Б.Н.: 1 times
к.х.: 2 times
л.мн.: 3 times
s.a.: 1 times
Н.С.: 3 times
М.X.: 1 times
P.G.: 1 times
M.X.: 2 times
D.O.: 1 times
Г.О.: 1 times
R.An.: 1 times
fom.ru.: 1 times
И.А.: 3 times
com.ru.: 1 times
F.E.: 1 times
N.W.: 1 times
э.д.: 11 times
д.п.: 2 times
млн.руб.: 17 times
Н.В.: 5 times
W.A.: 2 times
G.B.: 1 times
R.B.: 1 times
Nb.Fz.: 1 times
C.G.: 4 times
рт.ст.: 9 times
тыс.руб.: 4 times
А.З.: 1 times
Н.Э.: 2 times
M.U.: 1 times
г.А.: 1 times
ед.