### List comprehension

In [80]:
# Integers 1..10; the trailing expression keeps the even ones and halves them.
# True division is used, so the displayed values are floats.
a = list(range(1, 11))
[value / 2 for value in a if not value % 2]

[1.0, 2.0, 3.0, 4.0, 5.0]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymorphy2

### Создание морфологического анализатора

In [3]:
def pymorphy2_311_hotfix():
    """Monkey-patch pymorphy2's ``BaseAnalyzerUnit._get_param_names``.

    The replacement discovers a unit's constructor parameters with
    ``inspect.getfullargspec``. Judging by the name, this is a compatibility
    shim for Python 3.11, where ``inspect.getargspec`` no longer exists.

    The patch is applied to the class itself, so it affects every analyzer
    unit in the current process. Calling the function more than once is
    harmless: it simply re-installs the same replacement.

    Returns
    -------
    None
    """
    # Imported locally so that merely defining this helper does not require
    # pymorphy2 to be importable.
    from inspect import getfullargspec
    from pymorphy2.units.base import BaseAnalyzerUnit

    def _get_param_names_311(klass):
        # A unit that never overrides __init__ takes no constructor parameters.
        if klass.__init__ is object.__init__:
            return []
        args = getfullargspec(klass.__init__).args
        # args[0] is ``self``; everything after it is a unit parameter.
        return sorted(args[1:])

    # Bind as a classmethod: the replacement then receives the class no matter
    # whether it is looked up on the class or on an instance. Installed as a
    # plain function it would only work through instances, and the
    # ``object.__init__`` guard above could never match.
    BaseAnalyzerUnit._get_param_names = classmethod(_get_param_names_311)

In [14]:
# Apply the compatibility patch first, then build the analyzer.
# `morph` is the notebook-level analyzer instance used by the cells below.
pymorphy2_311_hotfix()
morph = pymorphy2.MorphAnalyzer()

In [15]:
type(morph)

pymorphy2.analyzer.MorphAnalyzer

In [43]:
word = "Питон"
morph.parse(word)

[Parse(word='питон', tag=OpencorporaTag('NOUN,anim,masc sing,nomn'), normal_form='питон', score=1.0, methods_stack=((DictionaryAnalyzer(), 'питон', 52, 0),))]

In [12]:
morph.parse(word)[0]

Parse(word='питон', tag=OpencorporaTag('NOUN,anim,masc sing,nomn'), normal_form='питон', score=1.0, methods_stack=((DictionaryAnalyzer(), 'питон', 52, 0),))

In [13]:
parsed_word = morph.parse(word)[0]
lemma = parsed_word.normal_form

In [14]:
lemma

'питон'

In [15]:
gender = parsed_word.tag.gender
gender

'masc'

In [16]:
number = parsed_word.tag.number
number

'sing'

In [17]:
print("Исходное слово:", word)
print("Лемма:", lemma)
print("Род:", gender)
print("Число:", number)

Исходное слово: Питон
Лемма: питон
Род: masc
Число: sing


In [20]:
# Same morphological breakdown as for the previous word, in a single cell.
word = "Экранное"
# Only the first parse returned by the analyzer is inspected.
parsed_word = morph.parse(word)[0]
lemma = parsed_word.normal_form
gender = parsed_word.tag.gender
number = parsed_word.tag.number
print("Исходное слово:", word)
print("Лемма:", lemma)
print("Род:", gender)
print("Число:", number)

Исходное слово: Экранное
Лемма: экранный
Род: neut
Число: sing


### Создание клиента

In [4]:
from dask.distributed import Client

In [5]:
# Start a local Dask cluster: 4 worker processes with 4 threads each,
# and a 2 GB memory limit.
client = Client(
    n_workers=4,
    threads_per_worker=4,
    processes=True,
    memory_limit='2GB',
)

In [6]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 7.45 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:57931,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 7.45 GiB

0,1
Comm: tcp://127.0.0.1:57950,Total threads: 4
Dashboard: http://127.0.0.1:57954/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:57934,
Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-bd6rubl3,Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-bd6rubl3

0,1
Comm: tcp://127.0.0.1:57952,Total threads: 4
Dashboard: http://127.0.0.1:57958/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:57936,
Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-mqhn2v6j,Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-mqhn2v6j

0,1
Comm: tcp://127.0.0.1:57951,Total threads: 4
Dashboard: http://127.0.0.1:57956/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:57938,
Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-8_exh5ye,Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-8_exh5ye

0,1
Comm: tcp://127.0.0.1:57953,Total threads: 4
Dashboard: http://127.0.0.1:57960/status,Memory: 1.86 GiB
Nanny: tcp://127.0.0.1:57940,
Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-p_bhf3y6,Local directory: C:\Users\D\AppData\Local\Temp\dask-scratch-space\worker-p_bhf3y6


### Dask bag

In [7]:
import dask.bag as db

In [25]:
data = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

In [26]:
type(data)

dask.bag.core.Bag

In [28]:
data.compute()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [32]:
transformed_data = data.map(lambda x: x * 2)
transformed_data.compute()

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

In [33]:
filtered_data = transformed_data.filter(lambda x: x % 4 == 0)
filtered_data.compute()

[4, 8, 12, 16, 20]

In [34]:
total = filtered_data.sum().compute()

In [35]:
total

60

### Частота встречаемости слов

In [36]:
# Write a three-line sample corpus for the word-count example.
# The encoding is pinned explicitly so the file's bytes do not depend on the
# platform's locale default when the file is read back later.
with open("input.txt", "w", encoding="utf-8") as file:
    file.write("Hello world\n")
    file.write("This is a sample input file for MapReduce\n")
    file.write("MapReduce is a powerful data processing model\n")

In [37]:
data = db.read_text('input.txt')

In [38]:
data.compute()

['Hello world\n',
 'This is a sample input file for MapReduce\n',
 'MapReduce is a powerful data processing model\n']

In [40]:
data.str.split().compute()

[['Hello', 'world'],
 ['This', 'is', 'a', 'sample', 'input', 'file', 'for', 'MapReduce'],
 ['MapReduce', 'is', 'a', 'powerful', 'data', 'processing', 'model']]

In [42]:
words = data.str.split().flatten()

In [43]:
words.compute()

['Hello',
 'world',
 'This',
 'is',
 'a',
 'sample',
 'input',
 'file',
 'for',
 'MapReduce',
 'MapReduce',
 'is',
 'a',
 'powerful',
 'data',
 'processing',
 'model']

In [44]:
word_count = words.frequencies()

In [45]:
word_count.compute()

[('Hello', 1),
 ('world', 1),
 ('This', 1),
 ('is', 2),
 ('a', 2),
 ('sample', 1),
 ('input', 1),
 ('file', 1),
 ('for', 1),
 ('MapReduce', 2),
 ('powerful', 1),
 ('data', 1),
 ('processing', 1),
 ('model', 1)]

In [46]:
# The whole word-count pipeline in one cell:
# read lines -> split on whitespace -> flatten to words -> tally.
data = db.read_text('input.txt').str.split()
words = data.flatten()
word_count = words.frequencies()
# Materialise the (word, count) pairs explicitly before building the table.
pd.DataFrame(word_count.compute(), columns=['Word', 'Frequency'])

Unnamed: 0,Word,Frequency
0,Hello,1
1,world,1
2,This,1
3,is,2
4,a,2
5,sample,1
6,input,1
7,file,1
8,for,1
9,MapReduce,2


### Ключевые слова

In [8]:
import string

data = {"URL": "https://example.com/page1",
    "Заголовок": "Новости спорта",
    "Текст": "Сегодня в новостях у нас спорт, спорт и еще раз спорт 4."}

In [53]:
data

{'URL': 'https://example.com/page1',
 'Заголовок': 'Новости спорта',
 'Текст': 'Сегодня в новостях у нас спорт, спорт и еще раз спорт 4.'}

In [54]:
text = data["Заголовок"] + " " + data["Текст"]
text

'Новости спорта Сегодня в новостях у нас спорт, спорт и еще раз спорт 4.'

In [55]:
text.lower()

'новости спорта сегодня в новостях у нас спорт, спорт и еще раз спорт 4.'

In [56]:
text.lower().translate(str.maketrans('', '', string.punctuation)).translate(str.maketrans('', '', string.digits))

'новости спорта сегодня в новостях у нас спорт спорт и еще раз спорт '

In [58]:
text.lower().translate(str.maketrans('', '', string.punctuation)).translate(str.maketrans('', '', string.digits)).split()

['новости',
 'спорта',
 'сегодня',
 'в',
 'новостях',
 'у',
 'нас',
 'спорт',
 'спорт',
 'и',
 'еще',
 'раз',
 'спорт']

In [59]:
words = text.lower().translate(str.maketrans('', '', string.punctuation)).translate(str.maketrans('', '', string.digits)).split()

In [76]:
[morph.parse(word)[0].normal_form for word in words]

['новость',
 'спорт',
 'сегодня',
 'в',
 'новость',
 'у',
 'мы',
 'спорт',
 'спорт',
 'и',
 'ещё',
 'раз',
 'спорт']

In [77]:
lemmatized_words = [morph.parse(word)[0].normal_form for word in words]

In [78]:
[word for word in lemmatized_words if len(word) >= 3]

['новость',
 'спорт',
 'сегодня',
 'новость',
 'спорт',
 'спорт',
 'ещё',
 'раз',
 'спорт']

In [81]:
def process_text(data):
    """Extract lemmatised keywords from one page record.

    Parameters
    ----------
    data : mapping
        Must provide string values under the keys ``"Заголовок"`` (title)
        and ``"Текст"`` (body text).

    Returns
    -------
    list of str
        The ``normal_form`` of the first parse of every token, keeping only
        lemmas of three or more characters. Order of appearance and
        duplicates are preserved.

    Raises
    ------
    KeyError
        If either of the two keys is missing from ``data``.
    """
    import unicodedata

    # The analyzer is built inside the function instead of reusing the
    # notebook-level one -- presumably so the function is self-contained when
    # it is mapped over a Dask bag. TODO confirm.
    pymorphy2_311_hotfix()
    morph = pymorphy2.MorphAnalyzer()

    text = data["Заголовок"] + " " + data["Текст"]

    # string.punctuation is ASCII-only, so quotes, dashes and ellipses such as
    # « » — … would otherwise stay attached to tokens. Drop every character in
    # a Unicode punctuation category as well, in a single pass.
    ascii_noise = set(string.punctuation + string.digits)
    cleaned = "".join(
        ch
        for ch in text.lower()
        if ch not in ascii_noise and not unicodedata.category(ch).startswith("P")
    )

    words = cleaned.split()
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    # Length is checked on the lemma, not on the surface form.
    return [word for word in lemmatized_words if len(word) >= 3]

In [83]:
bag = db.from_sequence([data])
keywords = bag.map(process_text)
all_keywords = keywords.flatten()
keyword_counts = all_keywords.frequencies()
pd.DataFrame(keyword_counts, columns=['Word', 'Frequency']).sort_values(by="Frequency", ascending = False)

Unnamed: 0,Word,Frequency
1,спорт,4
0,новость,2
2,сегодня,1
3,ещё,1
4,раз,1


### Текстовый анализ расшифровки лекции

In [85]:
# Load the lecture transcript as a Dask bag of lines.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook cannot be re-run on another machine without editing it.
data = db.read_text(r"C:\Courses\Innopolis\Расшифровки\Cons_2024-11-11.txt")

In [94]:
words = data.str.split().flatten()
word_count = words.frequencies()
counts = pd.DataFrame(word_count, columns=['Word', 'Frequency']).sort_values(by = 'Frequency', ascending = False)
counts 

Unnamed: 0,Word,Frequency
144,Иннополис,625
143,Университет,625
200,вот,238
385,мы,125
193,у,122
...,...,...
2997,"смысле,",1
2996,увидел,1
2995,списке,1
2994,20:05:57:,1


In [107]:
# Rebuild the transcript as one string directly from the `data` bag.
# Deriving it from `data` (not from `words`) keeps the cell re-runnable:
# `words` is rebound to a plain list on the next line.
# Without this assignment, `text` would be a leftover from an earlier cell.
text = ' '.join(data.str.split().flatten().compute())
# Lower-case, strip ASCII punctuation and digits, split on whitespace.
words = text.lower().translate(str.maketrans('', '', string.punctuation)).translate(str.maketrans('', '', string.digits)).split()
# Short tokens are dropped before lemmatisation here.
long_words = [word for word in words if len(word) >= 3]
lemmatized_words = [morph.parse(word)[0].normal_form for word in long_words]

In [113]:
keywords = db.from_sequence(lemmatized_words)
keyword_counts = keywords.frequencies()
word_counts = pd.DataFrame(keyword_counts, columns=['Word', 'Frequency']).sort_values(by="Frequency", ascending = False)



In [116]:
word_counts.head(50)

Unnamed: 0,Word,Frequency
107,университет,626
108,иннополис,625
142,вот,360
157,это,139
160,что,128
298,мы,123
11,быть,106
73,как,106
113,так,105
164,есть,79


### Homework

In [31]:
import bs4
import requests
import string

url = "http://az.lib.ru/t/tolstoj_lew_nikolaewich/text_0040.shtml"
# Without a timeout the cell can hang indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the text.
response.raise_for_status()
# If the server does not declare a charset, requests may decode the text with
# a default that garbles Cyrillic; fall back to the encoding detected from
# the content in that case.
if 'charset' not in response.headers.get('Content-Type', '').lower():
    response.encoding = response.apparent_encoding
soup = bs4.BeautifulSoup(response.text, 'lxml')
# Visible text of <body>, with text fragments joined by single spaces.
text = soup.body.get_text(' ', strip=True)