In [33]:
import pandas as pd
import random
import os
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import collections
import pymorphy3
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [34]:
import urllib.request
import json
from bs4 import BeautifulSoup
import unicodedata

In [38]:
tqdm.pandas()

In [39]:
pd.options.display.max_rows = 450

In [40]:
file = open("data/stop_words_russian.txt", "r")
stop_words = file.read().splitlines()

In [41]:
morph = pymorphy3.MorphAnalyzer()

In [42]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
set_seed(42)

In [43]:
df = pd.read_csv('datasets/foss/dataset_max_25.csv', index_col=0)
df.shape

(7980, 2)

In [44]:
category_index = {i[1]:i[0] for i in enumerate(df.category.unique())}
reverse_category_index = {i[0]:i[1] for i in enumerate(df.category.unique())}

In [45]:
category_index

{'WEB': 0,
 'SECURITY': 1,
 'INTROS': 2,
 'DATA_SCIENCE': 3,
 'ORG': 4,
 'KnD': 5,
 'OPENING': 6,
 'DEVOPS': 7,
 'GAMES': 8,
 'SPECIAL': 9,
 'USER': 10,
 'EVENTS': 11,
 'DEV': 12,
 'MOBILE': 13,
 'SYSADM': 14,
 'MULTIMEDIA': 15,
 'DATABASES': 16,
 'MESSENGERS': 17,
 'DIY': 18,
 'MANAGEMENT': 19,
 'HARDWARE': 20,
 'MISC': 21,
 'LAW': 22,
 'EDUCATION': 23,
 'HISTORY': 24}

In [46]:
labels = [category_index[i] for i in df.category.values]

In [47]:
def normalize_text(text):
    # приведение к нижнему регистру 
    lower_result = text.lower()
    # удаление url-адресов
    non_links_result = re.sub(r"\S*https?:\S*", "", text)
    # удаление emails
    non_emails_result = re.sub(r"\S*@\S*\s?", "", non_links_result)
    # удаление цифр
    non_numeric_result = ''.join([i for i in non_emails_result if not i.isdigit()])
    # удаление пунктуации и специальных символов
    non_punc_result = ''.join(filter(lambda mark: mark.isalnum() or mark.isspace(), non_numeric_result)).strip()
    # удаление лишних пробелов
    non_space_result = re.sub(r" +", " ", non_punc_result)
    # удаление лишних абзацев
    paragraph_list = non_space_result.split('\n')
    text = '\n'.join([p for p in paragraph_list if not p.count(' ') < 10])
    # удаление стоп-слов и длинных слов
    text_tokens = word_tokenize(text)
    without_stop_word_tokens = [word for word in text_tokens if not word in stop_words]
    without_long_tokens = [word for word in without_stop_word_tokens if len(word) <= 18]
    without_short_tokens = [word for word in without_long_tokens if len(word) >= 2]
    # лемматизация слов
    lemmatize_result = [morph.parse(word)[0].normal_form for word in without_short_tokens]
    filtered_text = (" ").join(lemmatize_result)
    
    return filtered_text  

In [48]:
texts = df['document'].progress_apply(normalize_text)

100%|███████████████████████████████████████| 7980/7980 [11:04<00:00, 12.01it/s]


In [14]:
'''vect = CountVectorizer(stop_words = stop_words, min_df = 3, max_df = 0.6).fit(texts)
vectorized_texts = vect.transform(texts)
vectorized_texts = TfidfTransformer().fit_transform(vectorized_texts)
print("vectorized texts:\n{}".format(repr(vectorized_texts)))'''

vectorized texts:
<7980x45329 sparse matrix of type '<class 'numpy.float64'>'
	with 2427053 stored elements in Compressed Sparse Row format>


In [50]:
vectorizer = TfidfVectorizer(stop_words = stop_words, min_df = 3, max_df = 0.6).fit(texts)
vec_texts = vectorizer.transform(texts)
print("vectorized texts:\n{}".format(repr(vec_texts)))

vectorized texts:
<7980x45329 sparse matrix of type '<class 'numpy.float64'>'
	with 2427053 stored elements in Compressed Sparse Row format>


In [51]:
def checkTopWordsWithClass(vectorized_texts, vect, df, category):
    
    tfidf_df = pd.DataFrame(vectorized_texts.toarray(), index=list(df['category']), columns=vect.get_feature_names_out())
    df_temp = tfidf_df.loc[category]
    
    list1 = list(dict(df_temp.sum(axis=0).nlargest(n=50)).keys())
    
    df_category = df_temp.stack().reset_index()
    
    df_category.columns = ['category', 'token', 'tfidf']
    df_category = df_category.sort_values(by=['tfidf'], ascending=False)
    
    list2 = list(df_category.drop_duplicates(subset=['token']).head(50)['token'])
    
    return set(list1 + list2)

    #tfidf_df = tfidf_df.stack().reset_index()
    #tfidf_df = tfidf_df.rename(columns={0:'tfidf', 'level_0': 'category','level_1': 'token'})
    #df_category = tfidf_df[tfidf_df['category'] == category].sort_values(by=['tfidf'], ascending=False)

In [52]:
result = list([''])*len(category_index)
dictResult = dict()
for i, category in enumerate(category_index.keys()):
    set_res = checkTopWordsWithClass(vec_texts, vect, df, category)
    result[i] = set_res
    dictResult[category] = set_res
    print(len(result[i]))

91
98
90
96
99
91
90
93
80
98
97
89
94
89
91
90
82
78
94
75
86
96
77
82
70


In [11]:
def predictCategory(setTextTokens, categoriesKeywords):
    dictResult = dict()
    for category, keywords in categoriesKeywords.items():
        dictResult[category] = len(setTextTokens & keywords)
    
    result = dict(sorted(dictResult.items(), key=lambda item: item[1], reverse=True))
    return list(result.keys())[0:3]  
        

In [10]:
def getContentTextFromMinLen(url):
    """дополнительная функция для получения текстовых данных 
        с тега <body>"""
    
    userAgent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers={'User-Agent':userAgent,} 
    request = urllib.request.Request(url, None, headers) 
    response = urllib.request.urlopen(request)
    html = response.read()
    soup = BeautifulSoup(html)
    for data in soup(["script", "style"]):
        data.extract()
    allText = soup.body.get_text()
    splitText = allText.splitlines()
    lines = [line.strip() for line in splitText]
    cleanLines = [splitLine.strip() for line in lines for splitLine in line.split("  ")]
    text = "\n".join(line for line in cleanLines if line)
    
    return unicodedata.normalize("NFKD", text) 

In [54]:
with open("data/records.json", 'r', encoding='utf-8') as file:
    records = json.load(file)

In [56]:
df.shape

(76885, 11)

In [57]:
temp_df = df[df['category'].isna()] 

In [58]:
temp_df.shape

(68246, 11)

In [59]:
russian_df = temp_df.loc[(temp_df['language'] == 'RUSSIAN')]

In [60]:
russian_df.shape

(5130, 11)

In [61]:
russian_df.drop(['id', 'datetime', 'digest_number', 'state', 'title', 'description', 'type', 'keywords', 'language'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  russian_df.drop(['id', 'datetime', 'digest_number', 'state', 'title', 'description', 'type', 'keywords', 'language'], axis=1, inplace=True)


In [62]:
russian_df.shape

(5130, 2)

In [None]:
'''urls = []
for _, row in tqdm(russian_df.iterrows(), total=russian_df.shape[0]):
    print(row['url'])
    urls.append(row['url'])'''

In [65]:
contents = [] 
for _, row in tqdm(russian_df.iterrows(), total=russian_df.shape[0]):
    try:
        text = getContentTextFromMinLen(row['url'])
        contents.append(text)
    except Exception as exception:
        contents.append('')

100%|█████████████████████████████████████| 5130/5130 [1:07:03<00:00,  1.28it/s]


In [66]:
contents.count('')

159

In [67]:
russian_df['texts'] = contents

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  russian_df['texts'] = contents


In [69]:
russian_df.sample(3)

Unnamed: 0,category,url,texts
75999,,https://habr.com/ru/news/t/578618,Хабр\nβ Открыть список\nКак стать автором\nВсе...
32521,,https://habr.com/ru/news/t/577786,Хабр\nβ Открыть список\nКак стать автором\nВсе...
63133,,https://habr.com/ru/news/t/571842,Хабр\nβ Открыть список\nКак стать автором\nВсе...


In [72]:
edit_texts = []
for content in tqdm(contents):
    edit_texts.append(normalize_text(content))

100%|███████████████████████████████████████| 5130/5130 [04:18<00:00, 19.87it/s]


In [73]:
russian_df['normilized texts'] = edit_texts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  russian_df['normilized texts'] = edit_texts


In [74]:
russian_df.shape

(5130, 4)

In [77]:
df_n = russian_df.drop(russian_df[russian_df['normilized texts'] == ''].index, inplace = False)

In [78]:
df_n.shape

(4971, 4)

In [137]:
df_n['category1'] = ['']*df_n.shape[0]

In [138]:
df_n['category2'] = ['']*df_n.shape[0]

In [139]:
df_n.sample(3)

Unnamed: 0,category,url,texts,normilized texts,category1,category2
63660,ORG,https://habr.com/ru/news/t/572092,Хабр\nβ Открыть список\nКак стать автором\nВсе...,mailru group партнёрство основатель skillbox ч...,,
27582,EVENTS,https://habr.com/ru/news/t/575110,Хабр\nβ Открыть список\nКак стать автором\nВсе...,россииский школьник завоевать золото европеиск...,,
65330,INTROS,https://habr.com/ru/news/t/572746,Хабр\nβ Открыть список\nКак стать автором\nВсе...,россииский клиент visa дать возможность удалят...,,


In [87]:
vectr_texts = vectorizer.transform(list(df_n['normilized texts']))

In [88]:
df_one = pd.DataFrame(vectr_texts.toarray(), columns=vectorizer.get_feature_names_out())

In [89]:
df_one.shape

(4971, 45329)

In [140]:
i = 0
for _, row in tqdm(df_n.iterrows(), total=df_n.shape[0]):
    vectr = vectorizer.transform([row['normilized texts']])
    df_one = pd.DataFrame(vectr.toarray(), columns=vectorizer.get_feature_names_out())
    df_one = df_one.stack().reset_index()
    df_one = df_one.sort_values(by=[0], ascending=False)
    check = set(df_one.head(50)['level_1'])
    res = predictCategory(check, dictResult)
    row['category'] = res[0]
    row['category1'] = res[1]
    row['category2'] = res[2]

100%|███████████████████████████████████████| 4971/4971 [04:10<00:00, 19.81it/s]


In [156]:
def get_urls_predict_category(i):
    elem = list(df_n.groupby(by = 'category'))[i]
    print(elem[0])
    print(len(elem[1]['url']))
    print(elem[1]['url'])

In [152]:
df_new = pd.DataFrame(columns=['url', 'category'])

In [157]:
get_urls_predict_category(0)

DATABASES
77
1808     https://habr.com/ru/news/t/587616
8995     https://habr.com/ru/news/t/582948
9234       https://habr.com/ru/post/587216
9308       https://habr.com/ru/post/588526
11848      https://habr.com/ru/post/590351
12266    https://habr.com/ru/news/t/584446
16161    https://habr.com/ru/news/t/581962
17885      https://habr.com/ru/post/583014
18786    https://habr.com/ru/news/t/583482
19486    https://habr.com/ru/news/t/583704
21477    https://habr.com/ru/news/t/584698
22737    https://habr.com/ru/news/t/573186
23507    https://habr.com/ru/news/t/573464
23543      https://habr.com/ru/post/569256
25650      https://habr.com/ru/post/570330
26208      https://habr.com/ru/post/574548
28887    https://habr.com/ru/news/t/575530
28917    https://habr.com/ru/news/t/575822
31161    https://habr.com/ru/news/t/577130
31599    https://habr.com/ru/news/t/577322
32103    https://habr.com/ru/news/t/577344
32966    https://habr.com/ru/news/t/578002
35762      https://habr.com/ru/post/57766

In [None]:
32103, 8995, 11848, 16161, 28917, 31599, 37568, 42252, 42351, 42995, 43894, 44422, 44481, 45903, 45908, 
46116(здесь остановился)

In [36]:
df_new = pd.DataFrame(columns = ['url', 'category'])

In [18]:
texts = []
for _, row in df_new.iterrows():
    texts.append(getContentTextFromMinLen(row['url']))

Exception ignored in: <function tqdm.__del__ at 0x11f692660>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/tqdm/std.py", line 1162, in __del__
    self.close()
  File "/usr/local/lib/python3.11/site-packages/tqdm/notebook.py", line 288, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


In [21]:
df_new['document'] = texts

In [38]:
df_new.drop (columns=df_new.columns[1], axis= 1 , inplace= True )

In [29]:
df_new = df_new[['document', 'category']]

In [40]:
df_new.to_csv('predict_test_wp.csv')

In [37]:
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/538156', 'DEVOPS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/536732', 'DEVOPS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/587616', 'OPENING']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/587216', 'SYSADM']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/588526', 'SYSADM']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/584446', 'USER']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/583014', 'SYSADM']
#df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/583482', 'EVENTS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/583704', 'DATABASES']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/584698', 'SECURITY']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/573186', 'SECURITY']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/573464', 'INTROS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/569256', 'DEVOPS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/570330', 'SYSADM']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/574548', 'SYSADM']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/575530', 'DATABASES']
df_new.loc[len(df_new)] = ['https://habr.com/ru/news/t/578002', 'SEQURITY']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/577662', 'DEVOPS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/580040', 'DEV']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/592797', 'DEVOPS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/526470', 'SYSADM']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/521190', 'SYSADM']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/530774', 'SYSADM']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/530992', 'DEVOPS']
df_new.loc[len(df_new)] = ['https://habr.com/ru/post/532460', 'DEVOPS']

In [120]:
get_urls_predict_category(9)

HISTORY
22


7694     https://habr.com/ru/news/t/592827
8326     https://habr.com/ru/news/t/588943
10564    https://habr.com/ru/news/t/583622
16407    https://habr.com/ru/news/t/582276
20296    https://habr.com/ru/news/t/584124
21571    https://habr.com/ru/news/t/584752
30033    https://habr.com/ru/news/t/576504
30674    https://habr.com/ru/news/t/576848
32975    https://habr.com/ru/news/t/578066
34112    https://habr.com/ru/news/t/578682
35060    https://habr.com/ru/news/t/579234
37392    https://habr.com/ru/news/t/580480
37561    https://habr.com/ru/news/t/580542
37812    https://habr.com/ru/news/t/580616
42579      https://habr.com/ru/post/530480
51146      https://habr.com/ru/post/559566
51996    https://habr.com/ru/news/t/566054
54061    https://habr.com/ru/news/t/568372
61309    https://habr.com/ru/news/t/571226
68294    https://habr.com/ru/news/t/586420
71986    https://habr.com/ru/news/t/587210
75574    https://habr.com/ru/news/t/578402
Name: url, dtype: object