In [1]:
import pandas as pd
import re
from tqdm import tqdm
import nltk
from nltk import word_tokenize
import string
from nltk.corpus import stopwords
from pymystem3 import Mystem
from nltk.stem.snowball import SnowballStemmer
import warnings
import pickle
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertForSequenceClassification

# Import rutermextract, installing it on first use if it is missing.
try:
    from rutermextract import TermExtractor
except ImportError:
    # Only needed on this fallback path, so imported locally.
    import importlib
    import subprocess
    import sys

    print("Библиотека rutermextract не найдена. Устанавливаю...")
    # Run pip through the interpreter that hosts this kernel so the package is
    # installed into the same environment; a bare `pip` found on PATH may belong
    # to a different Python installation.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'rutermextract'], check=False)
    # Let the import system notice packages installed after the interpreter started.
    importlib.invalidate_caches()
    try:
        from rutermextract import TermExtractor
    except ImportError:
        print("Не удалось установить библиотеку rutermextract.")
        # Stop here with the real cause instead of failing later with a NameError
        # at the first use of TermExtractor.
        raise

# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Shared NLP helpers used by the processing loop.
stemmer = SnowballStemmer("russian")
mystem = Mystem()
nltk.download('punkt')
nltk.download('stopwords')

# Project root; all input, output and model locations are resolved relative to it.
path = r'C:\work\10\КФП Новости'  # Make sure this path is correct

# Pretrained model loaded from Hugging Face
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny-sentiment-balanced')
model = AutoModelForSequenceClassification.from_pretrained('cointegrated/rubert-tiny-sentiment-balanced')

# Locally stored model. The directory is derived from `path` so the project root
# is defined in exactly one place.
local_model_dir = os.path.join(path, 'Модели', 'rubert-tiny-sentiment-balanced')
tokenizer_cbr = BertTokenizer.from_pretrained(local_model_dir)
model_cbr = BertForSequenceClassification.from_pretrained(local_model_dir)

term_extractor = TermExtractor()

def remove_punctuation(text):
    """Drop HTML tags and character entities, then blank out ASCII punctuation.

    Tags and entities are removed outright; every character listed in
    ``string.punctuation`` is replaced by a single space.
    """
    markup = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    without_markup = markup.sub('', text)
    # Map each punctuation character to a space in one pass.
    to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return without_markup.translate(to_space)

def remove_numbers(text):
    """Return ``text`` with every digit character replaced by a space."""
    pieces = []
    for symbol in text:
        pieces.append(' ' if symbol.isdigit() else symbol)
    return ''.join(pieces)

def remove_multiple_spaces(text):
    """Collapse each run of whitespace (spaces, tabs, newlines) into one space."""
    whitespace_run = re.compile(r'\s+', flags=re.I)
    return whitespace_run.sub(' ', text)

# Stop-word lists from NLTK; the Russian list is extended with typographic
# symbols (ellipsis, guillemets, en dash) that should be dropped as well.
english_stopwords = stopwords.words("english")
russian_stopwords = stopwords.words("russian") + ['…', '«', '»', '...', '–']

def load_model(file_path):
    """Read the file at ``file_path`` and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(file_path, 'rb') as stream:
        payload = stream.read()
    return pickle.loads(payload)

def get_sentiment(M, T, text, return_type):
    """Run a sequence-classification model on ``text`` and summarise its output.

    Args:
        M: model; called as ``M(**inputs)``. Its result must expose ``.logits`` and
            the model itself must expose ``.device`` and ``.config.id2label``.
        T: tokenizer; called as ``T(text, return_tensors='pt', truncation=True,
            padding=True)``.
        text: the text to classify.
        return_type: one of
            ``'label'`` - the ``id2label`` entry of the highest-scoring class;
            ``'score'`` - dot product of the per-class sigmoid outputs with
            ``[-1, 0, 1]`` (assumes three classes ordered negative, neutral,
            positive - confirm against ``M.config.id2label``);
            ``'proba'`` - the per-class sigmoid outputs as a NumPy array.

    Raises:
        ValueError: if ``return_type`` is not one of the values above.
    """
    # Validate before running the model so a typo does not cost a full inference
    # and then silently yield None.
    if return_type not in ('label', 'score', 'proba'):
        raise ValueError(
            f"return_type must be 'label', 'score' or 'proba', got {return_type!r}"
        )
    with torch.no_grad():
        inputs = T(text, return_tensors='pt', truncation=True, padding=True).to(M.device)
        proba = torch.sigmoid(M(**inputs).logits).cpu().numpy()[0]
    if return_type == 'label':
        # Use a plain int as the dictionary key rather than a NumPy integer.
        return M.config.id2label[int(proba.argmax())]
    if return_type == 'score':
        return proba.dot([-1, 0, 1])
    return proba

def save_empty_excel(df, path, file_name, class_):
    """Write a header-only workbook when ``df`` has no rows.

    The file is saved as
    ``<path>/После предобработки/<stem>/<stem>_<class_>.xlsx`` where ``<stem>``
    is the part of ``file_name`` before its first dot.

    Args:
        df: frame to check; written as-is (headers only) when it is empty.
        path: project root directory.
        file_name: name of the source workbook being processed.
        class_: label appended to the output file name.

    Returns:
        True if ``df`` was empty and the file was written, otherwise False.
    """
    if not df.empty:
        return False
    stem = str(file_name).split(".")[0]
    # Build the path with os.path.join instead of backslashes inside the string:
    # sequences such as "\П" and "\{" are invalid escapes.
    target_dir = os.path.join(path, 'После предобработки', stem)
    os.makedirs(target_dir, exist_ok=True)
    df.to_excel(os.path.join(target_dir, f'{stem}_{class_}.xlsx'), index=False)
    return True

# Pickled classifiers for each stage of the cascade. Paths are built with
# os.path.join rather than backslashes inside the string, because sequences
# such as "\М" and "\Э" are invalid escapes.
models_dir = os.path.join(path, 'Модели')
model_1 = load_model(os.path.join(models_dir, 'Этап 1.pkl'))
model_2 = load_model(os.path.join(models_dir, 'Этап 2.pkl'))
model_3_4 = load_model(os.path.join(models_dir, 'Этап 3-4.pkl'))
model_5 = load_model(os.path.join(models_dir, 'Этап 5.pkl'))
model_6 = load_model(os.path.join(models_dir, 'Этап 6.pkl'))
model_7 = load_model(os.path.join(models_dir, 'Этап 7.pkl'))



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\днс\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\днс\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\work\10\КФП Новости\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\work\10\КФП Новости\.venv\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instan

In [3]:
# Sub-directories of `path` holding the raw workbooks and the per-file results.
INPUT_DIR = os.path.join(path, 'До предобработки')
OUTPUT_DIR = os.path.join(path, 'После предобработки')

# One combined set, so each token costs a single lookup instead of two list scans.
STOP_WORDS = set(russian_stopwords) | set(english_stopwords)

# Classification cascade, applied in this order. Each entry is
# (log title, output-file label, classifier, feature column, probability column,
#  run sentiment/keywords step, also run the local "cbr" sentiment model).
STAGES = [
    ('[1] Прочее', 'Прочее', model_1, 'text_stem', 'stage_1', False, False),
    ('[2] Спрос', 'Спрос', model_2, 'text_lemm', 'stage_2', True, True),
    ('[3-4] Кредитный/депозитный рынок', 'Кредитный_депозитный рынок', model_3_4, 'text_stem', 'stage_3-4', True, False),
    ('[5] Инфляция', 'Инфляция', model_5, 'text_lemm', 'stage_5', True, False),
    ('[6] Производство', 'Производство', model_6, 'text_lemm', 'stage_6', True, False),
    ('[7] Кап. вложения', 'Кап. вложения', model_7, 'text_lemm', 'stage_7', True, False),
]


def _add_sentiment_and_keywords(articles, with_cbr):
    """Add sentiment label/score and keyword columns to ``articles`` in place.

    Uses the 'text_prep' column as input. When ``with_cbr`` is true, the label
    and score from the local model (``model_cbr``) are added as well.
    """
    labels, scores, keywords = [], [], []
    labels_cbr, scores_cbr = [], []
    for text in tqdm(articles['text_prep']):
        labels.append(get_sentiment(M=model, T=tokenizer, text=text, return_type='label'))
        scores.append(get_sentiment(M=model, T=tokenizer, text=text, return_type='score'))
        if with_cbr:
            labels_cbr.append(get_sentiment(M=model_cbr, T=tokenizer_cbr, text=text, return_type='label'))
            scores_cbr.append(get_sentiment(M=model_cbr, T=tokenizer_cbr, text=text, return_type='score'))
        keywords.append(term_extractor(text, limit=10, strings=True))
    # Column order is kept stable because it defines the layout of the saved workbook.
    articles['Тональность'] = labels
    if with_cbr:
        articles['Тональность_cbr'] = labels_cbr
    articles['Оценка тональности'] = scores
    if with_cbr:
        articles['Оценка тональности_cbr'] = scores_cbr
    articles['Ключевые слова'] = keywords


def _classify_stage(articles, file_name, title, label, classifier, feature_col, stage_col, sentiment, with_cbr):
    """Run one stage of the cascade and return the articles left for the next one.

    Rows whose first ``predict_proba`` column is below 50% are treated as matching
    the stage: they are (optionally) annotated with sentiment and keywords and
    saved to ``<stem>_<label>.xlsx``. The remaining rows are returned with the
    stage's helper columns removed. If ``articles`` is empty, a header-only
    workbook is saved under the same label and ``articles`` is returned unchanged.
    """
    print(f'\t\t{title}:')
    if save_empty_excel(articles, path, file_name, label):
        print('\t\t\t[!] df пуст, файл сохранен.')
        return articles

    stem = str(file_name).split(".")[0]
    flag_col = f'{stage_col}_b'
    articles[stage_col] = pd.DataFrame(classifier.predict_proba(articles[feature_col]))[0]
    articles[flag_col] = (articles[stage_col] * 100 < 50.0).astype(int)

    # Explicit copy: new columns are added below and must not be written to a
    # view of `articles`.
    matched = articles.loc[articles[flag_col] == 1].copy()
    print(f'\t\t\t[?] Найдено {len(matched)} статей по теме')
    if sentiment:
        print('\t\t\t[!] Начинаем определение тональности и выделение ключевых слов:')
        _add_sentiment_and_keywords(matched, with_cbr)

    print('\t\t\t[!] Сохраняем Excel')
    matched.to_excel(os.path.join(OUTPUT_DIR, stem, f'{stem}_{label}.xlsx'), index=False)

    rest = articles.loc[articles[flag_col] == 0]
    return rest.drop([stage_col, flag_col], axis=1).reset_index(drop=True)


for file_name in os.listdir(INPUT_DIR):
    source_file = os.path.join(INPUT_DIR, str(file_name))
    df = pd.read_excel(source_file)
    print(f'[+] {str(file_name)}')

    print('\t[!] Удаляем дубликаты и пустоты:')
    print('\t\t[-] До удаления:', len(df))
    df = df.drop_duplicates(subset=['Заголовок'])
    df = df.drop_duplicates(subset=['Текст'])
    df = df.dropna(subset=['Дата издания', 'Заголовок', 'Текст'])
    print('\t\t[-] После удаления:', len(df))

    # NOTE(review): 'text_prep' is not created anywhere in this cell; it is
    # assumed to already be a column of the input workbook - confirm.
    stemmed_texts_list = []
    print('\t[!] Начинаем стэмминг:')
    for text in tqdm(df['text_prep']):
        stemmed_tokens = [stemmer.stem(token) for token in word_tokenize(text) if token not in STOP_WORDS]
        stemmed_texts_list.append(" ".join(stemmed_tokens))
    df['text_stem'] = stemmed_texts_list

    print('\t[!] Удаление коротких статей:')
    len_1 = len(df)
    # Keep only articles with more than 20 space-separated stemmed tokens.
    df = df[df['text_stem'].apply(lambda x: len(x.split(' '))) > 20]
    # Drops rows with a missing value in any column, then renumbers the index so
    # that positional model output lines up with the frame in the stages below.
    df = df.dropna().reset_index(drop=True)
    print(f'\t\t[?] Удалено статей: {len_1 - len(df)}')

    sw_texts_list = []
    print('\t[!] Начинаем удалять стоп-слова:')
    for text in tqdm(df['text_prep']):
        tokens = [token for token in word_tokenize(text) if token != ' ' and token not in STOP_WORDS]
        sw_texts_list.append(" ".join(tokens))
    df['text_sw'] = sw_texts_list

    # NOTE(review): the recorded run of this cell shows roughly 1.1 s per text for
    # this step (hours per file); passing several texts per mystem.lemmatize call
    # may be worth evaluating.
    lemm_texts_list = []
    print('\t[!] Начинаем лемматизацию:')
    for text in tqdm(df['text_sw']):
        tokens = [token for token in mystem.lemmatize(text) if token != ' ' and token not in STOP_WORDS]
        lemm_texts_list.append(" ".join(tokens))
    df['text_lemm'] = lemm_texts_list

    print('\t[*] Определение класса:')
    stem = str(file_name).split(".")[0]
    out_dir = os.path.join(OUTPUT_DIR, stem)
    os.makedirs(out_dir, exist_ok=True)

    # Each stage peels off the articles it recognises and hands the rest on.
    remaining = df
    for stage in STAGES:
        remaining = _classify_stage(remaining, file_name, *stage)

    # Whatever no stage claimed is saved separately.
    remaining.to_excel(os.path.join(out_dir, f'{stem}_Прочее_2.xlsx'), index=False)

    print('[-] Удаляем исходный файл')
    # Remove exactly the file that was read above.
    os.remove(source_file)

[+] Владимирская область_2024M7.xlsx
	[!] Удаляем дубликаты и пустоты:
		[-] До удаления: 11073
		[-] После удаления: 11073
	[!] Начинаем стэмминг:


100%|██████████| 11073/11073 [01:42<00:00, 107.90it/s]


	[!] Удаление коротких статей:
		[?] Удалено статей: 46
	[!] Начинаем удалять стоп-слова:


100%|██████████| 11027/11027 [00:15<00:00, 721.38it/s]


	[!] Начинаем лемматизацию:


 19%|█▉        | 2146/11027 [39:34<2:43:44,  1.11s/it]


KeyboardInterrupt: 