Converting Audio to Text

In [1]:
!pip install openai-whisper pydub noisereduce librosa soundfile ffmpeg
!apt-get install ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [2]:
import os
import whisper
from pydub import AudioSegment
import noisereduce as nr
import librosa
import numpy as np
import soundfile as sf

In [3]:
input_folder = "all_audio_start" #Папка с аудиофайлами
output_folder = "transcriptions" #Папка с txt

In [4]:
#Создаем папку для текстовых файлов, если её нет
os.makedirs(output_folder, exist_ok=True)

In [5]:
#Функция для конвертации MP3 → WAV с шумоподавлением
def preprocess_audio(input_mp3, output_wav):
    """
    Конвертирует MP3 в WAV (моно, 16kHz) и убирает шум.
    """
    #Конвертация MP3 → WAV (моно, 16 кГц)
    audio = AudioSegment.from_mp3(input_mp3)
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio.export(output_wav, format="wav")

    #Уменьшение шума
    y, sr = librosa.load(output_wav, sr=16000)
    reduced_noise = nr.reduce_noise(y=y, sr=sr)
    sf.write(output_wav, reduced_noise, sr)

In [6]:
#Загружаем Whisper (GPU)
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.88G/2.88G [01:52<00:00, 27.4MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [7]:
for filename in os.listdir(input_folder):
    if filename.endswith(".mp3"):
        input_mp3 = os.path.join(input_folder, filename)
        output_wav = input_mp3.replace(".mp3", ".wav")  # Генерируем WAV-файл
        output_txt = os.path.join(output_folder, filename.replace(".mp3", ".txt"))

        print(f"🔄 Обрабатываем {filename}...")

        # Конвертация MP3 → WAV с шумоподавлением
        preprocess_audio(input_mp3, output_wav)

        # Распознаем текст
        result = model.transcribe(
            output_wav,
            language="ru",
            condition_on_previous_text=False  # Улучшает точность
        )

        # Сохраняем текст в соответствующий TXT-файл
        with open(output_txt, "w", encoding="utf-8") as f:
            f.write(result["text"])

        print(f"Готово: {output_txt}")

print("\nВсе аудиофайлы обработаны и сохранены в папке 'transcriptions'!")

🔄 Обрабатываем Audio12.mp3...
Готово: transcriptions/Audio12.txt
🔄 Обрабатываем Audio15.mp3...
Готово: transcriptions/Audio15.txt
🔄 Обрабатываем Audio7.mp3...
Готово: transcriptions/Audio7.txt
🔄 Обрабатываем Audio19.mp3...
Готово: transcriptions/Audio19.txt
🔄 Обрабатываем Audio14.mp3...
Готово: transcriptions/Audio14.txt
🔄 Обрабатываем Audio5.mp3...
Готово: transcriptions/Audio5.txt
🔄 Обрабатываем Audio6.mp3...
Готово: transcriptions/Audio6.txt
🔄 Обрабатываем Audio11.mp3...
Готово: transcriptions/Audio11.txt
🔄 Обрабатываем Audio16.mp3...
Готово: transcriptions/Audio16.txt
🔄 Обрабатываем Audio2.mp3...
Готово: transcriptions/Audio2.txt
🔄 Обрабатываем Audio10.mp3...
Готово: transcriptions/Audio10.txt
🔄 Обрабатываем Audio4.mp3...
Готово: transcriptions/Audio4.txt
🔄 Обрабатываем Audio17.mp3...
Готово: transcriptions/Audio17.txt
🔄 Обрабатываем Audio8.mp3...
Готово: transcriptions/Audio8.txt
🔄 Обрабатываем Audio20.mp3...
Готово: transcriptions/Audio20.txt
🔄 Обрабатываем Audio13.mp3...
Готово:

NE UDACHNYI PO MOEMU MNENIYU

Ispolzvanie BERT, klastorizaciya teksta

In [1]:
!pip install sentence-transformers sklearn nltk

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [2]:
import os
import string
import nltk
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

In [9]:
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
#Папки с текстами
transcriptions_folder = "all_text_start"
output_results = "deal_clusters.txt"

In [5]:
#Загружаем стоп-слова из файла
stopwords_path = "stopwords-ru.txt"
if os.path.exists(stopwords_path):
    with open(stopwords_path, "r", encoding="utf-8") as f:
        stop_words = set(f.read().splitlines())

In [6]:
#Функция очистки текста
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    words = word_tokenize(text)
    words = [word for word in words if word.isalnum() and word not in stop_words]
    return " ".join(words)

In [7]:
#Загружаем модель RuBERT (CPU)
model = SentenceTransformer("DeepPavlov/rubert-base-cased-sentence", device="cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
#Читаем все тексты
texts = []
file_names = []

for filename in os.listdir(transcriptions_folder):
    if filename.endswith(".txt"):
        file_path = os.path.join(transcriptions_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            raw_text = f.read()
            cleaned_text = clean_text(raw_text)
            texts.append(cleaned_text)
            file_names.append(filename)

In [11]:
#Создаем эмбеддинги для текстов
embeddings = model.encode(texts, device="cpu")

In [12]:
#Определяем количество кластеров
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(embeddings)

In [13]:
#Преобразуем номера кластеров в текстовые метки
cluster_labels = {
    0: "Deal",
    1: "Not Deal",
    2: "Unknown"
}

In [14]:
#Сохраняем результаты
with open(output_results, "w", encoding="utf-8") as result_file:
    for i, filename in enumerate(file_names):
        label = cluster_labels[clusters[i]]
        result_file.write(f"{filename}: {label}\n")

VRODE KAK NORM

Razdelenie na Positive; Negative

In [3]:
!pip install transformers sentencepiece torch



In [4]:
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [25]:
#Загружаем модель и токенизатор
model_name = "blanchefort/rubert-base-cased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [26]:
#CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [27]:
#Папка с текстовыми файлами
transcriptions_folder = "all_text_start"
output_results = "deal_sentiment_results.txt"

In [28]:
labels = ["Negative", "Neutral", "Positive"]

In [29]:
#Открываем файл для записи результатов
with open(output_results, "w", encoding="utf-8") as result_file:
    for filename in os.listdir(transcriptions_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(transcriptions_folder, filename)

            # Читаем текст из файла
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()

            #Токенизируем текст и передаем в модель
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            # Отключаем градиенты (ускоряет работу)
            with torch.no_grad():
                outputs = model(**inputs)

            #Получаем предсказания
            scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
            sentiment_idx = torch.argmax(scores).item()
            sentiment_label = labels[sentiment_idx]
            sentiment_score = scores[sentiment_idx].item() * 100

            #Разделяем только на категории
            if sentiment_label == "Positive":
                deal_status = f"Positive ({sentiment_score:.2f}%)"
            else:
                deal_status = f"Negative ({sentiment_score:.2f}%)"

            #Записываем результат
            output = f"{filename}: {deal_status}"
            result_file.write(output + "\n")

Sohranayu lokalno

In [5]:
#Загружаем и сохраняем Whisper
model = whisper.load_model("large")
torch.save(model.state_dict(), "whisper_model.pth")

  checkpoint = torch.load(fp, map_location=device)


In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

model_name = "blanchefort/rubert-base-cased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

torch.save(model.state_dict(), "rubert_model.pth")
tokenizer.save_pretrained("rubert_tokenizer")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

('rubert_tokenizer/tokenizer_config.json',
 'rubert_tokenizer/special_tokens_map.json',
 'rubert_tokenizer/vocab.txt',
 'rubert_tokenizer/added_tokens.json',
 'rubert_tokenizer/tokenizer.json')

In [7]:
from google.colab import files

!zip -r rubert_tokenizer.zip rubert_tokenizer
files.download("rubert_tokenizer.zip")

  adding: rubert_tokenizer/ (stored 0%)
  adding: rubert_tokenizer/vocab.txt (deflated 65%)
  adding: rubert_tokenizer/special_tokens_map.json (deflated 42%)
  adding: rubert_tokenizer/tokenizer.json (deflated 73%)
  adding: rubert_tokenizer/tokenizer_config.json (deflated 74%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>