In [None]:
! pip install SpeechRecognition
! pip install pydub
! pip install google-cloud-speech

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import speech_recognition as sr 
import os 
from pydub import AudioSegment
from pydub.silence import split_on_silence

# create a speech recognition object
r = sr.Recognizer()

# a function that splits the audio file into chunks
# and applies speech recognition
def get_large_audio_transcription(path):
    """
    Splitting the large audio file into chunks
    and apply speech recognition on each of these chunks
    """
    # open the audio file using pydub
    sound = AudioSegment.from_wav(path)  
    # split audio sound where silence is 700 miliseconds or more and get chunks
    chunks = split_on_silence(sound,
        # experiment with this value for your target audio file
        min_silence_len = 500,
        # adjust this per requirement
        silence_thresh = sound.dBFS-14,
        # keep the silence for 1 second, adjustable as well
        keep_silence=500,
    )
    folder_name = "audio-chunks"
    # create a directory to store the audio chunks
    if not os.path.isdir(folder_name):
        os.mkdir(folder_name)
    whole_text = ""
    # process each chunk 
    for i, audio_chunk in enumerate(chunks, start=1):
        # export audio chunk and save it in
        # the `folder_name` directory.
        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        # recognize the chunk
        with sr.AudioFile(chunk_filename) as source:
            audio_listened = r.record(source)
            # try converting it to text
            try:
                text = r.recognize_google(audio_listened)
            except sr.UnknownValueError as e:
                print("Error:", str(e))
            else:
                text = f"{text.capitalize()}. "
                print(chunk_filename, ":", text)
                whole_text += text
    # return the text for all chunks detected
    return whole_text

In [None]:
path = "final.wav"

text = get_large_audio_transcription(path);
print("\nFull text:", text)

audio-chunks/chunk1.wav : Section 49 at the art of public speaking this is a librivox recording. 
audio-chunks/chunk2.wav : Only provoked recording from the public tonight. 
audio-chunks/chunk3.wav : More information or to volunteer. 
audio-chunks/chunk4.wav : Please present librivox.org. 
audio-chunks/chunk5.wav : Recording by paul adams. 
audio-chunks/chunk6.wav : The office of public speaking by dale carnegie and joseph bookcase and wine. 
audio-chunks/chunk7.wav : Section 49. 
audio-chunks/chunk8.wav : Appendix d. 
audio-chunks/chunk9.wav : Speeches for study and practice. 
audio-chunks/chunk10.wav : Victor hugo honore de balzac. 
audio-chunks/chunk11.wav : Delivered at the funeral of balance at august 20th. 
audio-chunks/chunk12.wav : 1850. 
audio-chunks/chunk13.wav : Gentleman. 
audio-chunks/chunk14.wav : The man who now goes down into this tube is one of those to whom public queef pays homage. 
audio-chunks/chunk15.wav : In one day all fictions have vanished. 
audio-chunks/chunk

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import re
import nltk


In [None]:
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
article_text = text

In [None]:
# Removing Square Brackets and Extra Spaces
# article_text = re.sub(r'\[[a-zA-Z]*\]', ' ', article_text)
# article_text = re.sub(r'\s+', ' ', article_text)

In [None]:
# Removing special characters and digits
# formatted_article_text = article_text 
# formatted_article_text = re.sub('[0-9]', ' ', article_text )
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )
# formatted_article_text = re.sub('', ' ', formatted_article_text )

formatted_article_text = re.sub(r'\s+',' ', formatted_article_text)

In [None]:
formatted_article_text

'Section at the art of public speaking this is a librivox recording Only provoked recording from the public tonight More information or to volunteer Please present librivox org Recording by paul adams The office of public speaking by dale carnegie and joseph bookcase and wine Section Appendix d Speeches for study and practice Victor hugo honore de balzac Delivered at the funeral of balance at august th Gentleman The man who now goes down into this tube is one of those to whom public queef pays homage In one day all fictions have vanished Bianca is fixed not only on the heads that right Put on the headset think The whole country is moved when one of those heads disappears Today we have a people in black because of the death of the man of talent Nation in mourning for a man of genius Gentleman The name of bowser will be mingled in the luminous trace epochal leave across the future Bounce with one powerful generation of rice s of the nineteenth century who came often the polian As the ind

In [None]:
sentence_list = nltk.sent_tokenize(formatted_article_text)

In [None]:
sentence_list

['Section at the art of public speaking this is a librivox recording Only provoked recording from the public tonight More information or to volunteer Please present librivox org Recording by paul adams The office of public speaking by dale carnegie and joseph bookcase and wine Section Appendix d Speeches for study and practice Victor hugo honore de balzac Delivered at the funeral of balance at august th Gentleman The man who now goes down into this tube is one of those to whom public queef pays homage In one day all fictions have vanished Bianca is fixed not only on the heads that right Put on the headset think The whole country is moved when one of those heads disappears Today we have a people in black because of the death of the man of talent Nation in mourning for a man of genius Gentleman The name of bowser will be mingled in the luminous trace epochal leave across the future Bounce with one powerful generation of rice s of the nineteenth century who came often the polian As the in

In [None]:
sentence_lower = formatted_article_text.lower()
sentence_lower

'section at the art of public speaking this is a librivox recording only provoked recording from the public tonight more information or to volunteer please present librivox org recording by paul adams the office of public speaking by dale carnegie and joseph bookcase and wine section appendix d speeches for study and practice victor hugo honore de balzac delivered at the funeral of balance at august th gentleman the man who now goes down into this tube is one of those to whom public queef pays homage in one day all fictions have vanished bianca is fixed not only on the heads that right put on the headset think the whole country is moved when one of those heads disappears today we have a people in black because of the death of the man of talent nation in mourning for a man of genius gentleman the name of bowser will be mingled in the luminous trace epochal leave across the future bounce with one powerful generation of rice s of the nineteenth century who came often the polian as the ind

In [None]:
sentence_list = sentence_lower.split()
sentence_list

['section',
 'at',
 'the',
 'art',
 'of',
 'public',
 'speaking',
 'this',
 'is',
 'a',
 'librivox',
 'recording',
 'only',
 'provoked',
 'recording',
 'from',
 'the',
 'public',
 'tonight',
 'more',
 'information',
 'or',
 'to',
 'volunteer',
 'please',
 'present',
 'librivox',
 'org',
 'recording',
 'by',
 'paul',
 'adams',
 'the',
 'office',
 'of',
 'public',
 'speaking',
 'by',
 'dale',
 'carnegie',
 'and',
 'joseph',
 'bookcase',
 'and',
 'wine',
 'section',
 'appendix',
 'd',
 'speeches',
 'for',
 'study',
 'and',
 'practice',
 'victor',
 'hugo',
 'honore',
 'de',
 'balzac',
 'delivered',
 'at',
 'the',
 'funeral',
 'of',
 'balance',
 'at',
 'august',
 'th',
 'gentleman',
 'the',
 'man',
 'who',
 'now',
 'goes',
 'down',
 'into',
 'this',
 'tube',
 'is',
 'one',
 'of',
 'those',
 'to',
 'whom',
 'public',
 'queef',
 'pays',
 'homage',
 'in',
 'one',
 'day',
 'all',
 'fictions',
 'have',
 'vanished',
 'bianca',
 'is',
 'fixed',
 'not',
 'only',
 'on',
 'the',
 'heads',
 'that',
 '

In [None]:
unique = []
for word in sentence_list:
  if word not in unique:
      unique.append(word)

#sort
unique.sort()

#print
print(unique)

['a', 'about', 'above', 'abyss', 'accept', 'accomplished', 'across', 'act', 'adams', 'after', 'again', 'ago', 'all', 'among', 'amount', 'an', 'and', 'another', 'answerable', 'appendix', 'are', 'art', 'as', 'at', 'august', 'author', 'back', 'balance', 'balzac', 'be', 'beads', 'because', 'belzak', 'best', 'beyond', 'bianca', 'black', 'body', 'book', 'bookcase', 'books', 'bounce', 'bowser', 'brain', 'but', 'by', 'called', 'came', 'can', 'carnegie', 'catastrophes', 'catch', 'century', 'civilization', 'clearly', 'clothes', 'clouds', 'come', 'comedy', 'coming', 'commonplace', 'communicate', 'comprehend', 'congress', 'conquerors', 'consented', 'constituted', 'contemporary', 'contention', 'could', 'country', 'd', 'dale', 'day', 'days', 'de', 'dead', 'death', 'delivered', 'destiny', 'development', 'devoured', 'disappears', 'dissected', 'distant', 'distressing', 'does', 'down', 'dying', 'each', 'embrace', 'enters', 'entitled', 'envy', 'epochal', 'escapes', 'esoteric', 'ever', 'face', 'fatigued',

In [None]:
number_of_words = dict.fromkeys(unique, 0)
for word in sentence_list:
  number_of_words[word] +=1
number_of_words

{'a': 15,
 'about': 1,
 'above': 1,
 'abyss': 1,
 'accept': 2,
 'accomplished': 1,
 'across': 1,
 'act': 1,
 'adams': 1,
 'after': 1,
 'again': 1,
 'ago': 1,
 'all': 5,
 'among': 5,
 'amount': 1,
 'an': 1,
 'and': 25,
 'another': 1,
 'answerable': 1,
 'appendix': 1,
 'are': 3,
 'art': 1,
 'as': 3,
 'at': 4,
 'august': 1,
 'author': 1,
 'back': 2,
 'balance': 2,
 'balzac': 1,
 'be': 2,
 'beads': 1,
 'because': 1,
 'belzak': 1,
 'best': 1,
 'beyond': 1,
 'bianca': 1,
 'black': 1,
 'body': 2,
 'book': 4,
 'bookcase': 1,
 'books': 1,
 'bounce': 2,
 'bowser': 1,
 'brain': 1,
 'but': 1,
 'by': 6,
 'called': 1,
 'came': 2,
 'can': 1,
 'carnegie': 1,
 'catastrophes': 1,
 'catch': 1,
 'century': 2,
 'civilization': 2,
 'clearly': 1,
 'clothes': 1,
 'clouds': 1,
 'come': 2,
 'comedy': 1,
 'coming': 1,
 'commonplace': 1,
 'communicate': 1,
 'comprehend': 1,
 'congress': 1,
 'conquerors': 1,
 'consented': 1,
 'constituted': 1,
 'contemporary': 1,
 'contention': 1,
 'could': 1,
 'country': 3,
 'd':

In [None]:
import pandas as pd
df = pd.DataFrame(number_of_words, index=[0])
df

Unnamed: 0,a,about,above,abyss,accept,accomplished,across,act,adams,after,...,wished,with,within,words,work,worker,would,wrecked,you,ziff
0,15,1,1,1,2,1,1,1,1,1,...,1,7,1,1,2,1,1,1,3,1


In [None]:
#calculating TF
def TF(word_dict, list_of_words):
  tf_dict = {}
  corpusCount = len(list_of_words)
  for word, count in word_dict.items():
    tf_dict[word] = count / float(corpusCount)
  return tf_dict

In [None]:
tf = TF(number_of_words, sentence_list)
tf

{'a': 0.019305019305019305,
 'about': 0.001287001287001287,
 'above': 0.001287001287001287,
 'abyss': 0.001287001287001287,
 'accept': 0.002574002574002574,
 'accomplished': 0.001287001287001287,
 'across': 0.001287001287001287,
 'act': 0.001287001287001287,
 'adams': 0.001287001287001287,
 'after': 0.001287001287001287,
 'again': 0.001287001287001287,
 'ago': 0.001287001287001287,
 'all': 0.006435006435006435,
 'among': 0.006435006435006435,
 'amount': 0.001287001287001287,
 'an': 0.001287001287001287,
 'and': 0.032175032175032175,
 'another': 0.001287001287001287,
 'answerable': 0.001287001287001287,
 'appendix': 0.001287001287001287,
 'are': 0.003861003861003861,
 'art': 0.001287001287001287,
 'as': 0.003861003861003861,
 'at': 0.005148005148005148,
 'august': 0.001287001287001287,
 'author': 0.001287001287001287,
 'back': 0.002574002574002574,
 'balance': 0.002574002574002574,
 'balzac': 0.001287001287001287,
 'be': 0.002574002574002574,
 'beads': 0.001287001287001287,
 'because': 

In [None]:
df1 = pd.DataFrame(tf, index=[0])
df1

Unnamed: 0,a,about,above,abyss,accept,accomplished,across,act,adams,after,...,wished,with,within,words,work,worker,would,wrecked,you,ziff
0,0.019305,0.001287,0.001287,0.001287,0.002574,0.001287,0.001287,0.001287,0.001287,0.001287,...,0.001287,0.009009,0.001287,0.001287,0.002574,0.001287,0.001287,0.001287,0.003861,0.001287


In [None]:
stopwords = nltk.corpus.stopwords.words('english')

word_frequencies = {}
for word in nltk.word_tokenize(formatted_article_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
print(word_frequencies)

{'Section': 2, 'art': 1, 'public': 4, 'speaking': 2, 'librivox': 2, 'recording': 2, 'Only': 2, 'provoked': 1, 'tonight': 1, 'More': 2, 'information': 1, 'volunteer': 1, 'Please': 1, 'present': 1, 'org': 1, 'Recording': 1, 'paul': 1, 'adams': 1, 'The': 7, 'office': 1, 'dale': 1, 'carnegie': 1, 'joseph': 1, 'bookcase': 1, 'wine': 1, 'Appendix': 1, 'Speeches': 1, 'study': 1, 'practice': 1, 'Victor': 1, 'hugo': 1, 'honore': 1, 'de': 1, 'balzac': 1, 'Delivered': 1, 'funeral': 1, 'balance': 1, 'august': 1, 'th': 3, 'Gentleman': 2, 'man': 4, 'goes': 2, 'tube': 2, 'one': 10, 'queef': 1, 'pays': 1, 'homage': 1, 'In': 2, 'day': 2, 'fictions': 1, 'vanished': 1, 'Bianca': 1, 'fixed': 1, 'heads': 3, 'right': 1, 'Put': 1, 'headset': 1, 'think': 1, 'whole': 1, 'country': 3, 'moved': 1, 'disappears': 1, 'Today': 2, 'people': 2, 'black': 1, 'death': 2, 'talent': 1, 'Nation': 1, 'mourning': 1, 'genius': 2, 'name': 1, 'bowser': 1, 'mingled': 1, 'luminous': 2, 'trace': 1, 'epochal': 1, 'leave': 1, 'across

In [None]:
#calculating IDF
def IDF(documents):
  import math
  N = len(documents)
  idf_dict = dict.fromkeys(documents[0].keys(), 0)
  for document in documents:
    for word, val in document.items():
      if val>0:
        idf_dict[word] += 1
  for word, val in idf_dict.items():
    idf_dict[word] = math.log(N / float(val)+1)
  return idf_dict


In [None]:
Idf = IDF([number_of_words])
Idf

{'a': 0.6931471805599453,
 'about': 0.6931471805599453,
 'above': 0.6931471805599453,
 'abyss': 0.6931471805599453,
 'accept': 0.6931471805599453,
 'accomplished': 0.6931471805599453,
 'across': 0.6931471805599453,
 'act': 0.6931471805599453,
 'adams': 0.6931471805599453,
 'after': 0.6931471805599453,
 'again': 0.6931471805599453,
 'ago': 0.6931471805599453,
 'all': 0.6931471805599453,
 'among': 0.6931471805599453,
 'amount': 0.6931471805599453,
 'an': 0.6931471805599453,
 'and': 0.6931471805599453,
 'another': 0.6931471805599453,
 'answerable': 0.6931471805599453,
 'appendix': 0.6931471805599453,
 'are': 0.6931471805599453,
 'art': 0.6931471805599453,
 'as': 0.6931471805599453,
 'at': 0.6931471805599453,
 'august': 0.6931471805599453,
 'author': 0.6931471805599453,
 'back': 0.6931471805599453,
 'balance': 0.6931471805599453,
 'balzac': 0.6931471805599453,
 'be': 0.6931471805599453,
 'beads': 0.6931471805599453,
 'because': 0.6931471805599453,
 'belzak': 0.6931471805599453,
 'best': 0.

In [None]:
tfidf = (df1/Idf)

In [None]:
tfidf

Unnamed: 0,a,about,above,abyss,accept,accomplished,across,act,adams,after,...,wished,with,within,words,work,worker,would,wrecked,you,ziff
0,0.027851,0.001857,0.001857,0.001857,0.003714,0.001857,0.001857,0.001857,0.001857,0.001857,...,0.001857,0.012997,0.001857,0.001857,0.003714,0.001857,0.001857,0.001857,0.00557,0.001857
