In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
def list_input_files(root='/kaggle/input'):
    """Return the path of every file found under ``root``, sorted.

    ``os.walk`` yields directories and files in an unspecified order, so the
    result is sorted to give every later cell the same file order on each run.

    Parameters
    ----------
    root : str
        Directory to walk recursively. Defaults to the Kaggle input directory.

    Returns
    -------
    list of str
        Full paths (``dirname`` joined with ``filename``) in ascending order.
        Empty when ``root`` does not exist or contains no files.
    """
    paths = []
    for dirname, _, filenames in os.walk(root):
        for filename in filenames:
            paths.append(os.path.join(dirname, filename))
    return sorted(paths)


# Every file under the input directory; later cells iterate over this list.
filelist = list_input_files()
for input_path in filelist:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# install the required libraries
!pip install SpeechRecognition
!pip install contractions

In [4]:
# import the libraries 
import IPython.display as ipd
import speech_recognition as sr
import textblob
import random

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models

import gensim
import nltk
import contractions

import spacy
from spacy import displacy

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Play one recording inline as a quick check of the input audio.
# The path is hard-coded to a single file of the "voicetest" input dataset.
# ipd.Audio(...) is left as the last expression of the cell so that the
# notebook displays the resulting audio object.
audio1 = '/kaggle/input/voicetest/en-0527.wav'
ipd.Audio(audio1)

In [6]:
# to convert the speech to text and classify sentiment
# For every path in `filelist`: read the audio, transcribe it with the Google
# Web Speech recognizer, expand contractions, then score the sentiment.
# `text_set` and `senti_set` are parallel lists: entry k of each comes from the
# same successfully transcribed file.
text_set=[]
senti_set=[]
recording = sr.Recognizer()
# Build the analyzer once and share it across files. Creating a new
# NaiveBayesAnalyzer per file makes each instance train its classifier again.
nb_analyzer = textblob.sentiments.NaiveBayesAnalyzer()
for filenameop in filelist:
    # record() reads the whole file into memory, so the file can be closed
    # before the (slow) network call below.
    with sr.AudioFile(filenameop) as source:
        audio_data = recording.record(source)
    try:
        text = recording.recognize_google(audio_data)
    except sr.UnknownValueError:
        # The recognizer could not make out any speech in this file; report it
        # and carry on instead of aborting the remaining files.
        print("\nSkipped (speech not recognized):", filenameop)
        continue
    text = contractions.fix(text)
    text_set.append(text)
    tb = textblob.TextBlob(text, analyzer=nb_analyzer)
    senti = tb.sentiment
    senti_set.append(senti)
    print("\nText:", text)
    print(senti)

In [7]:
# to perform the topic analysis
# Pipeline per transcript: lowercase -> tokenize on word characters ->
# drop English stop words -> lemmatize. The lemmatized token lists are then
# turned into a dictionary and bag-of-words corpus and fed to an LDA model.
LDA_NUM_TOPICS = 10      # number of topics fitted by the model
LDA_PASSES = 20          # training passes over the corpus
LDA_RANDOM_STATE = 42    # fixed seed so the fitted topics are reproducible
TOPICS_SHOWN = 5         # how many topics are printed at the end
WORDS_SHOWN = 5          # how many words are printed per topic

texts=[]
all_tokens=[]
all_stop_tokens=[]

tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
# Set lookup avoids scanning the whole stop-word list for every token.
en_stop_lookup = set(en_stop)
p_stemmer = PorterStemmer()  # not used in this cell; kept so the name stays defined
wnl = nltk.stem.wordnet.WordNetLemmatizer()

for transcript in text_set:
    raw = transcript.lower()
    tokens = tokenizer.tokenize(raw)

    stopped_tokens = [tok for tok in tokens if tok not in en_stop_lookup]
    lemma_tokens = [wnl.lemmatize(tok) for tok in stopped_tokens]
    all_tokens.append(tokens)
    all_stop_tokens.append(stopped_tokens)
    texts.append(lemma_tokens)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(txt) for txt in texts]

print('Tokens:',all_tokens,sep='\n')
print('\nStop_tokens:',all_stop_tokens,sep='\n')
print('\nLemma_tokens:',texts,sep='\n')
print('\nDictionary:',dictionary,sep='\n')
print('\nCorpus:',corpus,sep='\n')

# random_state pins the model's random number generator; without it every run
# produces different topics from the same corpus.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=LDA_NUM_TOPICS,
    id2word=dictionary,
    passes=LDA_PASSES,
    random_state=LDA_RANDOM_STATE,
)
print('\nTopics:',TOPICS_SHOWN,"Words:",WORDS_SHOWN)
display(lda_model.print_topics(num_topics=TOPICS_SHOWN, num_words=WORDS_SHOWN))

In [8]:
# visualize the topic analysis
# Prepare the pyLDAvis data from the fitted LDA model, its bag-of-words corpus
# and the dictionary, then render it inline.
# mds='mmds' picks the scaling method used to lay the topics out in 2-D
# (presumably metric MDS; confirm against the pyLDAvis documentation).
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models`; confirm against the installed version.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary,mds='mmds')
pyLDAvis.display(vis)

In [9]:
# to perform the entity analysis
# Load the small English spaCy pipeline, then for each transcript print the
# tokens that carry an entity type, paired with that type.
NER = spacy.load("en_core_web_sm")
for transcript in text_set:
    parsed = NER(transcript)
    tagged_tokens = []
    for token in parsed:
        if token.ent_type_:
            tagged_tokens.append((token, token.ent_type_))
    print(tagged_tokens)

In [10]:
# Collect one (entity text, entity type) row per named entity in the transcripts.
# The entities are read from spaCy's `doc.ents`, which yields each entity as a
# single span. Flushing after every tagged token instead would split a
# multi-token entity into one row per token and distort the frequency tables
# built from `entity_frame`.
named_entities = []
for sentence in text_set:
    doc = NER(sentence)
    for ent in doc.ents:
        named_entities.append((ent.text, ent.label_))
# An empty `named_entities` still yields a frame with both columns present.
entity_frame = pd.DataFrame(named_entities,
                            columns=['Entity Name', 'Entity Type'])
display(entity_frame)

In [11]:
# get the top named entities
# Count how often each (name, type) pair occurs, most frequent first.
entity_pair_counts = entity_frame.groupby(by=['Entity Name', 'Entity Type']).size()
entity_pair_ranked = entity_pair_counts.sort_values(ascending=False)
# reset_index() leaves the counts in a column labelled 0; give it a real name.
top_entities = entity_pair_ranked.reset_index().rename(columns={0 : 'Frequency'})
# Transposed view limited to the first 15 entries (one column per entry).
top_entities.T.iloc[:,:15]

In [12]:
# get the top named entity types
# Count how often each entity type occurs, most frequent first.
entity_type_counts = entity_frame.groupby(by=['Entity Type']).size()
entity_type_ranked = entity_type_counts.sort_values(ascending=False)
# reset_index() leaves the counts in a column labelled 0; give it a real name.
top_entities = entity_type_ranked.reset_index().rename(columns={0 : 'Frequency'})
# Transposed view limited to the first 15 entries (one column per entry).
top_entities.T.iloc[:,:15]