## Importing libraries

In [61]:
from gensim import corpora, models
from gensim.models import ldamodel, CoherenceModel
import gensim
import nltk
from nltk.stem import WordNetLemmatizer
import pandas as pd
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

from sklearn.metrics.pairwise import cosine_similarity
import spacy
from textblob import TextBlob

## Reading file

In [3]:
filename=r'..\DatasetEnglish.csv'
data=pd.read_csv(filename)
data=data.drop(['index'], axis=1)
data.head()

Unnamed: 0,mongodb_id,has_preview,shortened_url,comments_enabled,comments_logged_in_only,talk_id,talk_slug,talk_title,talk_social_title,speakers_name,...,related_talk_5_id,speakers_typename,speakers_firstname,speakers_lastname,speakers_description,speakers_is_live,speakers_who_they_are,speakers_why_listen,external_service,external_service_code
0,652af294d95841780141bab7,False,https://go.ted.com/6Ryx,False,False,2147,aakash_odedra_a_dance_in_a_hurricane_of_paper_...,"A dance in a hurricane of paper, wind and light","A dance in a hurricane of paper, wind and light",Aakash Odedra,...,2273.0,AcmeSpeaker,Aakash,Odedra,Choreographer,True,Aakash Odedra sets raw ancient dance forms fro...,Based in the UK with a growing international r...,YouTube,T49IjKho5y8
1,652af54dd95841780141bab8,False,https://go.ted.com/6sZX,False,False,2683,aala_el_khani_what_it_s_like_to_be_a_parent_in...,What it's like to be a parent in a war zone,What it's like to be a parent in a war zone,Aala El-Khani,...,36063.0,AcmeSpeaker,Aala,El-Khani,Humanitarian psychologist,True,Aala El-Khani explores the needs of families a...,<p>Dr. Aala El-Khani researches and develops i...,YouTube,dY9f9bFctUE
2,652af54dd95841780141bab9,False,https://go.ted.com/6yKv,False,False,91525,aarathi_krishnan_5_ethical_principles_for_digi...,5 ethical principles for digitizing humanitari...,,Aarathi Krishnan,...,24354.0,AcmeSpeaker,Aarathi,Krishnan,Tech and human rights ethicist,True,Aarathi Krishnan designs institutions and syst...,<p>Aarathi Krishnan works at the intersections...,YouTube,Ix8Cz-veat4
3,652af54dd95841780141baba,False,https://go.ted.com/6RgH,True,True,101504,aaron_bastani_a_socialist_perspective_on_the_p...,A socialist perspective on the pursuit of happ...,,Aaron Bastani,...,972.0,AcmeSpeaker,Aaron,Bastani,Journalist,True,Aaron Bastani covers the issues that will defi...,<div>Aaron Bastani is the cofounder of Novara ...,YouTube,M6aq2SH-xVo
4,652af54dd95841780141babb,False,https://go.ted.com/6JLM,False,False,14610,aaron_duffy_lake_buckley_and_jack_foster_illus...,"""Illusions for a better society""","""Illusions for a better society""","Aaron Duffy, Lake Buckley and Jack Foster",...,,AcmeSpeaker,Aaron,Duffy,,False,Aaron Duffy began employing visual tricks into...,,,


In [None]:

df['clean_transcript'] = df['transcript'].apply(clean)

# Split the cleaned transcripts into words
doc_clean = [doc.split() for doc in df['clean_transcript']] 

# Creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Running and Training LDA model on the document term matrix
lda = LdaModel
ldamodel = lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

In [59]:
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x290435cd870>

In [77]:
import gensim.downloader as api
# Load the GloVe model
glove_model = api.load('glove-wiki-gigaword-300')

# Load the FastText model
#fasttext_model = api.load('fasttext-wiki-news-subwords-300')


In [78]:
def preprocess(text):
    text = TextBlob(text).correct()
    tokens = gensim.utils.simple_preprocess(str(text))
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    tokens = [token for token in tokens if token not in stop_words]
    nlp = spacy.load('en_core_web_sm')
    tokens = [token.lemma_ for token in nlp(' '.join(tokens))]
    tokens = nltk.pos_tag(tokens)
    tokens = [token.ent_type_ for token in nlp(' '.join([word for word, tag in tokens]))]
    sentences = [tokens]

    # Perform phrase detection on the sentences using gensim
    phrase_model = gensim.models.Phrases(sentences)
    bigram_sentences = phrase_model[sentences]

    # Get the word vectors of the GloVe model
    word_vectors = glove_model.wv

    return word_vectors


# Apply the preprocess function to the transcript column
data['tokens'] = data['transcript'].apply(preprocess)

# Create a dictionary from the tokens
dictionary = corpora.Dictionary(data['tokens'])

# Create a corpus from the tokens
corpus = [dictionary.doc2bow(tokens) for tokens in data['tokens']]

# Define the number of topics
num_topics = 5

# Create an LDA model using the corpus and the dictionary
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

# Print the topics and their keywords
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


# Compute the coherence score of the model
coherence_model = CoherenceModel(model=lda_model, texts=data['tokens'], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print('The coherence score of the model is: ' + str(coherence_score))



### Finding the optimal number of topics

In [55]:
num_topics_list = [3, 10, 17, 24,30]

# Create an empty list to store the coherence scores
coherence_scores = []

# For each value of num_topics, create an LDA model and calculate the coherence score
for num_topics in num_topics_list:
    # Create an LDA model using the corpus and the dictionary
    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)
    # Compute the coherence score of the model
    coherence_model = CoherenceModel(model=lda_model, texts=data['tokens'], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    # Append the coherence score to the list
    coherence_scores.append(coherence_score)

# Print the coherence scores for each value of num_topics
for i in range(len(num_topics_list)):
    print('The coherence score for num_topics = ' + str(num_topics_list[i]) + ' is: ' + str(coherence_scores[i]))


# Find the value of num_topics that gives the highest coherence score
best_num_topics = num_topics_list[coherence_scores.index(max(coherence_scores))]
print('The best number of topics for the transcript column is: ' + str(best_num_topics))

The coherence score for num_topics = 3 is: 0.28353645024499924
The coherence score for num_topics = 10 is: 0.3672612294401866
The coherence score for num_topics = 17 is: 0.411750180324153
The coherence score for num_topics = 24 is: 0.4180430414139158
The coherence score for num_topics = 30 is: 0.4304144151675676
The best number of topics for the transcript column is: 30
