# Topic modelling analysis of interview transcripts

Load the relevant modules and the NLTK English stopwords corpus. Transcript-specific stopwords are added to this list further down, just before stopword removal.

In [13]:
import gensim
import logging
import re
from pprint import pprint
from unidecode import unidecode

# spacy for lemmatization
import spacy

# Load NLTK's English stop-word list. It is kept in a module-level list so that
# later cells can extend it with transcript-specific words before filtering.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Blanket filter: no action/category is narrowed down, so every warning raised
# from here on is suppressed for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

Our ASCII data file of collated interview transcripts with interviewer text removed

In [14]:
# NOTE(review): absolute, machine-specific path - point this at the local copy of the collated transcript file before re-running.
data_file = "/home/main/Dropbox/data/HtVTranscripts/text/interviewee_megafile.txt"

Read in the file and do some pre-processing of the text.

Namely: decoding each line and transliterating it to plain ASCII, making everything lower case, and replacing the punctuation and other extraneous characters defined in _spchars_ with spaces.


In [15]:
# Text normalisation
#
# Characters to blank out, written as a single character class in a raw string.
# The previous '|'-separated pattern, built from a non-raw string, never matched
# a lone backtick, backslash or pipe: the backtick was fused to the ellipsis
# alternative, and '\\|\|' collapsed into a pattern that only matched "||".
spchars = re.compile(r"""[\u2026`~!@#$%^&*()_+=\\|{}\[\]:;'"<,>?/.-]""")

def normalize(text):
    """Return `text` as lower-case ASCII with special characters blanked out.

    Parameters
    ----------
    text : bytes or str
        One line of transcript. Bytes are decoded as UTF-8 and undecodable
        bytes are dropped; a str is used as it is.

    Returns
    -------
    str
        The transliterated, lower-cased text in which every character matched
        by `spchars` has been replaced by a single space.
    """
    if isinstance(text, bytes):
        text = text.decode('utf-8', 'ignore')
    text = unidecode(text)              # transliterate non-ASCII characters to ASCII
    text = text.lower()                 # convert to lowercase
    return spchars.sub(" ", text)       # one space per special character

# Read input file, normalise
#
def read_input(input_file):
    """Yield one list of tokens for each retained line of `input_file`.

    The file is opened in binary mode, so every line is a bytes object. Lines
    that begin with '(' (the transcriber's time stamps) are skipped; every
    other line is cleaned with `normalize` and then tokenised with
    `gensim.utils.simple_preprocess`.

    Parameters
    ----------
    input_file : str
        Path of the transcript file to read.

    Yields
    ------
    list of str
        The tokens produced for one line.
    """
    with open(input_file, 'rb') as f:
        for line in f:
            # Compare against the byte string rather than the magic value 40
            if line.startswith(b'('):
                continue
            yield gensim.utils.simple_preprocess(normalize(line))

# Drain the generator: one token list per retained transcript line
documents = [tokens for tokens in read_input(data_file)]

Remove stopwords. Stopwords are common words like 'the', 'is', 'at', 'which' and so on that are high frequency but don't hold a lot of conceptual meaning and so are removed from vector calculations.

We also extend the stopwords list with some common ones from the transcript itself

In [16]:
# Remove stop words
#
def remove_stopwords(texts):
    """Return `texts` with every word listed in `stop_words` removed.

    Each document is converted with `str()` and re-tokenised with
    `gensim.utils.simple_preprocess` before filtering, so the words that are
    compared against the stop list are the tokens that call produces.

    Parameters
    ----------
    texts : iterable
        The documents to filter (in this notebook, lists of tokens).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in the same order.
    """
    # Snapshot the module-level list as a set once: each membership test is
    # then a hash lookup instead of a scan of the whole stop list per token.
    stops = set(stop_words)
    return [
        [word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stops]
        for doc in texts
    ]

# Transcript-specific additions to the stop list. Only entries that are not
# already present are appended, so re-running this cell leaves the list unchanged.
custom_stop_words = ['ehm', 'like', 'kinda', 'eh', 'yeah', 'mm', 'roxy']
stop_words.extend([word for word in custom_stop_words if word not in stop_words])
documents = remove_stopwords(documents)

Lemmatise the corpus. A lemma is the root (dictionary) form of a word: for example, the lemma of 'was' is 'be' and the lemma of 'apples' is 'apple'. Lemmatising stops words in the transcript that are conceptually similar or identical ('apple' and 'apples') from being treated as distinct entities.

In [17]:
# Load spaCy's English pipeline with the NER and parser components disabled;
# lemmatization() only reads each token's lemma and part-of-speech tag.
# The 'en' name is a shortcut link that only resolves on spaCy 2.x installs
# where the link was created. spaCy 3 removed shortcut links and raises
# OSError, so fall back to the full package name.
# NOTE(review): the fallback assumes the small English model is the one
# installed - confirm it matches the model 'en' pointed to.
_disabled_components = ['ner', 'parser']
try:
    nlp = spacy.load('en', disable=_disabled_components)
except OSError:
    nlp = spacy.load('en_core_web_sm', disable=_disabled_components)
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    The tokens of each document are joined with single spaces and run through
    the module-level spaCy pipeline `nlp`. A token contributes its `lemma_`
    to the result only if its `pos_` tag is in `allowed_postags`.
    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of list of str
        The documents, each given as a list of tokens.
    allowed_postags : list of str
        Part-of-speech tags to keep.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in the same order.
    """
    def kept_lemmas(words):
        parsed = nlp(" ".join(words))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [kept_lemmas(words) for words in texts]

# Replace each document's tokens with the lemmas of its nouns, adjectives, verbs and adverbs
documents = lemmatization(texts=documents)

Create a dictionary from the corpus and compute the token frequencies for each document

In [19]:
import gensim.corpora as corpora

# Dictionary: assigns an integer id to every distinct token in the documents
id2word = corpora.Dictionary(documents)

# Bag-of-words form of each document, built with the dictionary's doc2bow
corpus = list(map(id2word.doc2bow, documents))

Build LDA model

In [24]:
# Training settings gathered in one mapping, then unpacked into the constructor
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=100,        # fixed seed
    update_every=1,
    chunksize=50,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_settings)

Print topics with association scores

In [25]:
# Apply the trained model to the bag-of-words corpus
doc_lda = lda_model[corpus]

# Pretty-print the model's topics with their word weights
topic_summaries = lda_model.print_topics()
pprint(topic_summaries)

[(0,
  '0.133*"basically" + 0.114*"sister" + 0.097*"home" + 0.080*"hospital" + '
  '0.036*"music" + 0.030*"live" + 0.025*"attack" + 0.023*"completely" + '
  '0.023*"corner" + 0.022*"rather"'),
 (1,
  '0.121*"feel" + 0.088*"something" + 0.068*"always" + 0.064*"tell" + '
  '0.056*"lot" + 0.056*"want" + 0.037*"stuff" + 0.034*"way" + 0.029*"right" + '
  '0.024*"usually"'),
 (2,
  '0.187*"get" + 0.181*"know" + 0.111*"kind" + 0.060*"sort" + 0.050*"make" + '
  '0.019*"speak" + 0.018*"face" + 0.018*"obviously" + 0.017*"first" + '
  '0.015*"brain"'),
 (3,
  '0.160*"would" + 0.105*"really" + 0.056*"bad" + 0.040*"look" + 0.039*"head" '
  '+ 0.036*"still" + 0.036*"remember" + 0.033*"day" + 0.032*"stop" + '
  '0.031*"fine"'),
 (4,
  '0.167*"go" + 0.112*"say" + 0.092*"thing" + 0.049*"time" + 0.049*"see" + '
  '0.037*"sometimes" + 0.035*"take" + 0.024*"happen" + 0.021*"little" + '
  '0.019*"everything"'),
 (5,
  '0.179*"quite" + 0.133*"work" + 0.109*"mean" + 0.067*"tend" + 0.045*"able" + '
  '0.023*"

Visualise

In [26]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()

# Prepare the visualisation from the model, the bag-of-words corpus and the dictionary
# NOTE(review): newer pyLDAvis releases expose this module as pyLDAvis.gensim_models - confirm against the installed version.
vis = pyLDAvis.gensim.prepare(topic_model=lda_model, corpus=corpus, dictionary=id2word)
vis