# Critical Role dashboard script

This is the accompanying script for crtranscriptexplorer.herokuapp.com. In it, I take a file a of text files (subtitles from Critical Role, a popular Dungeons and Dragons vodcast), clean and process the words, and output a graph showing how frequently each word or phrase is used. First we load all of the necessary packages and build an NLP pipeline.

Data can be found at crtranscript.tumblr.com. Major credit goes to that group (unaffiliated) for annotating all of the episodes.

## NLP Pipeline

In [1]:
import sys
import os
import io
import re
import spacy
import nltk
import pandas as pd
import numpy as np
import scipy as sp
import pickle
from nltk.corpus import stopwords
stop_words =  set(stopwords.words('english'))

def is_punct_space(token):
    return token.is_punct or token.is_space

nlp = spacy.load('en')
direc = '../FinalTextFiles/'
newStopWords = ['-PRON-']

import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='btomoschuk', api_key='EDkuHeNiiXQBG92SPAz2')

In [2]:
#Open and read the txt into dataframes
os.chdir(direc)
text = []
ffiles = [f for f in os.listdir(direc) if os.path.isfile(f)]
for f in ffiles:
  with io.open (f, "r", encoding = 'cp437') as myfile:
    text.append(myfile.read())

data = pd.DataFrame({'raw':text})

In [3]:
#This code is adapted from http://acsweb.ucsd.edu/~btomosch/ironfist.html
#This function cleans up the dataframe, removes weird characters and spaces
def cleaning_pipeline(data):
    data = data.reset_index(drop=True)
    #Turns line breaks into spaces and replaces messed up apostrophe
    data['raw'] = [r.replace('\n', ' ') for r in data['raw']]
    data['raw'] = [r.replace('ΓÇÖ', "'") for r in data['raw']]
    data['raw'] = data['raw'].astype(str)
    #Split files by searching for Names in all uppercase + :
    data = data.iloc[1:]
    data['split'] = [re.split(r"\s(?=[A-Z]+:)",x) for x in data['raw']]
    #Add episodes
    data = pd.DataFrame(data.split.values.tolist(), index= data.index)
    data['episode'] = data.index
    data = pd.melt(data, id_vars=['episode'])
    data = pd.concat([data, data['value'].str.split(':',1, expand = True)], axis=1)
    data = data[['episode',0,1]].dropna().rename(index=str, columns={0: "Speaker", 1: "Speech"})
    data['Speech'].str.replace(r"\(.*\)","")
    return data

#This parses all of the text and removes custom stop words
def nlp_pipeline(data):
    #Parse and lemmatize the data
    data['parsed'] = [ nlp(x) for x in data.Speech]
    data['lemmatized'] = [[token.lemma_ for token in x
                            if not is_punct_space(token)] 
                  for x in data.parsed]
    
    #Remove Stop words
    data['cleaned'] = [[term for term in x
                                if not term in newStopWords]
                               for x in data.lemmatized]
    #Collapse into one string rather than list
    data.cleaned = [' '.join(x) for x in data.cleaned]
    
    return data

In [4]:
data = cleaning_pipeline(data)
data = nlp_pipeline(data)
data.head(15)

Unnamed: 0,episode,Speaker,Speech,parsed,lemmatized,cleaned
0,1,MATT,"Hello everyone. My name is MATThew Mercer, ...","( , Hello, everyone, ., My, name, is, MATThew,...","[hello, everyone, -PRON-, name, be, matthew, m...",hello everyone name be matthew mercer voice ac...
1,2,MATT,"Hello everyone, and welcome to the second epi...","( , Hello, everyone, ,, and, welcome, to, the,...","[hello, everyone, and, welcome, to, the, secon...",hello everyone and welcome to the second episo...
2,3,MATT,"Hey, everyone. Sorry about that little issue ...","( , Hey, ,, everyone, ., Sorry, about, that, l...","[hey, everyone, sorry, about, that, little, is...",hey everyone sorry about that little issue the...
3,4,MATT,"Everyone, welcome to the new episode of Criti...","( , Everyone, ,, welcome, to, the, new, episod...","[everyone, welcome, to, the, new, episode, of,...",everyone welcome to the new episode of critica...
4,5,MATT,"Hello everyone, welcome to the fifth episode ...","( , Hello, everyone, ,, welcome, to, the, fift...","[hello, everyone, welcome, to, the, fifth, epi...",hello everyone welcome to the fifth episode of...
5,6,MATT,Hello everyone! Welcome to Critical Role toni...,"( , Hello, everyone, !, Welcome, to, Critical,...","[hello, everyone, welcome, to, critical, role,...",hello everyone welcome to critical role tonigh...
6,7,MATT,"Hello, and welcome to this evening's episode ...","( , Hello, ,, and, welcome, to, this, evening,...","[hello, and, welcome, to, this, evening, 's, e...",hello and welcome to this evening 's episode o...
7,8,MATT,"Hello and good evening, everyone. Welcome to ...","( , Hello, and, good, evening, ,, everyone, .,...","[hello, and, good, evening, everyone, welcome,...",hello and good evening everyone welcome to ton...
8,9,MATT,"Welcome and good evening, everyone. Welcome t...","( , Welcome, and, good, evening, ,, everyone, ...","[welcome, and, good, evening, everyone, welcom...",welcome and good evening everyone welcome to t...
9,10,MATT,"Welcome, everyone, to this Thursday's episod...","( , Welcome, ,, everyone, ,, to, this, , Thur...","[welcome, everyone, to, this, thursday, 's, ep...",welcome everyone to this thursday 's episode o...


Ok! The nlp_pipeline function takes a long time, since we're building a dataframe with multiple copies of the parsed data in it. Only the episode, speaker, and cleaned columns are ultimately necessary, but I like showing the process from raw text to a single string of language data.

## Build a dataframe of the counts for each word and speaker

In [48]:
#The words we want to search for in the text
words = [string.lower() for string in ['dungeon','dragon','dice','weapon','attack','oh no', 'awesome']]

#The people we want to include in the analysis
peeps = [string.upper() for string in ['Sam','Laura','Travis','Matt','Liam','Taliesin','Marisha', 'Ashley']]


#Use this to pickle the data - the dash code uses this file
#data[['episode','Speaker','cleaned']].to_pickle("../cleandata.pk1")

#Use this to filter out episodes
data_subset = data[data['episode'] >= 1][data['episode'] <= 115]

You only need to go to here and pickle the data if you use the Dash script to generate a python app.

In [49]:
#Merge all episodes into one string per speaker using a dictionary
fullmerge = {}
for peep in peeps:
    fullmerge.update({peep:' '.join(data_subset[data_subset.Speaker == peep]['cleaned'])})
finaldata = pd.DataFrame.from_dict(fullmerge, orient = 'index').reset_index().rename(columns = {'index':'speaker',0:'text'})
finaldata.head()

Unnamed: 0,speaker,text
0,SAM,go log on to geekandsundry.com slash find on t...
1,LAURA,everybody except for some people oh have alrea...
2,TRAVIS,right listen up if have ale then have a friend...
3,MATT,hello everyone name be matthew mercer voice ac...
4,LIAM,between 1:00 and 4:00 be when be do stuff be a...


In [50]:
#Create a dataframe and populate it with the numbers for each of the above words for each person
freq = pd.DataFrame()
for peep in peeps:
    for word in words:
        freq = freq.append(pd.Series([peep, word, finaldata[finaldata['speaker'] == peep].text.str.count(word).iloc[0]]), ignore_index = True)
        
freq = freq.rename(index=str, columns={0: "speaker", 1: "word",2: "amount"})

In [51]:
#Calculate the total number of words spoken by each speaker - estimated as the number of spaces.
totalwords = pd.DataFrame()
for peep in peeps:
    totalwords = totalwords.append(pd.Series([peep, ' ', finaldata[finaldata['speaker'] == peep].text.str.count(' ').iloc[0]]), ignore_index = True)

totalwords = totalwords.rename(index=str, columns={0: "speaker", 1: "word",2: "total"})

freq = pd.merge(freq, totalwords[['speaker','total']], on='speaker')

#Calculate the rate of times the word is said per 1000 words
freq['rate'] = (freq['amount']/freq['total'])*1000
totalwords = None

#sort by rate and speaker
freq = freq.sort_values(by=['rate','speaker'], ascending = [True,False])
freq.head()

Unnamed: 0,speaker,word,amount,total,rate
49,ASHLEY,dungeon,1.0,54439.0,0.018369
7,LAURA,dungeon,6.0,290054.0,0.020686
14,TRAVIS,dungeon,4.0,187851.0,0.021293
42,MARISHA,dungeon,13.0,252966.0,0.05139
26,MATT,oh no,117.0,1657724.0,0.070579


And now we have a dataframe that shows the frequency and rates of each word for each speaker. Lastly we will make the graph with plotly, since the dashboard package (Dash) uses plotly.

In [52]:
fig = ff.create_facet_grid(
    freq,
    x='rate',
    y='word',
    facet_col='speaker',
    color_name='speaker',
    trace_type='bar',
    orientation = 'h',
    ggplot2 = True
)

#Change the axes to match the minimum rate and 15% over the maximum rate for nice graphing
for i in range(len(peeps)+1):
    if i == 0:
        fig.layout.xaxis.update({'range': [freq['rate'].min(), (freq['rate'].max()+(.15 * freq['rate'].max()))]})
    else:
        exec('fig.layout.xaxis' + str(i)+".update({'range': [freq['rate'].min(), (freq['rate'].max()+(.15 * freq['rate'].max()))]})")

fig.layout.update(plot_bgcolor='rgba(230,230,230,90)')
py.iplot(fig, filename='critrole')

In [93]:
#Apply the tfidf function, making sure that at least 2 people haven't said the word befores
tfidf = TfidfVectorizer(stop_words='english', vocabulary = words)
tfs = tfidf.fit_transform(finaldata['text'])
matrix = pd.DataFrame(tfs.todense(), index = peeps, columns = tfidf.get_feature_names()).transpose()
matrix['word'] = matrix.index
matrix = pd.melt(matrix, id_vars = 'word')
freq = freq.rename(index=str, columns={'value': "tfidf",'variable': "speaker"})
distWord = matrix.loc[matrix['tfidf'].idxmax()]['word']
distSpeaker = matrix.loc[matrix['tfidf'].idxmax()]['word']

See github.com/tomoschuk for the accompanying Dash script that generates crtranscript.herokuapp.com!

next steps!