# Critical Role dashboard script

This is the accompanying script for crtranscriptexplorer.herokuapp.com. In it, I take a file a of text files (subtitles from Critical Role, a popular Dungeons and Dragons vodcast), clean and process the words, and output a graph showing how frequently each word or phrase is used. First we load all of the necessary packages and build an NLP pipeline.

## NLP Pipeline

In [98]:
import sys
import os
import io
import re
import praw
import spacy
import glob
import nltk
import pandas as pd
import numpy as np
import pickle
from nltk.corpus import stopwords
stop_words =  set(stopwords.words('english'))

def is_punct_space(token):
    return token.is_punct or token.is_space

nlp = spacy.load('en')
direc = '../FinalTextFiles/'
newStopWords = ['-PRON-']

import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

plotly.tools.set_credentials_file(username='???', api_key='???')

In [99]:
#Open and read the txt into dataframes
os.chdir(direc)
text = []
ffiles = [f for f in os.listdir(direc) if os.path.isfile(f)]
for f in ffiles:
  with io.open (f, "r", encoding = 'cp437') as myfile:
    text.append(myfile.read())

data = pd.DataFrame({'raw':text})

In [100]:
#This code is adapted from http://acsweb.ucsd.edu/~btomosch/ironfist.html
#This function cleans up the dataframe, removes weird characters and spaces
def cleaning_pipeline(data):
    data = data.reset_index(drop=True)
    #Turn line breaks into soaces
    data['raw'] = [r.replace('\n', ' ') for r in data['raw']]
    data['raw'] = [r.replace('ΓÇÖ', "'") for r in data['raw']]
    data['raw'] = data['raw'].astype(str)
    #Split files by searching for Names in all uppercase + :
    data2 = data.iloc[2:]
    data2['split'] = [re.split(r"\s(?=[A-Z]+:)",x) for x in data2.raw]
    #Add episodes
    data3 = pd.DataFrame(data2.split.values.tolist(), index= data2.index)
    data3['episode'] = data3.index - 1
    data4 = pd.melt(data3, id_vars=['episode'])
    data4['value'].str.split(':',1, expand = True)
    data5 = pd.concat([data4, data4['value'].str.split(':',1, expand = True)], axis=1)
    data5 = data5.dropna().rename(index=str, columns={0: "Speaker", 1: "Speech"})
    data5.drop(['variable', 'value'], axis=1, inplace = True)
    data,data2,data3,data4 = None,None,None,None
    return data5

#This parses all of the text and removes custom stop words
def nlp_pipeline(data):
    
    data['Speech'].str.replace(r"\(.*\)","")
    data['Speech'].str.replace(r"\[.*\]","")
    #Parse and lemmatize the data
    data['parsed'] = [ nlp(x) for x in data.Speech]
    data['lemmatized'] = [[token.lemma_ for token in x
                            if not is_punct_space(token)] 
                  for x in data.parsed]
    
    #Remove Stop words
    data['cleaned'] = [[term for term in x
                                if not term in newStopWords]
                               for x in data.lemmatized]
    #Collapse into one string rather than list
    data.cleaned = [' '.join(x) for x in data.cleaned]
    
    return data

In [101]:
data = cleaning_pipeline(data)
data = nlp_pipeline(data)
data.head()

Unnamed: 0,episode,Speaker,Speech,parsed,lemmatized,cleaned
0,1,MATT,"Hello everyone, and welcome to the second epi...","( , Hello, everyone, ,, and, welcome, to, the,...","[hello, everyone, and, welcome, to, the, secon...",hello everyone and welcome to the second episo...
1,2,MATT,"Hey, everyone. Sorry about that little issue ...","( , Hey, ,, everyone, ., Sorry, about, that, l...","[hey, everyone, sorry, about, that, little, is...",hey everyone sorry about that little issue the...
2,3,MATT,"Everyone, welcome to the new episode of Criti...","( , Everyone, ,, welcome, to, the, new, episod...","[everyone, welcome, to, the, new, episode, of,...",everyone welcome to the new episode of critica...
3,4,MATT,"Hello everyone, welcome to the fifth episode ...","( , Hello, everyone, ,, welcome, to, the, fift...","[hello, everyone, welcome, to, the, fifth, epi...",hello everyone welcome to the fifth episode of...
4,5,MATT,Hello everyone! Welcome to Critical Role toni...,"( , Hello, everyone, !, Welcome, to, Critical,...","[hello, everyone, welcome, to, critical, role,...",hello everyone welcome to critical role tonigh...


Ok! The above step takes a long time, since we're building a dataframe with multiple copies of the parsed data in it. Only the episode, speaker, and cleaned columns are ultimately necessary, but I like showing the process from raw text to a single string of language data.

## Build a dataframe of the counts for each word and speaker

In [102]:
#The words we want to search for in the text
words = ['elemental','uncanny dodge','bad news','rage']

#The people we want to include in the analysis
peeps = ['SAM','LAURA','TRAVIS','MATT','LIAM','TALIESIN','MARISHA', 'ASHLEY']


#Use this to pickle the data - the dash code uses this
#data[['episode','Speaker','cleaned']].to_pickle("../cleandata.pk1")

#Use this to filter out episodes
data_subset = data[data['episode'] >= 1][data['episode'] <= 118]

You only need to go to here and pickle the data if you use the Dash script to generate a python app.

In [103]:
#Merge all episodes into one string per speaker
fullmerge = {}
for peep in peeps:
    fullmerge.update({peep:' '.join(data_subset[data_subset.Speaker == peep]['cleaned'])})
finaldata = pd.DataFrame.from_dict(fullmerge, orient = 'index').reset_index().rename(columns = {'index':'speaker',0:'text'})
finaldata.head()

Unnamed: 0,speaker,text
0,SAM,go log on to geekandsundry.com slash find on t...
1,LAURA,cassandra what be do everybody except for some...
2,TRAVIS,right listen up if have ale then have a friend...
3,MATT,hello everyone and welcome to the second episo...
4,LIAM,between 1:00 and 4:00 be when be do stuff be a...


In [104]:
#Create a dataframe and populate it with the numbers for each of the above words for each person
freq = pd.DataFrame()
for peep in peeps:
    for word in words:
        freq = freq.append(pd.Series([peep, word, finaldata[finaldata['speaker'] == peep].text.str.count(word).iloc[0]]), ignore_index = True)
        
freq = freq.rename(index=str, columns={0: "speaker", 1: "word",2: "amount"})

In [111]:
#Calculate the total number of words spoken by each speaker - estimated as the number of spaces.
totalwords = pd.DataFrame()
for peep in peeps:
    totalwords = totalwords.append(pd.Series([peep, ' ', finaldata[finaldata['speaker'] == peep].text.str.count(' ').iloc[0]]), ignore_index = True)

totalwords = totalwords.rename(index=str, columns={0: "speaker", 1: "word",2: "total"})

freq = pd.merge(freq, totalwords[['speaker','total']], on='speaker')

#Calculate the rate of times the word is said per 1000 words
freq['rate'] = (freq['amount']/freq['total'])*1000
totalwords = None

#sort by rate and speaker
freq = freq.sort_values(by=['rate','speaker'], ascending = [False,True])
freq.head()

Unnamed: 0,speaker,word,amount,total_x,rate,total_y,total_x.1,total_y.1,total
3,TRAVIS,rage,209.0,188049.0,1.111412,188049.0,188049.0,188049.0,188049.0
19,MARISHA,elemental,267.0,253320.0,1.054003,253320.0,253320.0,253320.0,253320.0
7,TALIESIN,bad news,114.0,242837.0,0.469451,242837.0,242837.0,242837.0,242837.0
27,LIAM,uncanny dodge,75.0,228410.0,0.328357,228410.0,228410.0,228410.0,228410.0
23,MATT,elemental,435.0,1648647.0,0.263853,1648647.0,1648647.0,1648647.0,1648647.0


And now we have a dataframe that shows the frequency and rates of each word for each speaker. Lastly we will make the graph with plotly, since the dashboard package (Dash) uses plotly.

In [109]:
fig = ff.create_facet_grid(
    freq,
    x='rate',
    y='word',
    facet_col='speaker',
    color_name='speaker',
    trace_type='bar',
    orientation = 'h',
    ggplot2 = True
)

#Change the axes to match the minimum and 15% over the maximum rates for nice graphing
for i in range(len(peeps)+1):
    if i == 0:
        fig.layout.xaxis.update({'range': [freq['rate'].min(), (freq['rate'].max()+(.15 * freq['rate'].max()))]})
    else:
        exec('fig.layout.xaxis' + str(i)+".update({'range': [freq['rate'].min(), (freq['rate'].max()+(.15 * freq['rate'].max()))]})")

fig.layout.update(plot_bgcolor='rgba(230,230,230,90)')
py.iplot(fig, filename='critrole')

See github.com/tomoschuk for the accompanying Dash script that generates crtranscript.herokuapp.com!