## Cleaning the data!
We start by loading our previously pickled file.

In [1]:
import pickle
import pandas as pd
pd.set_option('max_colwidth',100)
import re
import string
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [2]:
# Load the pickled DataFrame of transcripts and show metadata.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
# The context manager closes the file even if unpickling raises.
with open('data_files/comedian_data_df', 'rb') as infile:
    data = pickle.load(infile)
data.head()

Unnamed: 0,sketch_text,run_time,IMBD_rating,IMBD_ratings_number,year_released
Bo Burnham - Words Words Words,"(Cheers and applause)Thank you.(Laughter)When I say hey, you say ho.Hey.Ho!Hey.Ho!That’s basical...",63,8.2,2731,2010
Bo Burnham - What,Bo What? Old MacDonald had a farm E I E I O And on that farm he had a pig E I E I O Here a snort...,60,8.5,7555,2013
Bo Burnham - Make Happy,"[woman on TV] That has been, really, a difficult thing for me. My mother has always been a very ...",60,8.4,9087,2016
John Mulaney - New in Town,"[funky 90’s beat and cityscape pan][singing] ♬ New in town,[John Mulaney jumps out of apartment ...",60,8.3,5735,2012
John Mulaney - Comeback Kid,"Armed with boyish charm and a sharp wit, the former “SNL” writer John Mulaney offers sly takes o...",62,8.0,7352,2015


As previously mentioned, there are descriptions in brackets, punctuation and musical notes in some of the sketch transcripts. Therefore we will start by cleaning our text:

In [3]:
def clean_text_round(text, keep_brackets = True):
    '''Normalise a transcript into lowercase, space-separated ASCII words.

    Parameters
    ----------
    text : str
        Raw transcript text.
    keep_brackets : bool, default True
        When falsy, anything enclosed in [...] or (...) on a single line
        (e.g. stage directions such as "[laughter]") is dropped entirely.
        When truthy, the enclosed words are kept and only the bracket
        characters themselves are removed with the rest of the punctuation.

    Returns
    -------
    str
        Lowercase text with no punctuation, no non-ASCII characters and no
        tokens containing digits; words are separated by single spaces and
        there is no leading or trailing whitespace.
    '''
    text = text.lower()
    if not keep_brackets:
        # Substitute a space (not '') so "word[aside]word" does not fuse into one token.
        text = re.sub(r'\[.*?\]', ' ', text)
        text = re.sub(r'\(.*?\)', ' ', text)
    # Asterisks (censored profanity) and apostrophes are deleted WITHOUT a space
    # so the word stays in one piece: "don't" -> "dont".
    text = re.sub(r"[*']", '', text)
    # All other ASCII punctuation becomes a space so neighbouring words stay separate.
    text = re.sub('[%s]+' % re.escape(string.punctuation), ' ', text)
    # En dash, em dash and ellipsis are word separators too; turn them into spaces
    # before the ASCII filter below would silently delete them and glue words together.
    text = re.sub('[\u2013\u2014\u2026]', ' ', text)
    # Fold every kind of whitespace (newlines, tabs, non-breaking spaces) into a plain
    # space while the non-ASCII ones are still present.
    text = re.sub(r'\s+', ' ', text)
    # Drop whatever non-ASCII characters remain (curly quotes, musical notes, ...).
    text = text.encode('ascii', 'ignore').decode()
    # Remove numbers and any word containing a digit.
    text = re.sub(r'\w*\d\w*', '', text)
    # The removals above can leave runs of spaces; collapse them and trim the ends.
    text = re.sub(' +', ' ', text)
    return text.strip()

def round1(x):
    '''First cleaning pass: run clean_text_round with keep_brackets set to False.'''
    return clean_text_round(x, False)

In [4]:
# Run the first cleaning pass over every transcript and keep the result as a one-column frame
data_clean = data.sketch_text.apply(round1).to_frame()
# Preview the first 1000 characters of the cleaned transcripts in rows 1 and 4
for row in (1, 4):
    print(data_clean.iloc[row, 0][:1000])

bo what old macdonald had a farm e i e i o and on that farm he had a pig e i e i o here a snort there a old macdonald had a farm e i e i o this is bo burnham hes years old hes a male and he looks like the genetic product of a giraffe having sex with ellen degeneres he has a gigantic head and tiny nipples hes isolated himself over the last years in pursuit of comedy and in doing so has lost touch with reality youre an asshole bo you hear me you think you know better than me you think you know better than everybody you will die alone and you will deserve it but in the meantime you might as well tell those silly jokes of yours see if that helps you used to do comedy when you felt like being funny but now youre contractially obligated so dance you fucking monkey dance monkey dance welcome to the show this is bo this is his show and bo likes to dance like this welcome to the show this is bo this is his show and bo takes off his pants like this play an invisible drum play an invisible trumpe

You can see from these extracts that our cleaning was successful: there is no longer any punctuation, and the text is uniformly lower case with words separated by single spaces.

In [5]:
# Create the document term matrix from our data using the english stop words
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.sketch_text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data.index
# Transpose so that rows are terms and columns are shows
data_dtm = data_dtm.transpose()
data_dtm.head()

Unnamed: 0,Bo Burnham - Words Words Words,Bo Burnham - What,Bo Burnham - Make Happy,John Mulaney - New in Town,John Mulaney - Comeback Kid,John Mulaney - Kid Gorgeous,Ricky Gervais - Politics,Ricky Gervais - Science,Ricky Gervais - Humanity,Kevin Bridges - Story So Far,Kevin Bridges - The Story Continues,Kevin Bridges - Whole Different Story,Aziz Ansari - Intimate Moments Sensual Evening,Aziz Ansari - Madison Square Garden,Aziz Ansari - Right Now
aaaaahhhhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaahhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaand,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
aaand,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Map each show name to its 40 most frequent terms, stored as (word, count) tuples
top_dict = {}
for show, counts in data_dtm.items():
    ranked = counts.sort_values(ascending=False).head(40)
    top_dict[show] = list(zip(ranked.index, ranked.values))
print(top_dict['Bo Burnham - Words Words Words'])

[('like', 99), ('im', 83), ('funny', 58), ('cause', 50), ('yeah', 44), ('whats', 43), ('oh', 42), ('know', 38), ('dont', 37), ('just', 33), ('youre', 33), ('think', 28), ('said', 26), ('got', 23), ('bo', 23), ('people', 23), ('say', 21), ('little', 21), ('ill', 19), ('hate', 18), ('thank', 18), ('piece', 18), ('make', 17), ('love', 17), ('single', 16), ('artist', 15), ('did', 15), ('em', 15), ('want', 15), ('day', 14), ('called', 14), ('bit', 14), ('gay', 13), ('time', 13), ('right', 13), ('need', 12), ('look', 12), ('really', 12), ('women', 12), ('id', 12)]


These are the most common words used in Bo Burnham's 'Words Words Words' show. Note that the first few words don't seem to add much value to the content of the show. Let's see if this is a common theme across the other comedians.

Note: I chose the top 40 words through some trial and error; beyond 40, the words flagged for removal start to have more contextual meaning.

In [7]:
# Print the 14 most frequent words of every show.
# The loop variable is deliberately NOT called `top_words`: this notebook also uses that
# name for the flat word list behind the stop-word counts, and rebinding it here would
# corrupt that list whenever cells are re-run out of order.
for show, ranked in top_dict.items():
    print(show)
    print(', '.join(word for word, _ in ranked[:14]))
    print('---')

Bo Burnham - Words Words Words
like, im, funny, cause, yeah, whats, oh, know, dont, just, youre, think, said, got
---
Bo Burnham - What
know, like, love, think, im, just, bo, stuff, repeat, dont, yeah, want, right, cos
---
Bo Burnham - Make Happy
im, dont, know, like, just, right, think, want, youre, got, say, wouldnt, yeah, okay
---
John Mulaney - New in Town
like, im, know, just, dont, said, youre, people, new, thats, gonna, think, really, things
---
John Mulaney - Comeback Kid
like, know, just, dont, said, clinton, im, old, thats, right, youre, little, time, hey
---
John Mulaney - Kid Gorgeous
like, dont, im, said, just, know, people, going, youre, guy, horse, thats, say, wife
---
Ricky Gervais - Politics
just, went, hes, like, little, dont, got, thats, know, oh, think, im, youre, yeah
---
Ricky Gervais - Science
oh, like, just, right, im, dont, got, know, okay, going, thats, fat, theyre, fucking
---
Ricky Gervais - Humanity
right, like, just, im, dont, know, said, yeah, fucking, sa

It looks like there are a lot of common words, such as like, just, im, know and right. Therefore we'll remove any word that is among the top 40 words of more than 8 of the 15 shows (i.e. over half of them).

In [8]:
%%capture
# Add the top 40 words for each comedian into a list
top_words = []
for show in data_dtm.columns:
    top = [word for (word, count) in top_dict[show]]
    for stop_word in top:
        top_words.append(stop_word)
        
top_words

In [9]:
%%capture
# Count the number of appearances of each word in the list (count = the number of shows that have the word in their top 40)
Counter(top_words).most_common()

In [10]:
# Any word that ranks among the top words of more than 8 shows is treated as an extra stop word
show_counts = Counter(top_words)
add_stop_words = []
for word, n_shows in show_counts.most_common():
    if n_shows > 8:
        add_stop_words.append(word)
print(add_stop_words)

['like', 'im', 'know', 'dont', 'just', 'youre', 'people', 'right', 'thats', 'oh', 'think', 'got', 'yeah', 'said', 'say', 'little', 'did', 'time', 'cause', 'hes']


In [11]:
# Extend sklearn's English stop word list with the common words found above plus a few
# hand-picked extras, while keeping some built-in stop words that may be useful
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
extra_to_add = ['thing','gonna','really']
words_to_keep = {'bill', 'cry', 'together'}
# Set iteration order for strings changes between interpreter runs (hash randomisation),
# so sort the result: the list, and the pickle written from it, is then identical on
# every run. Building it through a set union also rules out duplicate entries.
my_stop_words = sorted((stop_words - words_to_keep) | set(extra_to_add))

In [12]:
# Create a new document term matrix from our data using our expanded stop word list
cv = CountVectorizer(stop_words=my_stop_words)
data_cv = cv.fit_transform(data_clean.sketch_text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data.index
# Transpose so that rows are terms and columns are shows
data_dtm = data_dtm.transpose()
data_dtm.head()

Unnamed: 0,Bo Burnham - Words Words Words,Bo Burnham - What,Bo Burnham - Make Happy,John Mulaney - New in Town,John Mulaney - Comeback Kid,John Mulaney - Kid Gorgeous,Ricky Gervais - Politics,Ricky Gervais - Science,Ricky Gervais - Humanity,Kevin Bridges - Story So Far,Kevin Bridges - The Story Continues,Kevin Bridges - Whole Different Story,Aziz Ansari - Intimate Moments Sensual Evening,Aziz Ansari - Madison Square Garden,Aziz Ansari - Right Now
aaaaahhhhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaahhhhh,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
aaaand,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
aaand,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# We can now look at our most common words again
top_dict = {}
for show in data_dtm.columns:
    top = data_dtm[show].sort_values(ascending=False).head(15)
    top_dict[show]= list(zip(top.index, top.values))

# The loop variable is deliberately NOT called `top_words`: that name holds the flat word
# list used to derive the extra stop words, and rebinding it here would make a later
# re-run of the stop-word cell silently produce an empty list.
for show, ranked in top_dict.items():
    print(show)
    print(', '.join(word for word, _ in ranked[:14]))
    print('---')

Bo Burnham - Words Words Words
funny, whats, bo, ill, piece, thank, hate, make, love, single, artist, em, want, bit
---
Bo Burnham - What
love, bo, stuff, repeat, want, fucking, cos, eye, prolonged, contact, um, good, sluts, man
---
Bo Burnham - Make Happy
want, wouldnt, okay, make, dick, white, fucking, love, half, fit, uh, handle, life, good
---
John Mulaney - New in Town
new, things, old, york, hey, years, day, mean, look, ill, man, make, ive, lot
---
John Mulaney - Comeback Kid
bill, clinton, old, hey, mom, day, way, cow, wife, went, real, dad, didnt, big
---
John Mulaney - Kid Gorgeous
going, guy, horse, years, wife, old, okay, year, money, theyre, college, hey, life, day
---
Ricky Gervais - Politics
went, going, want, used, come, thought, old, ill, fucking, egg, good, sort, id, wolf
---
Ricky Gervais - Science
okay, going, fat, theyre, fucking, god, sort, um, went, mean, bit, ive, want, fuck
---
Ricky Gervais - Humanity
fucking, went, id, didnt, joke, theyre, ive, going, years, d

These look a bit better, so we'll save our data again and begin exploring it.

In [14]:
# Replace the raw transcripts with the cleaned ones so the saved corpus carries
# the cleaned text alongside the other columns of `data`
data['sketch_text'] = data_clean['sketch_text']

# Pickle the corpus, the document term matrix and the stop word list.
# The context manager closes each file even if pickling raises part-way through.
for path, obj in (
    ('data_files/comedian_corpus', data),
    ('data_files/comedian_dtm', data_dtm),
    ('data_files/stop_words', my_stop_words),
):
    with open(path, 'wb') as outfile:
        pickle.dump(obj, outfile)