In [7]:

#####################################                   Data Extraction                  ######################################

## Importing libraries
import os
import pickle
import re
import string

import pandas as pd
import requests as req
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import CountVectorizer


## Function for scraping the stand-up transcripts
def extract_transcript(url):
    '''Download a transcript page and return the text of its paragraphs.

    Parameters
    ----------
    url : str
        Address of the transcript page to download.

    Returns
    -------
    list of str
        Text of every <p> element found inside the element whose class is
        "post-content", in document order.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out or returns an HTTP error status.
    ValueError
        If the page contains no element with class "post-content".
    '''
    # A timeout keeps a stalled connection from hanging the whole notebook.
    response = req.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a transcript.
    response.raise_for_status()

    soup = bs(response.text, "lxml")
    content = soup.find(class_="post-content")
    if content is None:
        raise ValueError("No element with class 'post-content' found at " + url)

    text = [p.text for p in content.find_all('p')]
    # Progress indicator: one line per successfully scraped page.
    print(url)
    return text


## Extracting the transcripts from scrapsfromtheloft.com
urls = [
    'https://scrapsfromtheloft.com/2019/07/10/aziz-ansari-right-now-transcript/',
    'https://scrapsfromtheloft.com/2018/12/15/vir-das-losing-it-transcript/',
    'https://scrapsfromtheloft.com/2020/05/01/daniel-sloss-x-transcript/',
    'https://scrapsfromtheloft.com/2019/11/30/mike-birbiglia-the-new-one-transcript/',
    'https://scrapsfromtheloft.com/2019/01/29/sebastian-maniscalco-stay-hungry-transcript/',
    'https://scrapsfromtheloft.com/2019/05/22/wanda-sykes-not-normal-transcript/',
    'https://scrapsfromtheloft.com/2019/01/30/gabriel-fluffy-iglesias-one-show-fits-all-transcript/',
    'https://scrapsfromtheloft.com/2019/11/08/seth-meyers-lobby-baby-transcript/',
    'https://scrapsfromtheloft.com/2018/10/26/adam-sandler-100-fresh-transcript/',
    'https://scrapsfromtheloft.com/2018/05/15/ali-wong-hard-knock-wife-full-transcript/',
    'https://scrapsfromtheloft.com/2018/05/05/john-mulaney-kid-gorgeous-at-radio-city-full-transcript/',
    'https://scrapsfromtheloft.com/2018/07/21/hannah-gadsby-nanette-transcript/',
    'https://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
]

# Same order as `urls`: the i-th name labels the transcript scraped from the i-th URL.
comedian_names = [
    'Aziz Ansari',
    'Vir Das',
    'Daniel Sloss',
    'Mike Birbiglia',
    'Sebastian Maniscalco',
    'Wanda Sykes',
    'Gabriel “Fluffy” Iglesias',
    'Seth Meyers',
    'Adam Sandler',
    'Ali Wong',
    'John Mulaney',
    'Hannah Gadsby',
    'Hasan Minhaj',
]

# The two lists are matched purely by position, so catch an added/removed entry
# here rather than ending up with mislabelled or missing transcripts later.
assert len(urls) == len(comedian_names), "urls and comedian_names must have the same length"

transcripts = [extract_transcript(url) for url in urls]


## Storing the transcripts on local machines in different text files
# open() does not create missing folders, so make sure the output folder exists;
# without this the loop fails with FileNotFoundError on a fresh checkout.
os.makedirs("stand-up transcripts", exist_ok=True)

# NOTE: despite the .txt extension each file holds a pickled list of paragraphs,
# not plain text; it has to be read back with pickle.load.
for i, cname in enumerate(comedian_names):
    with open("stand-up transcripts/" + cname + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

https://scrapsfromtheloft.com/2019/07/10/aziz-ansari-right-now-transcript/
https://scrapsfromtheloft.com/2018/12/15/vir-das-losing-it-transcript/
https://scrapsfromtheloft.com/2020/05/01/daniel-sloss-x-transcript/
https://scrapsfromtheloft.com/2019/11/30/mike-birbiglia-the-new-one-transcript/
https://scrapsfromtheloft.com/2019/01/29/sebastian-maniscalco-stay-hungry-transcript/
https://scrapsfromtheloft.com/2019/05/22/wanda-sykes-not-normal-transcript/
https://scrapsfromtheloft.com/2019/01/30/gabriel-fluffy-iglesias-one-show-fits-all-transcript/
https://scrapsfromtheloft.com/2019/11/08/seth-meyers-lobby-baby-transcript/
https://scrapsfromtheloft.com/2018/10/26/adam-sandler-100-fresh-transcript/
https://scrapsfromtheloft.com/2018/05/15/ali-wong-hard-knock-wife-full-transcript/
https://scrapsfromtheloft.com/2018/05/05/john-mulaney-kid-gorgeous-at-radio-city-full-transcript/
https://scrapsfromtheloft.com/2018/07/21/hannah-gadsby-nanette-transcript/
https://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/

In [8]:

#####################################                 Data Pre-processing               ######################################

## Retrieving data from the local text files
# Maps comedian name -> list of paragraph strings read back from disk.
data = {}
for cname in comedian_names:
    # The files are pickles (despite the .txt extension). pickle.load can execute
    # arbitrary code, so only load files written by this notebook.
    with open("stand-up transcripts/" + cname + ".txt", "rb") as file:
        data[cname] = pickle.load(file)
        
        
## Formatting the transcript in a single string
def combine_text(list_of_text):
    '''Join a list of text fragments into one string, separated by single spaces.'''
    return ' '.join(list_of_text)

# One single-element list per comedian, holding the whole transcript as one string
data_combined = {
    cname: [combine_text(paragraphs)]
    for cname, paragraphs in data.items()
}


## Storing the data in a data-frame (one row per comedian, sorted by name)
pd.set_option('max_colwidth', 150)
data_df = pd.DataFrame.from_dict(
    data_combined, orient='index', columns=['Transcript']
).sort_index()


## Indicator to see if things are working as expected
data_df

Unnamed: 0,Transcript
Adam Sandler,"[man] Okay, ready, and… Take your own cue, Adam. And action, Dan! [piano plays] ♪ It was the perfect day ♪\n♪ You had the sweetest smile ♪\n♪ You ..."
Ali Wong,"Ladies and gentlemen, please welcome to the stage Ali Wong! ♪ What y’all thought Y’all wasn’t gon’ see me? ♪\n♪ I’m the Osirus of this shit♪\n♪ Wu..."
Aziz Ansari,"♪ Sometimes I feel so happy ♪\n♪ Sometimes I feel so sad ♪\n♪ Sometimes I feel so happy ♪\n♪ But mostly you just make me mad ♪\n♪ Baby, you just m..."
Daniel Sloss,A man offered us a billion dollars. – Bolt the doors. – We could all go to prison. Enjoy the injunction. Did you just try to punch the wall and mi...
Gabriel “Fluffy” Iglesias,"[crowd chanting] Fluffy! Fluffy! [Fluffy shouts] Houston! [crowd cheering] [audience] Five, four, three, two, one! [cheering]\n♪ Macho, macho man ..."
Hannah Gadsby,[“Bobby Reid” plays]\n♪ There’s blood in the water ♪\n♪ Won’t you cut me down? ♪\n♪ ‘Cause people keep on calling ♪\n♪ Won’t you cut me down? ♪\n♪...
Hasan Minhaj,"[theme music: orchestral hip-hop] [crowd roars] What’s up? Davis, what’s up? I’m home. I had to bring it back here. Netflix said, “Where do you wa..."
John Mulaney,[organ music playing] Welcome to Radio City Music Hall. It’s time. Any questions? No. Walk with me. [eerie organ music playing] [mechanical whirri...
Mike Birbiglia,"♪ Hey! I wanna get better ♪\n♪ I didn’t know I was lonely Till I saw your face ♪\n♪ I wanna get better ♪\n♪ Better, better, better I wanna get bet..."
Sebastian Maniscalco,"[instrumental music plays] ♪ All right ♪ [screeching] [cheering and applause] Beautiful New York City. [cheering and applause continues] Now, I’ve..."


In [9]:

#####################################                    Data Cleaning                   ######################################

## Function for round-1 of data-cleaning
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters
    ----------
    text : str
        Raw transcript text.

    Returns
    -------
    str
        The cleaned text. Removed pieces are deleted, not replaced by a space,
        so surrounding whitespace is left as it was.
    '''
    text = text.lower()
    # Raw strings: '\[' , '\w' and '\d' are invalid escapes in a normal string
    # literal and trigger a SyntaxWarning on recent Python versions.
    # Non-greedy match so each [stage direction] is removed on its own.
    text = re.sub(r'\[.*?\]', '', text)
    # Only ASCII punctuation is covered here (string.punctuation).
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Drop every token that contains at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    return text

## Round-1 of data-cleaning
def round1(x):
    '''Apply the first cleaning pass to a single transcript.'''
    return clean_text_round1(x)


data_clean = data_df.Transcript.apply(round1).to_frame()


## Function for round-2 of data-cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and nonsensical text that was missed the first time around.

    Parameters
    ----------
    text : str
        Text that has already been through the first cleaning pass.

    Returns
    -------
    str
        Text containing only ASCII letters and spaces.
    '''
    # Turn newlines (and any other whitespace) into spaces first. The letter
    # filter below deletes every non-space character, so leaving line breaks
    # in place would glue the last word of one line to the first of the next.
    text = re.sub(r'\s', ' ', text)
    # Curly quotes and ellipsis are not part of string.punctuation.
    text = re.sub('[‘’“”…]', '', text)
    # Drop everything that is not an ASCII letter or a space (e.g. the music notes).
    text = re.sub('[^a-zA-Z ]', '', text)
    return text

## Round-2 of data-cleaning
def round2(x):
    '''Apply the second cleaning pass to a single transcript.'''
    return clean_text_round2(x)


data_clean = data_clean.Transcript.apply(round2).to_frame()


## Removing common English stop-words
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.Transcript)


## Storing data in a data-frame
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it is available and fall back to the old name on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Document-term matrix: one row per comedian, one column per vocabulary word.
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index

# Take the names from the frame itself (its index was sorted when data_df was
# built) so the 'Full name' column always lines up with the rows, instead of
# relying on a second hand-sorted copy of the list.
comedian_names = list(data_df.index)

## Storing data-objects on the local machine for future use in other files
data_df['Full name'] = comedian_names
data_df.to_pickle("data_in_df.pkl")
data_dtm.to_pickle("data_in_dtm.pkl")
data_clean.to_pickle("data_clean.pkl")
# Context manager so the file is flushed and closed even if pickling fails.
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)


## Indicator to see if things are working as expected
data_dtm

Unnamed: 0,aah,aaliyah,abandoned,abdomen,abducted,abduction,abduljabbar,abhorrent,ability,abki,...,zip,ziploc,zipper,zippo,zips,zombies,zone,zoo,zoom,zyrtec
Adam Sandler,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,2,0,0
Ali Wong,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Aziz Ansari,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Daniel Sloss,3,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
Gabriel “Fluffy” Iglesias,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hannah Gadsby,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Hasan Minhaj,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
John Mulaney,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,1
Mike Birbiglia,0,0,0,1,0,0,0,0,0,0,...,1,0,1,0,1,2,0,0,2,0
Sebastian Maniscalco,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
