In [137]:
from bs4 import BeautifulSoup
import requests
import re
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
# `from nltk.tokenize import ...` does not bind the name `nltk` itself, so the
# package is imported here to keep this cell runnable on a fresh kernel.
import nltk

# Download the 'punkt' resource (presumably the tokenizer data used by
# word_tokenize -- confirm against the installed NLTK version).
nltk.download('punkt')

class WebScrapper:
    """Fetch a web page and extract a cleaned transcript from it.

    The page is requested once, in the constructor, and parsed with
    BeautifulSoup's ``html.parser``. The parsed tree is stored on
    ``self.soup`` and read by :meth:`scrape_all`.
    """

    # First-line speaker labels whose following line is kept as speech.
    # They are compared after <a> elements have been removed from the tree.
    _TARGET_SPEAKERS = ("Donald Trump: ()", "President Trump: ()")

    def __init__(self, url):
        """Download ``url`` and parse the response body.

        Args:
            url: Address of the page to scrape.
        """
        headers = requests.utils.default_headers()
        headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'})
        # Headers must be passed by keyword: the second positional argument of
        # requests.get is ``params`` (the query string), not ``headers``.
        req = requests.get(url, headers=headers)
        self.soup = BeautifulSoup(req.content, 'html.parser')

    def scrape_all(self, tag, tokenize=None):
        """Build one cleaned transcript from every ``tag`` element on the page.

        For each element, the text is split on newlines. If the first line is
        one of the target speaker labels, the second line is kept; otherwise
        the first line is kept. The kept pieces are joined with single spaces
        and passed through :meth:`_clean_transcript`.

        Note: this removes every ``<a>`` element from ``self.soup`` in place.

        Args:
            tag: Name of the HTML element to collect (e.g. ``'p'``).
            tokenize: When truthy, return a list of tokens instead of a string.

        Returns:
            The cleaned transcript as a ``str``, or the result of
            :meth:`tokenize` when ``tokenize`` is truthy.
        """
        # Drop all links so their text does not end up in the transcript.
        for anchor in self.soup('a'):
            anchor.extract()

        pieces = []
        for element in self.soup.find_all(tag):
            speech_parts = element.get_text().split("\n")
            first_line = speech_parts[0]

            if first_line in self._TARGET_SPEAKERS:
                # Label on the first line, speech on the second. A label with
                # nothing after it contributes no text instead of raising.
                if len(speech_parts) > 1:
                    pieces.append(speech_parts[1])
            else:
                # NOTE(review): for paragraphs that start with another
                # speaker's label this keeps the label line itself -- confirm
                # that is intended.
                pieces.append(first_line)

        # Join with spaces so the last word of one paragraph never fuses with
        # the first word of the next; extra spaces are collapsed by the cleaner.
        full_speech = self._clean_transcript(" ".join(pieces))
        if tokenize:
            return self.tokenize(full_speech)
        return full_speech

    def _clean_transcript(self, text):
        """Normalise raw transcript text.

        Removes ``[bracketed]`` spans, punctuation and digits, collapses runs
        of spaces, lowercases, and strips leading/trailing whitespace.

        Args:
            text: Raw transcript text.

        Returns:
            The normalised text.
        """
        # remove bracketed text
        text = re.sub(r'\[.*?\]', '', text)
        # remove non-word and non-space characters (i.e. punctuation)
        text = re.sub(r'[^\w\s]', '', text)
        # remove numbers
        text = ''.join(ch for ch in text if not ch.isdigit())
        # change multiple spaces back to single space
        text = re.sub(' +', ' ', text)
        # lowercase, then strip leading and trailing whitespace
        return text.lower().strip()

    def tokenize(self, content, amount_features=None, regex=r'\w+'):
        """Split ``content`` into tokens with NLTK's ``word_tokenize``.

        Args:
            content: Text to tokenize.
            amount_features: Currently unused; kept for interface compatibility.
            regex: Currently unused; kept for interface compatibility.

        Returns:
            The list of tokens produced by ``word_tokenize``.
        """
        return word_tokenize(content)

[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [149]:
def addToDataFrame(content, label=None):
    """Wrap a collection of transcripts in a labelled DataFrame.

    Args:
        content: Transcripts, one per row. May be empty.
        label: Value written to every row of the ``type`` column.

    Returns:
        A DataFrame with the columns ``transcript`` and ``type``.
    """
    frame = pd.DataFrame(content)
    if frame.shape == (0, 0):
        # Empty input produces a frame with no columns at all; renaming would
        # raise a length-mismatch error, so build the empty column explicitly.
        frame = pd.DataFrame(columns=['transcript'])
    else:
        frame.columns = ['transcript']
    frame['type'] = label
    print("Added to dataset!")
    return frame
          
def extract_urls_from(txt_file):
    """Yield the URLs listed one per line in ``txt_file``.

    Surrounding whitespace (including ``\\r\\n`` line endings) is stripped and
    blank lines are skipped. Each yielded URL is also printed as progress.

    Args:
        txt_file: Path of a text file containing one URL per line.

    Yields:
        Each non-empty URL, in file order.
    """
    # ``with`` closes the file even if the consumer stops iterating early or
    # raises while handling a URL.
    with open(txt_file, 'r') as url_file:
        for line in url_file:
            current_url = line.strip()
            if not current_url:
                continue
            print("Scrapping:", current_url)
            yield current_url


def _scrape_transcripts(url_file):
    """Scrape every URL listed in ``url_file`` into a list of cleaned transcripts.

    Each page's ``<p>`` elements are collected with
    ``WebScrapper.scrape_all`` and returned as untokenized strings.
    """
    return [
        WebScrapper(current_url).scrape_all('p', tokenize=False)
        for current_url in extract_urls_from(url_file)
    ]


RALLY_TRANSCRIPTS = _scrape_transcripts('rally_urls.txt')
UNION_TRANSCRIPTS = _scrape_transcripts('union.txt')

# Label rally transcripts 1 and union transcripts 0.
rally_df = addToDataFrame(RALLY_TRANSCRIPTS, label=1)
union_df = addToDataFrame(UNION_TRANSCRIPTS, label=0)

# ignore_index gives the combined frame a fresh 0..n-1 index; without it the
# row labels of both inputs are kept and repeat.
df = pd.concat([rally_df, union_df], ignore_index=True)

df.head()

Scrapping: https://www.rev.com/blog/transcripts/donald-trump-charlotte-north-carolina-rally-transcript-trump-holds-rally-before-super-tuesday
Scrapping: https://www.rev.com/blog/transcripts/donald-trump-charleston-south-carolina-rally-transcript-february-28-2020
Scrapping: https://www.rev.com/blog/transcripts/donald-trump-las-vegas-nevada-rally-transcript
Scrapping: https://www.rev.com/blog/transcripts/donald-trump-colorado-springs-co-rally-transcript
Scrapping: https://www.rev.com/blog/transcripts/donald-trump-phoenix-arizona-rally-transcript
Scrapping: https://www.rev.com/blog/transcripts/donald-trump-new-hampshire-rally-february-10-2020
Scrapping: https://www.rev.com/blog/transcripts/donal-trump-iowa-rally-transcript-trump-holds-rally-in-des-moines-iowa
Scrapping: https://www.rev.com/blog/transcripts/donald-trump-new-jersey-rally-speech-transcript-trump-holds-rally-in-wildwood-nj
Scrapping: https://www.rev.com/blog/transcripts/donald-trump-milwaukee-rally-transcript-trump-holds-rall

Unnamed: 0,transcript,type
0,mar donald trump held a rally on march in char...,1
1,feb donald trump held a rally in south carolin...,1
2,feb donald trump held a rally in las vegas nev...,1
3,feb donald trump held his second rally in two ...,1
4,feb donald trump held another campaign rally i...,1


In [144]:
# Display the combined DataFrame `df`.
# NOTE(review): this cell's execution count (144) is lower than that of the
# cell that assigns `df` (149), and the saved output below has no rows --
# re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,transcript,type
