# Creating a training data set for classification of political affiliation

This notebook collects transcripts from presidential candidate debates and labels them with the appropriate 'democrat' or 'republican' label.

In [142]:
import nltk, re, pprint

from urllib import request
from bs4 import BeautifulSoup                                                                                   # needed for parsing HTML

import contractions                                                                                             # contractions dictionary
from string import punctuation

import spacy                                                                                                    # used for lemmatization/stemming
#!python -m spacy download en_core_web_sm                # OR in Jupyter download in terminal using spacy download en_core_web_sm

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
tokenizer = ToktokTokenizer()                                                                                   # stopword removal
from nltk import word_tokenize

import pandas as pd
import numpy as np  
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


# Download NLTK resources if not already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Initialize NLTK's WordNet Lemmatizer once at module level; text_cleaner
# reuses this single instance for every row it processes.
# Requires the 'wordnet' corpus (see the commented-out nltk.download calls above).
lemmatizer = WordNetLemmatizer()

In [126]:
# Transcript pages (hosted on www.presidency.ucsb.edu) for the Democratic
# candidates debates of the 2020 election. Every row scraped from these
# pages is later labelled 'democrat'.

dem_urls = [
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-washington-dc',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-charleston-south-carolina-0',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-las-vegas-nevada-0',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-manchester-new-hampshire-0',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-des-moines-iowa-0',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-los-angeles-california',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-atlanta-georgia',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-westerville-ohio',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-houston-texas',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-detroit-michigan-group-2',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-detroit-michigan-group-1',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-miami-florida-group-2',
    'https://www.presidency.ucsb.edu/documents/democratic-candidates-debate-miami-florida-group-1'
]

# Transcript pages (same site) for the Republican candidates debates of the
# 2016 election. Every row scraped from these pages is later labelled
# 'republican'.

rep_urls = [
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-miami-florida',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-detroit-michigan',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-houston-texas',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-greenville-south-carolina',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-manchester-new-hampshire-0',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-des-moines-iowa-0',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-north-charleston-south-carolina',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-las-vegas-nevada-0',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-milwaukee-wisconsin',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-boulder-colorado',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-simi-valley-california-0',
    'https://www.presidency.ucsb.edu/documents/republican-candidates-debate-cleveland-ohio'
]

### Function for scraping from the urls and removing html

In [127]:
def scrape_and_format_text(url, timeout=30):
    """Download a web page and return its text content with the HTML removed.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait on the connection before giving up (default 30), so
        a stalled server cannot hang the notebook indefinitely.

    Returns
    -------
    str or None
        The page text with leading/trailing whitespace stripped, or None if
        fetching, decoding or parsing failed (the error is printed).
    """
    try:
        # The context manager closes the connection even if read/decode fails.
        with request.urlopen(url, timeout=timeout) as response:
            # 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark.
            raw = response.read().decode('utf-8-sig')
        # Parse the HTML and keep only the text nodes of the whole page
        # (narrow this to a specific element if the site structure is known).
        soup = BeautifulSoup(raw, 'html.parser')
        return soup.get_text().strip()
    except Exception as e:
        # Deliberately best-effort: report the failure and signal it with
        # None so the caller can skip this URL and carry on with the rest.
        print(f"Error fetching URL {url}: {e}")
        return None

### Function for getting rid of non-debate text and organizing into a dataframe

In [128]:
def process_transcript(transcript):
    """Split a debate transcript into one row per speaker turn.

    A line of the form ``NAME: text`` starts a new turn; following lines
    without a colon are appended to the current turn. Text that appears
    before the first ``NAME:`` line is discarded.

    Parameters
    ----------
    transcript : str or None
        Plain text of the transcript. None (what scrape_and_format_text
        returns on failure) and empty strings yield an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``Speaker`` and ``Speech``, one row per turn. The columns
        are present even when no turn was found.
    """
    columns = ["Speaker", "Speech"]

    # Nothing to parse: return an empty frame instead of failing on None.split.
    if not transcript:
        return pd.DataFrame(columns=columns)

    # Everything before the first colon is taken as the speaker label.
    # NOTE: this also matches ordinary sentences that contain a colon
    # ("Here's the thing: ..."), which then start a bogus turn.
    speaker_line = re.compile(r'^([^:]+):(.*)$')

    lines = [line.strip() for line in transcript.split('\n') if line.strip()]
    data = []
    current_speaker = None
    current_speech = []

    for line in lines:
        match = speaker_line.match(line)
        if match:
            # A new turn begins: flush the turn collected so far.
            if current_speaker:
                data.append({"Speaker": current_speaker, "Speech": " ".join(current_speech)})
            current_speaker, speech_part = match.groups()
            current_speech = [speech_part.strip()]
        elif current_speaker:
            # Continuation line of the current turn.
            current_speech.append(line)

    # Flush the final turn.
    if current_speaker:
        data.append({"Speaker": current_speaker, "Speech": " ".join(current_speech)})

    return pd.DataFrame(data, columns=columns)

In [129]:
# Scrape every Democratic debate page and collect one DataFrame of
# speaker turns per transcript.
documents = []

for url in dem_urls:
    transcript = scrape_and_format_text(url)
    # scrape_and_format_text returns None when the download fails; skip that
    # page instead of handing None to process_transcript.
    if transcript is None:
        continue

    # A dedicated name keeps the loop from overwriting the notebook's main `df`.
    debate_df = process_transcript(transcript)

    # Keep only transcripts that produced at least one speaker turn
    if not debate_df.empty:
        documents.append(debate_df)

# Concatenate the list of DataFrames into a single DataFrame
dem_df = pd.concat(documents, ignore_index=True)
dem_df['label'] = 'democrat'


In [130]:
# Scrape every Republican debate page and collect one DataFrame of
# speaker turns per transcript.
documents = []

for url in rep_urls:
    transcript = scrape_and_format_text(url)
    # scrape_and_format_text returns None when the download fails; skip that
    # page instead of handing None to process_transcript.
    if transcript is None:
        continue

    # A dedicated name keeps the loop from overwriting the notebook's main `df`.
    debate_df = process_transcript(transcript)

    # Keep only transcripts that produced at least one speaker turn
    if not debate_df.empty:
        documents.append(debate_df)

# Concatenate the list of DataFrames into a single DataFrame
rep_df = pd.concat(documents, ignore_index=True)
rep_df['label'] = 'republican'


In [131]:
# Combine the Democratic and Republican frames. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each party's 0-based labels would
# be repeated in the combined frame.
df = pd.concat([dem_df, rep_df], ignore_index=True)

### Removing unwanted sections from debates

In [132]:
# Drop rows that are page boilerplate rather than debate speech:
#   * the "MODERATORS", "PARTICIPANTS" and "NOTE" header rows
#   * rows whose text contains a web address ("www.")
#   * rows whose speaker label is not written in upper case
non_speaker_labels = ['MODERATORS', 'PARTICIPANTS', 'NOTE']
df = df[~df['Speaker'].isin(non_speaker_labels)]
# regex=False matches the literal text "www."; interpreted as a regular
# expression the "." would match any character (e.g. "awww,").
df = df[~df['Speech'].str.contains("www.", regex=False)]
df = df[df['Speaker'].str.isupper()]
df = df.reset_index(drop=True)

In [133]:
# Count the remaining rows (speaker turns) for each party label
df['label'].value_counts()

democrat      5651
republican    4756
Name: label, dtype: int64

## NLP Pre-processing

In [143]:
def text_cleaner(text, stop_words=None):
    """Normalise one speech into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase, collapse line breaks, remove digits, remove
    the curly apostrophe and all ASCII punctuation, tokenise, drop stop
    words, lemmatise with the module-level WordNet ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw speech text. Any non-string value (e.g. NaN) yields ''.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stop-word list, which is
        loaded on each call; pass a prebuilt set when cleaning many rows.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # Convert text to lowercase
    text = text.lower()
    # Collapse any run of carriage returns / line feeds into one newline
    text = re.sub(r'[\r\n]+', '\n', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # The curly apostrophe is not part of string.punctuation, so strip it separately
    text = text.replace('’', '')
    # Remove ASCII punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Punctuation is removed before the stop-word filter, so contractions
    # reach it without their apostrophe ("don't" -> "dont").
    words = [word for word in word_tokenize(text) if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


# Load the stop-word list once and share it across all rows, then apply
# text_cleaner to the 'Speech' column of the DataFrame.
english_stop_words = frozenset(stopwords.words('english'))
df['clean_speech'] = df['Speech'].apply(text_cleaner, stop_words=english_stop_words)

In [151]:
# Keep only rows whose cleaned speech is present and longer than 10
# characters. This also removes empty strings and the literal text 'none',
# since both are shorter than the threshold.
has_clean_speech = df['clean_speech'].notna()
is_long_enough = df['clean_speech'].str.len() > 10
df = df[has_clean_speech & is_long_enough]
# Renumber the surviving rows from 0
df = df.reset_index(drop=True)

In [153]:
# Display the cleaned DataFrame (the rendered output below is truncated to the first and last rows)
df

Unnamed: 0,Speaker,Speech,label,clean_speech
0,TAPPER,"Good evening from Washington, D.C. And welcome...",democrat,good evening washington dc welcome unique even...
1,BASH,We come together tonight at an extraordinary t...,democrat,come together tonight extraordinary time count...
2,CALDERON,The setting of this debate is also different. ...,democrat,setting debate also different reduce unnecessa...
3,TAPPER,"And all of this comes, of course, as four more...",democrat,come course four state florida arizona ohio il...
4,BIDEN,"Well, first of all, my heart goes out to those...",democrat,well first heart go already lost someone suffe...
...,...,...,...,...
9117,BUSH,Here's what I believe. I believe we're at the ...,republican,here believe believe verge greatest time alive...
9118,BAIER,"Mr. Trump, closing statement, sir.",republican,mr trump closing statement sir
9119,TRUMP,Our country is in serious trouble. We don't wi...,republican,country serious trouble dont win anymore dont ...
9120,BAIER,"Gentlemen, thank you.",republican,gentleman thank


In [156]:
# Keep only the two columns needed for training: the cleaned text and its party label
df_final = df.loc[:, ['clean_speech', 'label']]

In [162]:
# Export the labelled training set to 'debates.csv' in the working directory;
# index=False leaves the DataFrame's row index out of the file.
df_final.to_csv('debates.csv', index=False)