# Determining Queer-Affirming vs non-Affirming Churches: A Web Scraping and Text Analysis Project

**Project Description**:  
The goal of this project is to develop a comprehensive database of churches in Australia, where each church is flagged as being queer-affirming or not. This will provide more clarity to queer individuals looking for places of worshop where they will accepted and treated equally. 

**Goals**:  
1. Extract relevant textual information from church websites.  
2. Clean and preprocess the text data for analysis.  
3. Identify patterns, keywords, and indicators of queer-affirming vs non-affirming language.  
4. Flag each church as 'affirming", "non-affirming", or "unknown". 

**Author**:  
Travis Rutledge, travisrutledge@gmail.com

**Last Updated**:  
03/02/2025

**Notebook Outline**:  
1. Collecting a List of Australian Churches
2. URL Text Scraping
2. Data Cleaning  
3. Text Preprocessing  
4. Sentiment Analysis  
5. Results

# Libraries

In [7]:
import os # Allows to intract with operating system, such as environment variables and system commands
import requests # Allows for http requests
from bs4 import BeautifulSoup # HTML web scraping tool
from urllib.parse import urljoin, urlparse # Manipulates URLs for better web scraping 
import pandas as pd # Data cleaning and transformation
import nltk
#nltk.download('all')
from nltk.tokenize import word_tokenize # Separates text into individual words 
from nltk.corpus import stopwords # Filters out common words like 'the' and 'is' 
from nltk.util import ngrams # Creates bigrams, which are used for sentimental analysis
from collections import Counter # Counts the number of unique words in each data set
from textblob import TextBlob # Sentiment analysis 
import matplotlib.pyplot as plt # data exploration and visualisation

# 1. Collecting a list of Australian Churches
The first step is to gather a list of Australian churches, which includes their names, URL, address, and coordinates. This is done by using the Google Places API, which allows for a 1,000 search requests per month. Each request can return up to 20 places. Each query has a maximum squre range of 50km, so 50km coordinate squares will need to be determined in order to facilitate a comprehensive search of churches across Australia. 

## Search Parameters

In [8]:
# API 
API_KEY = os.getenv("GOOGLE_PLACE_API_KEY")

# Search parameters, adjust as needed
locations = [
# Victoria
#"-37.840935,144.946457",  # Melbourne
#"-36.616619,143.260361",  # St Arnaud
#"-37.292870,144.951263",  # Kilmore
#"-37.987461,145.214859",  # Dandenong
#"-36.124428,146.876389",  # Wodonga
#"-37.966667,144.133331",  # Lethbridge
#"-37.823002,144.998001",  # Richmond
#"-37.636604,144.806427",  # Bulla
#"-36.948254,145.104477",  # Northwood
#"-37.020100,145.131454",  # Seymour
# New South Wales
#"-33.865143,151.209900",  # Sydney
#"-30.640720,151.500702",  # Uralla
#"-34.033749,151.071198",  # Kirrawee
#"-30.452242,152.897964",  # Bellingen
#"-32.569473,151.178818",  # Singleton
#"-33.968109,151.104080",  # Hurstville
#"-30.614943,152.852127",  # Bowraville
#"-34.673820,150.844376",  # Kiama
#"-32.023331,151.958755",  # Gloucester
#"-32.897633,151.736984",  # Mayfield
#"-28.175995,153.541672",  # Tweed Heads
#"-33.084999,151.634995",  # Swansea
#"-34.583332,150.866669",  # Shellharbour
#"-32.916668,151.750000",  # Newcastle
#"-33.794498,150.976501",  # Constitution Hill
#"-33.932999,151.259003",  # South Coogee
#"-33.485867,149.667435",  # Brewongle
#"-34.443371,150.061356",  # Bermagui
#"-34.033749,151.071198",  # Kirrawee
# Queensland
"-27.4678,153.0281",      # Brisbane
"-19.007626,146.189194",  # Paluma
"-25.898890,139.351669",  # Birdsville
"-23.439493,144.251389",  # Longreach
"-19.568516,147.406387",  # Ayr
"-27.066668,152.966660",  # Caboolture
"-20.267500,148.716949",  # Airlie Beach
"-27.616667,152.850006",  # Collingwood Park
"-27.454914,153.007126",  # Red Hill
"-26.798412,153.132965",  # Caloundra
"-27.395847,152.937881",  # Ferny Hills
"-27.585613,152.983658",  # Durack
"-27.302221,152.988815",  # Strathpine
"-28.000767,153.429642",  # Surfers Paradise
]
radius = 25000  # Search radius in meters
type = ["church", "place_of_worship"] # Type of place to search for

## Search Execution

In [18]:
# API URLs
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
details_base_url = "https://maps.googleapis.com/maps/api/place/details/json"

def get_places_data(api_key, location, radius, type):
    all_results = []  # To store all results across pages
    params = {
        "key": api_key,
        "location": location,
        "radius": radius,
        "type": type
    }
    
    while True:
        # Make the API request
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
        
        # Parse the response
        data = response.json()
        print(f"API Response: {data}")  # Print full response to debug
        all_results.extend(data.get("results", []))  # Add results from the current page
        
        # Check if there is a next page
        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break  # No more pages, exit loop
        
        # Wait a few seconds before using the next_page_token to avoid request denial
        import time
        time.sleep(5)  # Google requires a short delay before using the next page token
        
        print(f"Fetched {len(data.get('results', []))} results")
        print(f"Next Page Token: {next_page_token}")

        # Update params with the next_page_token
        params.update({"pagetoken": next_page_token})
    
        if "status" in data and data["status"] == "OVER_QUERY_LIMIT":
            print("Google is rate-limiting you. Try again later.")
            break  # Stop fetching if limit is reached

    return all_results


# Function to get website URLs from Place Details API
def get_place_details(api_key, place_id):
    params = {
        "place_id": place_id,
        "fields": "website",  # Request only the website field
        "key": api_key
    }
    response = requests.get(details_base_url, params=params)
    if response.status_code == 200:
        details_data = response.json()
        return details_data.get("result", {}).get("website", "N/A")
    else:
        print(f"Error fetching details for place_id {place_id}: {response.status_code}")
        return "N/A"

# Function to extract URLs
def extract_website_urls(data, api_key):
    websites = []
    for result in data:
        place_id = result.get("place_id")
        website = get_place_details(api_key, place_id)
        if website and website != "N/A":
            websites.append(website)
    return websites

# Main script to collect the list of church websites
if __name__ == "__main__":
    all_websites = []  # To store all websites from multiple locations

    # Loop through locations
    for location in locations:
        print(f"Fetching data for location: {location}")
        # Fetch data from Google Places API with pagination
        places_data = get_places_data(API_KEY, location, radius, type)
        
        if places_data:
            # Extract website URLs
            website_urls = extract_website_urls(places_data, API_KEY)
            all_websites.extend(website_urls)

    # Remove duplicates by converting to a set and back to a list
    all_websites = list(set(all_websites))
    
    # Save to a text file or use it for further steps
    with open("church_websites.txt", "w") as f:
        for url in all_websites:
            f.write(url + "\n")
    
    print(f"Website URLs saved to 'church_websites.txt'")

Fetching data for location: -27.4678,153.0281
API Response: {'html_attributions': [], 'next_page_token': 'AVzFdbk3XzY-C7E10UXAgV601-ZkvcLMzgjS8SDg9c4rKcqQ4hHaV1ijoiAACKFXh09bMeywUsr47grUOK1v_uNneH1QDSajAZxsK1pKindaTW1lUVeoV0-uOxAysedod5YvjJxVX-9Oq5pU6jahCV9bl1YZ3YHrKQG-aM1un7BSOIYc_AIX0KsrRj-HRaJwf15F4xuOciRM0E1SkCJAhf05YbeNFv0P6S8oJDwRgZY30pOtynx2ATS_tP9hXZiiHrS3poZiYVLQ0XJ1zYEA9h_Ec8W1iRCQ2M0UT3L5IFDnzXFYAeQy_BO_mCwLKsm4TMf8orZA1VaBLO9gJsB-oJcxMxM8BuUHOImSk5vrUfU3ZLMa0GuFOSEGkxv3okzxD2-l_IYKIxrliN82T5B2gee2qYBpZVDW_miUdV30WjIkcWiFhXmMwYOusMOqei__HA', 'results': [{'business_status': 'OPERATIONAL', 'formatted_address': '346 George St, Brisbane City QLD 4000', 'geometry': {'location': {'lat': -27.4691565, 'lng': 153.0216518}, 'viewport': {'northeast': {'lat': -27.46778027010727, 'lng': 153.0230370298927}, 'southwest': {'lat': -27.47047992989272, 'lng': 153.0203373701073}}}, 'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/worship_general-71.png', 'icon_background_col

# 2. URL Text Scraping
Using the chuch URLs gathered in the previous step, data is scraped from each church's website. Data is broken down by website, webpage, and text content. Because each website can have numerous pages, the scraping tool only scrapes websites that are found in the top navigation of each HTML website.

To Do:
    Another web scraping tool such as Selinium or something else will need to be used to scrape from javascript websites. The current setup only scrapes HTML websites. 

## Playwright Scraping

## Beautiful Soup HMTL Scraping

In [10]:
# 2. URL Text Scraping
# Load the list of church websites generated in Section 1
websites = []

# Read the URLs from the file created in Section 1
with open("church_websites.txt", "r") as f:
    websites = [line.strip() for line in f.readlines()]

# Define scraping parameters
MAX_DEPTH = 1  # Maximum depth of scraping
SKIP_PATTERNS = ["tribe_events", "eventDisplay", "ical", "page", "eventDate"]  # Patterns to skip

# Normalize URL function
from urllib.parse import urlparse, urlunparse, urljoin

def normalize_url(url):
    parsed = urlparse(url)
    normalized = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
    return normalized

# Function to validate URLs
def is_valid_url(url):
    try:
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)
    except:
        return False

def scrape_stories(soup):
    # Check for story-like structure
    story_container = soup.find('div', class_='masonry-grid')  # Adjust if needed
    if not story_container:
        return None  # No stories found

    # Extract stories
    stories = story_container.find_all('div', class_='masonry-grid-item')
    story_list = []
    for story in stories:
        title = story.find('b').get_text(strip=True) if story.find('b') else 'No Title'
        description = story.find('i').get_text(strip=True) if story.find('i') else 'No Description'
        link = story.find('a')['href'] if story.find('a') else 'No Link'
        story_list.append({
            'title': title,
            'description': description,
            'link': link
        })
    return story_list

def scrape_default(soup, current_url):
    # Extract main content
    content_div = soup.find("div", {"class": "main-content"})
    if not content_div:
        content_div = soup.find("body")  # Fallback to entire body
    page_text = content_div.get_text(separator=" ", strip=True) if content_div else ""

    # Clean up text
    page_text = page_text.replace("Read More", "").strip()
    return {"URL": current_url, "Text": page_text}

def scrape_website(start_url, max_depth=MAX_DEPTH):
    visited = set()  # Keep track of visited URLs
    to_visit = [(start_url, 0)]  # Start with the homepage and depth level 0
    data = []  # To store scraped data

    while to_visit:
        current_url, depth = to_visit.pop(0)

        # Skip if the maximum depth is exceeded
        if depth > max_depth:
            continue

        # Skip if already visited
        normalized_url = normalize_url(current_url)
        if normalized_url in visited:
            continue

        print(f"Visiting: {current_url}")
        try:
            # Fetch the URL
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()

            # Check the content type
            content_type = response.headers.get("Content-Type", "")
            if "text/html" not in content_type:
                print(f"Skipping non-HTML content: {current_url}")
                continue

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch {current_url}: {e}")
            continue

        # Mark as visited
        visited.add(normalized_url)

        # Parse HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # Try scraping stories
        stories = scrape_stories(soup)
        if stories:
            print(f"Stories found on {current_url}!")
            for story in stories:
                data.append({"Website": start_url, "URL": current_url, **story})
            continue  # Skip link traversal for story pages

        # Fallback to default scraping logic
        page_data = scrape_default(soup, current_url)
        data.append({"Website": start_url, **page_data})

        # Extract and filter links only from navigation elements
        nav_links = soup.find_all(['nav', 'ul'])
        links = [urljoin(current_url, a['href']) for nav in nav_links for a in nav.find_all('a', href=True)]

        for link in links:
            normalized_link = normalize_url(link)
            if start_url in normalized_link and normalized_link not in visited:
                to_visit.append((link, depth + 1))  # Add the next depth level

    return data

# Scrape each website
if __name__ == "__main__":
    all_data = []  # To store all scraped data

    total_websites = len(websites)  # Get the total number of websites
    for idx, start_url in enumerate(websites, start=1):  # Use enumerate to track the index
        print(f"Scraping {idx} of {total_websites} websites: {start_url}")
        scraped_data = scrape_website(start_url)
        all_data.extend(scraped_data)

    # Convert the scraped data to a pandas DataFrame
    df = pd.DataFrame(all_data)

    # Validate URLs
    df["URL"] = df["URL"].apply(lambda x: x if is_valid_url(x) else None)

    # Drop rows where URL is invalid
    df = df.dropna(subset=["URL"])
    print("Invalid URLs have been removed from the dataset.")

    # Filter out rows with empty text
    df_cleaned = df[df["Text"].notnull() & (df["Text"].str.strip() != "")]
    print("Cleaned DataFrame:")
    print(df_cleaned.head())

    # Save cleaned data to CSV
    df_cleaned.to_csv("scraped_data.csv", index=False, encoding="utf-8")
    print("Scraped data saved to 'scraped_data.csv'")

Scraping 1 of 309 websites: https://local.churchofjesuschrist.org/en/au/qld/mudgeeraba/74-mudgeeraba-road?utm_source=gmb&utm_medium=yext&y_source=1_MzAwNjI0OTgtNzE1LWxvY2F0aW9uLndlYnNpdGU%3D
Visiting: https://local.churchofjesuschrist.org/en/au/qld/mudgeeraba/74-mudgeeraba-road?utm_source=gmb&utm_medium=yext&y_source=1_MzAwNjI0OTgtNzE1LWxvY2F0aW9uLndlYnNpdGU%3D
Scraping 2 of 309 websites: http://www.hp.gracebible.org.au/
Visiting: http://www.hp.gracebible.org.au/
Scraping 3 of 309 websites: https://newlifechurch.com.au/
Visiting: https://newlifechurch.com.au/
Scraping 4 of 309 websites: http://www.mychurchbrisbane.com/
Visiting: http://www.mychurchbrisbane.com/
Failed to fetch http://www.mychurchbrisbane.com/: 406 Client Error: Not Acceptable for url: http://www.mychurchbrisbane.com/
Scraping 5 of 309 websites: http://www.allsaintsbrisbane.com/
Visiting: http://www.allsaintsbrisbane.com/
Visiting: http://www.allsaintsbrisbane.com/services
Visiting: http://www.allsaintsbrisbane.com/peop

# 3. Data Cleaning
This step focuses on deleting irrelevent data and correcting any inconsistencies or special characters. This code chunk lists the number of duplicate rows before and after removal, and it provides a .csv file of the cleaned data at the end

In [11]:
# Remove rows with missing or empty text
df_cleaned = df[df["Text"].notnull() & (df["Text"].str.strip() != "")]

# Remove duplicates
# Check for duplicate rows based on all columns
print(f"Number of duplicate rows before removal: {df.duplicated().sum()}")

# Drop duplicates
df_cleaned = df.drop_duplicates()

# Confirm duplicates were removed
print(f"Number of duplicate rows after removal: {df_cleaned.duplicated().sum()}")

# Handle encoding issues
# Define a function to clean special characters and encoding issues
def clean_encoding_issues(text):
    if isinstance(text, str):
        # Replace or remove problematic characters
        text = text.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
        # Optionally: replace specific problematic sequences
        text = text.replace("â€™", "'")  # Replace right single quotation mark
        text = text.replace("â€œ", '"')  # Replace left double quotation mark
        text = text.replace("â€", '"')  # Replace right double quotation mark
        text = text.replace("â€“", "-")  # Replace en dash
        text = text.replace("â€¦", "...")  # Replace ellipsis
        text = text.replace("â€‹", "")  # Remove zero-width space
    return text

# Apply the cleaning function to the "Text" column
df_cleaned["Text"] = df_cleaned["Text"].apply(clean_encoding_issues)

# Save cleaned data to CSV
df_cleaned.to_csv("cleaned_data.csv", index=False, encoding="utf-8")

print("Data cleaning complete. Cleaned data saved to 'cleaned_data.csv'.")

Number of duplicate rows before removal: 0
Number of duplicate rows after removal: 0
Data cleaning complete. Cleaned data saved to 'cleaned_data.csv'.


## 3a. Cleaning, Tokenising, and Bigramming
This section sets all the text to lower case, tokenizes the text (breaking it into individual words), removes extranous words like "this" and "is", removes anything that is not a letter or a number, and generates bigrams for each entry

In [12]:
# Ensure all entries in the "Text" column are strings
df_cleaned["Text"] = df_cleaned["Text"].fillna("").astype(str)

# Lowercase the text
df_cleaned["Text"] = df_cleaned["Text"].str.lower()

# Tokenize the text (breaks the text into individual words)
df_cleaned["Tokens"] = df_cleaned["Text"].apply(word_tokenize)

# Remove stopwords (like 'the' and 'is')
stop_words = set(stopwords.words("english"))
df_cleaned["Tokens"] = df_cleaned["Tokens"].apply(
    lambda tokens: [word for word in tokens if word not in stop_words]
)

# Remove anything that is not a letter or number, like punctuation and special characters
df_cleaned["Tokens"] = df_cleaned["Tokens"].apply(
    lambda tokens: [word for word in tokens if word.isalnum()]
)

# Remove illegal characters using pandas' built-in string functions
df_cleaned = df_cleaned.map(lambda x: ''.join(filter(lambda y: y.isprintable(), str(x))) if isinstance(x, str) else x)

# Generate bigrams for each text entry
df_cleaned["Bigrams"] = df_cleaned["Tokens"].apply(
    lambda tokens: list(ngrams(tokens, 2))
)

# Generate trigrams for each text entry
df_cleaned["Trigrams"] = df_cleaned["Tokens"].apply(
    lambda tokens: list(ngrams(tokens, 3))
)

# 4. Text Preprocessing

## 4a. Keyword-Based Classification
These are the keywords and bigrams that will be used to flag if a church is affirming or non-affirming. These are broken out by keywords, bigrams, and trigrams. A separate variable is made to flag certain denominations that are affirming or nonaffirming. FOr example, the uniting church of australia is affirming while the Australian Christian Church is nonaffirming

In [16]:
# Affirming
affirming_keywords = ["lgbt", "lgbtqia", "lbgtq+", "lgbtqia+", "queer", "bisexual", "lbgti"]

affirming_bigrams = [("sexual", "orientation"), ("gender", "identity"), ("queer", "affirming")]

affirming_movements = [("uniting")]

affirming_movements_bigrams = ["uniting", "church"]

# Nonaffirming
non_affirming_keywords = ["homosexuality", "ssa"]

non_affirming_bigrams = [("traditional", "marriage"), ("biblical","values"),("god's","design"),
("husband","wife"), ("same", "sex"), ("sex", "attraction"), ("biological", "sex"), ("biblical", "view"),
("institution", "marriage"), ("husbands", "wives"), ("one", "woman")]

non_affirming_trigrams = [("biblical", "view", "marriage"), ("same", "sex", "attracition")]

non_affirming_movements = ["acc", "www.acc.org.au", "hillsong", "presbyterian.org.au", "presbyterian"]

non_affirming_movements_trigrams = [("australian", "christian", "churches"), ("australian", "christian", "church")]

## 4b. Flagging Movements/Networks

In [17]:
# Network flagging function to include evidence
def flag_affirming_or_non_affirming_network_with_evidence(row):
    affirming_movement_evidence = []
    non_affirming_movement_evidence = []

    # Check for affirming movements/networks
    for word in row["Tokens"]:
        if word in affirming_movements:
            affirming_movement_evidence.append(word)
    for bigram in row["Bigrams"]:
        if bigram in affirming_movements_bigrams:
            affirming_movement_evidence.append(" ".join(bigram))
    
    # Check for non-affirming movements/networks
    for word in row["Tokens"]:
        if word in non_affirming_movements:
            non_affirming_movement_evidence.append(word)
    for trigram in row["Trigrams"]:
        if trigram in non_affirming_movements_trigrams:
            non_affirming_movement_evidence

    # Determine the flag and evidence
    if affirming_movement_evidence:
        return "Affirming", ", ".join(affirming_evidence)  # Affirming with evidence
    elif non_affirming_movement_evidence:
        return "Non-Affirming", ", ".join(non_affirming_evidence)  # Non-Affirming with evidence
    else:
        return "Unknown", ""  # Neither found
    
# Apply the updated function
df_cleaned[["AffirmingFlag", "Evidence"]] = df_cleaned.apply(
    lambda row: pd.Series(flag_affirming_or_non_affirming_network_with_evidence(row)), axis=1
)

NameError: name 'affirming_evidence' is not defined

## 4c. Flagging Affirming and Non-Affirming Language

In [6]:
# Flagging function to include evidence
def flag_affirming_or_non_affirming_with_evidence(row):
    affirming_evidence = []
    non_affirming_evidence = []

    # Check for affirming keywords or bigrams
    for word in row["Tokens"]:
        if word in affirming_keywords:
            affirming_evidence.append(word)
    for bigram in row["Bigrams"]:
        if bigram in affirming_bigrams:
            affirming_evidence.append(" ".join(bigram))
    
    # Check for non-affirming keywords, bigrams, or trigrams
    for word in row["Tokens"]:
        if word in non_affirming_keywords:
            non_affirming_evidence.append(word)
    for bigram in row["Bigrams"]:
        if bigram in non_affirming_bigrams:
            non_affirming_evidence.append(" ".join(bigram))
    for trigram in row["Trigrams"]:
        if trigram in non_affirming_trigrams:
            non_affirming_evidence.append(" ".join(trigram))
    
    # Determine the flag and evidence
    if affirming_evidence:
        return "Affirming", ", ".join(affirming_evidence)  # Affirming with evidence
    elif non_affirming_evidence:
        return "Non-Affirming", ", ".join(non_affirming_evidence)  # Non-Affirming with evidence
    else:
        return "Unknown", ""  # Neither found

# Apply the updated function
df_cleaned[["AffirmingFlag", "Evidence"]] = df_cleaned.apply(
    lambda row: pd.Series(flag_affirming_or_non_affirming_with_evidence(row)), axis=1
)

# Save to CSV
df_cleaned.to_csv("scraped_cleaned_data_with_evidence.csv", index=False, encoding = "utf-8")

print("Updated dataset with evidence, saved to 'scraped_cleaned_data_with_evidence.csv'.")

# Filter out rows with "Unknown" in the AffirmingFlag column
filtered_df = df_cleaned[df_cleaned["AffirmingFlag"] != "Unknown"]

# Save to CSV
filtered_df.to_csv("scraped_cleaned_data_with_evidence_no_unknowns.csv", index=False, encoding="utf-8")

print("Updated dataset with evidence and removed unknowns, saved to 'scraped_cleaned_data_with_evidence.csv_no_unknowns'.")

#save to xlsx 
df_cleaned.to_excel("scraped_cleaned_data_with_evidence.xlsx", index=False, engine="openpyxl")

NameError: name 'df_cleaned' is not defined

## 4c. Exploratory Text Analysis

### 4c1. Top 20 Keywords

In [None]:
# Flatten the list of tokens into a single list
all_tokens = [token for tokens in df_cleaned["Tokens"] for token in tokens]

# Count word frequencies
word_counts = Counter(all_tokens)

# Get the top 20 most common words
top_words = word_counts.most_common(20)

# Display as a bar chart
words, counts = zip(*top_words)
plt.figure(figsize=(10, 6))
plt.bar(words, counts)
plt.xticks(rotation=45)
plt.title("Top 20 Most Common Words")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.show()

### 4c2. Top 20 Bigrams

In [None]:
from collections import Counter

# Flatten the list of bigrams into a single list
all_bigrams = [bigram for bigrams in df_cleaned["Bigrams"] for bigram in bigrams]

# Count bigram frequencies
bigram_counts = Counter(all_bigrams)

# Get the top 20 most common bigrams
top_bigrams = bigram_counts.most_common(20)

# Display as a bar chart
bigrams, counts = zip(*top_bigrams)
bigram_labels = [' '.join(bigram) for bigram in bigrams]

plt.figure(figsize=(12, 6))
plt.bar(bigram_labels, counts)
plt.xticks(rotation=45, ha="right")
plt.title("Top 20 Most Common Bigrams")
plt.xlabel("Bigrams")
plt.ylabel("Frequency")
plt.show()


## 5. Sentiment Analysis

## Unsupervised Learning

### Basic Flagging

In [None]:
# Sentiment analysis function
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Returns a score: >0 positive, <0 negative, 0 neutral

# Apply sentiment analysis to cleaned text
df_cleaned["SentimentScore"] = df_cleaned["Text"].apply(get_sentiment)

# Categorize rows based on sentiment score
def categorize_sentiment(score):
    if score > 0.2:  # Adjust thresholds as needed
        return "Likely Affirming"
    elif score < -0.2:
        return "Likely Non-Affirming"
    else:
        return "Neutral/Unknown"

df_cleaned["SentimentCategory"] = df_cleaned["SentimentScore"].apply(categorize_sentiment)

# Combine sentiment with affirming keyword flag for a final assessment
def final_flag(row):
    if row["AffirmingFlag"]:
        return "Affirming"
    elif row["SentimentCategory"] == "Likely Non-Affirming":
        return "Non-Affirming"
    else:
        return "Unknown"

df_cleaned["FinalFlag"] = df_cleaned.apply(final_flag, axis=1)

# Save updated DataFrame to CSV
output_file = "scraped_cleaned_with_sentiment.csv"
df_cleaned.to_csv(output_file, index=False, encoding="utf-8")

print(f"Results saved to {output_file}")


### TF-IDF and K-Means Clustering

In [None]:
# Load the data
df = pd.read_csv("scraped_cleaned_data.csv")

# Combine Tokens and Bigrams into a single column for TF-IDF
df["Combined"] = df["Tokens"].apply(lambda x: " ".join(eval(x))) + " " + \
                 df["Bigrams"].apply(lambda x: " ".join(["_".join(bigram) for bigram in eval(x)]))

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=5000,  # Limit to top 5000 features
    stop_words=None,    # No need for stop words, as Tokens are already cleaned
    ngram_range=(1, 1)  # Focus on unigrams (and bigrams if added earlier)
)

# Apply TF-IDF to the combined Tokens and Bigrams column
X_tfidf = tfidf.fit_transform(df["Combined"])

# View feature names for reference
print(f"Number of TF-IDF Features: {len(tfidf.get_feature_names_out())}")

from sklearn.cluster import KMeans

# Initialize and fit K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust `n_clusters` as needed
df["Cluster"] = kmeans.fit_predict(X_tfidf)

# Analyze cluster distribution
print(df["Cluster"].value_counts())

# Save clusters or predictions back to the dataset
df.to_csv("updated_church_data.csv", index=False)

# Save TF-IDF matrix for future use
from scipy.sparse import save_npz
save_npz("tfidf_matrix.npz", X_tfidf)



## Supervised Learning

# Results
This section collapses the results of the sentiment analysis for each website. The end result is a table where each row is an individual church with their name, website, address, coordinates, whether they are affirming or non-affirming, and the evidence (a url) where they state their affirmation or non-affirmation.

# Misc Code

In [None]:
"""
# Testing if there are church's without their type set as church
# API URLs
base_url = "https://maps.googleapis.com/maps/api/place/textsearch/json"
details_base_url = "https://maps.googleapis.com/maps/api/place/details/json"

# Function to get data from Google Places API with pagination
def get_places_data(api_key, query, location, radius, type):
    all_results = []  # To store all results across pages
    params = {
        "key": api_key,
        "query": query, 
        "location": location,
        "radius": radius,
    }
    
    while True:
        # Make the API request
        response = requests.get(base_url, params=params)
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break
        
        # Parse the response
        data = response.json()
        all_results.extend(data.get("results", []))  # Add results from the current page
        
        # Check if there is a next page
        next_page_token = data.get("next_page_token")
        if not next_page_token:
            break  # No more pages, exit loop
        
        # Wait a few seconds before using the next_page_token to avoid request denial
        import time
        time.sleep(3)  # Google requires a short delay before using the next page token
        
        print(f"Fetched {len(data.get('results', []))} results")
        print(f"Next Page Token: {next_page_token}")

        # Update params with the next_page_token
        params.update({"pagetoken": next_page_token})
    
    return all_results

# Function to get website URLs from Place Details API
def get_place_details(api_key, place_id):
    params = {
        "place_id": place_id,
        "fields": "website,types",
        "key": api_key
    }
    response = requests.get(details_base_url, params=params)
    
    if response.status_code == 200:
        details_data = response.json().get("result", {})
        return details_data.get("website", "N/A"), details_data.get("types", ["N/A"])
    else:
        print(f"Error fetching details for place_id {place_id}: {response.status_code}")
        return "N/A", ["N/A"]

# Main script
if __name__ == "__main__":
    results = []

    for location in locations:
        print(f"Fetching data for location: {location}")
        places_data = get_places_data(API_KEY, query, location, radius, type)

        for place in places_data:
            name = place.get("name", "Unknown")
            place_id = place.get("place_id")
            
            if place_id:
                website, place_types = get_place_details(API_KEY, place_id)
                place_types_str = ", ".join(place_types)  # Convert list to string for better readability
                
               # Store all results (NO filtering)
                results.append([name, website, place_types_str])

      # Convert results to DataFrame
    df = pd.DataFrame(results, columns=["Church Name", "Website", "Types"])
    
    # Print results as a table in console
    print(df.to_string(index=False))

    # Save to CSV file
    df.to_csv("churches_without_church_type.csv", index=False)
    
    print("Results saved to 'churches_without_church_type.csv'")
    """