<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database_From_Table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Cell 1: Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import os
from google.colab import userdata

In [11]:
# Cell 2: Clone GitHub repository and configure Git


# GitHub credentials
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')

# GitHub repository URL
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Check if the repository directory already exists
if not os.path.exists(GITHUB_REPO):
    # Clone the repository
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO)
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main

# Configure Git
!git config --global user.email "tomk@github.leemail.me"
!git config --global user.name "tomknightatl"

Cloning into 'USCCB'...
remote: Enumerating objects: 123, done.[K
remote: Counting objects:   0% (1/123)[Kremote: Counting objects:   1% (2/123)[Kremote: Counting objects:   2% (3/123)[Kremote: Counting objects:   3% (4/123)[Kremote: Counting objects:   4% (5/123)[Kremote: Counting objects:   5% (7/123)[Kremote: Counting objects:   6% (8/123)[Kremote: Counting objects:   7% (9/123)[Kremote: Counting objects:   8% (10/123)[Kremote: Counting objects:   9% (12/123)[Kremote: Counting objects:  10% (13/123)[Kremote: Counting objects:  11% (14/123)[Kremote: Counting objects:  12% (15/123)[Kremote: Counting objects:  13% (16/123)[Kremote: Counting objects:  14% (18/123)[Kremote: Counting objects:  15% (19/123)[Kremote: Counting objects:  16% (20/123)[Kremote: Counting objects:  17% (21/123)[Kremote: Counting objects:  18% (23/123)[Kremote: Counting objects:  19% (24/123)[Kremote: Counting objects:  20% (25/123)[Kremote: Counting objects:  21% (26/123

In [12]:
# Cell 3: Fetch the webpage content
url = "https://archatl.com/parishes/parish-directory/"

# A timeout keeps the cell from hanging indefinitely if the site is unresponsive.
response = requests.get(url, timeout=30)

# Fail loudly on 4xx/5xx responses instead of parsing an error page as if it
# were the parish directory.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

In [13]:
# Cell 4: Extract the table data
#
# Builds `data`, a list of rows. Each row holds the stripped text of every
# cell except the last, followed by the href of the link found in the last
# cell ('' when there is no link or the link has no href).
table = soup.find('table', {'id': 'table_1'})
if table is None:
    # Without this check the failure would surface as an opaque AttributeError.
    raise ValueError(
        "Could not find <table id='table_1'> in the fetched page; "
        "the page layout may have changed."
    )
rows = table.find_all('tr')

data = []
for row in rows[1:]:  # Skip the header row
    cols = row.find_all('td')
    if not cols:
        # Rows without <td> cells (e.g. extra header or empty rows) carry no
        # parish data, and indexing cols[-1] on them would raise IndexError.
        continue

    # Text of all columns except the last one
    row_data = [col.text.strip() for col in cols[:-1]]

    # The last column contributes its hyperlink target rather than its text
    link = cols[-1].find('a')
    row_data.append(link.get('href', '') if link is not None else '')

    data.append(row_data)

In [14]:
# Cell 5: Create a DataFrame
# Column labels applied, in order, to each scraped row in `data`.
columns = [
    'Name',
    'Status',
    'Deanery',
    'EST',
    'Street Address',
    'City',
    'State',
    'Zipcode',
    'Phone Number',
    'Web',
]
df = pd.DataFrame(data=data, columns=columns)

In [15]:
# Cell 6: Create SQLite database and store the data
# Connect to 'data.db' (a path relative to the current working directory).
# The connection is kept open on purpose: the verification cell reuses `conn`.
conn = sqlite3.connect('data.db')

# Write the DataFrame to the 'parishes' table, replacing any existing table and
# leaving out the DataFrame index. As the cell's last expression, the call's
# return value is displayed as the cell output.
df.to_sql(name='parishes', con=conn, index=False, if_exists='replace')

116

In [16]:
# Cell 7: Verify the data in the database
# Read back the first five rows of the 'parishes' table as a sanity check.
query = "SELECT * FROM parishes LIMIT 5"
result = pd.read_sql_query(sql=query, con=conn)
print(result)

# Done with the database: release the connection.
# Re-running this cell requires re-creating `conn` first.
conn.close()

print("Data extraction and storage complete.")

                           Name           Status          Deanery   EST  \
0                    All Saints           Parish      North Metro  1977   
1         Capilla Santo Domingo  Pastoral Center        Northeast  2014   
2        Capilla San Juan Diego  Pastoral Center        Northwest         
3  Cathedral of Christ the King        Cathedral          Central  1936   
4               Christ Our Hope           Parish  Northeast Metro  1984   

            Street Address      City State Zipcode  Phone Number  \
0   2443 Mount Vernon Road  Dunwoody    GA   30338  770-393-3255   
1          427 Cash Street  Cornelia    GA   30531  706-754-4518   
2        1609 E Morris St.    Dalton    GA   30720  706-278-3107   
3  2699 Peachtree Road, NE   Atlanta    GA   30305  404-233-2145   
4       1786 Wellborn Road  Lithonia    GA   30058  770-482-5017   

                                 Web  
0  http://www.allsaintsdunwoody.org/  
1                                     
2                      

In [17]:
# Cell 8: Commit changes and push to GitHub
# The shell commands below run in the kernel's current working directory.
# NOTE(review): presumably that is the repository checkout entered by the
# clone cell — confirm that cell has run before executing this one.
# Stage the SQLite database file
!git add data.db

# Commit the staged file with a fixed message
!git commit -m "Added data to  data.db using Build_Parishes_Database_From_Table.ipynb"

# Push the 'main' branch to the 'origin' remote
!git push origin main

[main fe5a3b6] Added data to  data.db using Build_Parishes_Database_From_Table.ipynb
 1 file changed, 0 insertions(+), 0 deletions(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 381 bytes | 381.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/tomknightatl/USCCB.git
   0b2aca4..fe5a3b6  main -> main
