In [64]:
import requests
from bs4 import BeautifulSoup
import json
import re
from typing import Dict, List
import pandas as pd
from pathlib import Path
import os

In [None]:
# Base configuration
BASE_URL = "https://wisconsinsupperclubs.com"
REGIONS = {
    "southeast": "/southeast-region/",
    "southwest": "/southwest-region/", 
    "northeast": "/northeast-region/",
    "north-central": "/north-central-region/",
    "northwest": "/northwest-region/"
}

In [68]:
def clean_address(text: str) -> str:
    """Clean address text by removing extra whitespace and HTML tags"""
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'<.*?>', '', text)
    return text

def parse_club(div, region: str) -> Dict:
    """Parse information for a single supper club"""
    club_data = {
        'region': region  # Add region information
    }
    
    # Get club name
    title = div.find('h2', class_='fl-post-grid-title')
    if title:
        club_data['name'] = title.text.strip()
        
        # Get club detail page URL
        link = title.find('a')
        if link:
            club_data['url'] = link.get('href', '')
    
    # Get address and contact information
    content = div.find('div', class_='fl-post-grid-content')
    if content and content.find('p'):
        address_text = content.find('p').get_text().split('\n')
        address_text = [line.strip() for line in address_text if line.strip()]
        
        # Remove website link line but save website URL
        if len(address_text) > 1:
            website = content.find('a')
            if website:
                club_data['website'] = website.get('href', '')
            address_text = address_text[1:]
        
        # Parse address components
        if len(address_text) >= 1:
            club_data['address'] = address_text[0]
        
        if len(address_text) >= 2:
            city_state_zip = address_text[1].split()
            if len(city_state_zip) >= 2:
                club_data['city'] = ' '.join(city_state_zip[:-2]) 
                club_data['state'] = city_state_zip[-2]
                club_data['zip_code'] = city_state_zip[-1]
        
        if len(address_text) >= 3:
            club_data['phone'] = address_text[2]
    
    return club_data

def scrape_region(region: str, url: str) -> List[Dict]:
    """Scrape all supper clubs from a specific region"""
    print(f"Scraping {region} region...")
    clubs = []
    
    try:
        # Send HTTP request
        response = requests.get(url)
        response.raise_for_status()
        
        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all supper club blocks
        club_divs = soup.find_all('div', class_='fl-post-grid-post')
        
        # Parse each club's information
        for div in club_divs:
            club_data = parse_club(div, region)
            if club_data:
                clubs.append(club_data)
                print(f"Parsed: {club_data['name']}")
        
        print(f"Found {len(clubs)} supper clubs in {region} region")
        return clubs
        
    except Exception as e:
        print(f"Error scraping {region}: {str(e)}")
        return []

def scrape_all_regions_to_json() -> List[Dict]:
    """Scrape supper clubs from all regions and return as a single list"""
    all_clubs = []
    
    for region, path in REGIONS.items():
        url = BASE_URL + path
        clubs = scrape_region(region, url)
        all_clubs.extend(clubs)
        
        # Add delay between requests
        time.sleep(2)
    
    # Save all clubs as a single list
    with open('data/wi_supper_clubs_2023.json', 'w', encoding='utf-8') as f:
        json.dump(all_clubs, f, ensure_ascii=False, indent=2)
    
    return all_clubs

In [25]:
print("Starting to scrape Wisconsin supper clubs information...")
clubs = scrape_all_regions_to_json()

# Print statistics
print(f"\nTotal clubs found: {len(clubs)}")

# Print count by region
region_counts = {}
for club in clubs:
    region = club['region']
    region_counts[region] = region_counts.get(region, 0) + 1

print("\n=== Clubs by Region ===")
for region, count in region_counts.items():
    print(f"{region}: {count} clubs")

print("\nData saved to supper_clubs.json")

Starting to scrape Wisconsin supper clubs information...
Scraping southeast region...
Parsed: 5 O’Clock Club
Parsed: Alpine Retreat
Parsed: Bass Bay Brewhouse
Parsed: Bayside Supper Club
Parsed: Blue Heron Supper Club
Parsed: Brittain House Supper Club
Parsed: Butler Inn of Pewaukee
Parsed: Cantafio’s Buckhorn Steakhouse
Parsed: Clifford’s Supper Club
Parsed: Colony House **
Parsed: Copper Dock, The *†
Parsed: Corner House Supper Club
Parsed: Diamond Jim’s Steakhouse *†
Parsed: Duck Inn, The *†
Parsed: Edgewater Supper Club **
Parsed: Feil’s Supper Club
Parsed: Fitzgerald’s Genoa Junction **
Parsed: Five O’Clock Steakhouse **
Parsed: Great Outdoors Supper Club †
Parsed: HobNob *†
Parsed: House of Gerhard
Parsed: Iron Ridge Inn
Parsed: Jack Pandl’s Whitefish Bay Inn
Parsed: Jackson Grill, The
Parsed: Jail House, The
Parsed: Joey Gerard’s Supper Club
Parsed: Johnny Manhattan’s
Parsed: Juice’s Ghost Town
Parsed: Kurt’s Steakhouse
Parsed: Little Red Inn
Parsed: Mars Resort
Parsed: Michael’

In [49]:
type(clubs)

list

In [48]:
len(clubs)

304

In [47]:
print(json.dumps(clubs, ensure_ascii=False, indent=2))

[
  {
    "region": "southeast",
    "name": "5 O’Clock Club",
    "url": "https://wisconsinsupperclubs.com/supper-club/5-oclock-club-2/",
    "website": "https://the5oclockclubpewaukee.com",
    "address": "N28 W26658 Peterson Drive",
    "city": "Pewaukee",
    "state": "WI",
    "zip_code": "53072",
    "phone": "262-691-9960"
  },
  {
    "region": "southeast",
    "name": "Alpine Retreat",
    "url": "https://wisconsinsupperclubs.com/supper-club/alpine-retreat/",
    "website": "https://www.alpineretreat.net/",
    "address": "1380 Friess Lake Road",
    "city": "Hubertus,",
    "state": "WI",
    "zip_code": "53033",
    "phone": "262-628-3995"
  },
  {
    "region": "southeast",
    "name": "Bass Bay Brewhouse",
    "url": "https://wisconsinsupperclubs.com/supper-club/bass-bay-brewhouse/",
    "website": "https://www.bassbaybrewhouse.com",
    "address": "S79 W15851 Aud Mar Drive",
    "city": "Muskego,",
    "state": "WI",
    "zip_code": "53150",
    "phone": "414-377-9449"
  

In [46]:
# add coordinates lat/lon properties using geocoding api
from googlemaps import Client
from typing import List, Dict
import time

def geocode_locations(locations: List[Dict], api_key: str) -> List[Dict]:
    """
    Geocode a list of location dictionaries using Google Maps API.
    
    Args:
        locations (List[Dict]): List of dictionaries containing location information
        api_key (str): Google Maps API key
    
    Returns:
        List[Dict]: Original location data with added latitude and longitude
    """
    # Initialize Google Maps client
    gmaps = Client(key=api_key)
    
    # Process each location
    for location in locations:
        try:
            # Create full address string
            full_address = f"{location['address']}, {location['city']}, {location['state']} {location['zip_code']}"
            
            # Get geocoding result
            result = gmaps.geocode(full_address)
            
            if result and len(result) > 0:
                # Extract latitude and longitude
                lat = result[0]['geometry']['location']['lat']
                lng = result[0]['geometry']['location']['lng']
                
                # Add coordinates to the location dictionary
                location['latitude'] = lat
                location['longitude'] = lng
                location['geocoding_status'] = 'success'
            else:
                # Handle case where no results were found
                location['latitude'] = None
                location['longitude'] = None
                location['geocoding_status'] = 'no_results'
                
            # Add small delay to respect API rate limits
            time.sleep(0.1)
            
        except Exception as e:
            # Handle any errors
            location['latitude'] = None
            location['longitude'] = None
            location['geocoding_status'] = f'error: {str(e)}'
            
    return locations

In [50]:
# Your API key from Google Maps
API_KEY = os.getenv('GOOGLE_MAPS_API_KEY')

if not API_KEY:
    raise ValueError("Google Maps API key not found in environment variables")

# 初始化 Google Maps client
gmaps = Client(key=API_KEY)

# Geocode the locations
geocoded_locations = geocode_locations(clubs, API_KEY)  # clubs have 304 objects

# Example of accessing the results
for location in geocoded_locations:
    if location['geocoding_status'] == 'success':
        print(f"Location: {location['name']}")
        print(f"Coordinates: {location['latitude']}, {location['longitude']}\n")
    else:
        print(f"Failed to geocode {location['name']}: {location['geocoding_status']}\n")

2024-11-02 16:30:53,486 - INFO - API queries_quota: 60


Location: 5 O’Clock Club
Coordinates: 43.0726325, -88.2778795

Location: Alpine Retreat
Coordinates: 43.2389669, -88.2877987

Location: Bass Bay Brewhouse
Coordinates: 42.8998736, -88.11251

Location: Bayside Supper Club
Coordinates: 43.4432893, -88.8609065

Location: Blue Heron Supper Club
Coordinates: 42.8839956, -88.2088031

Location: Brittain House Supper Club
Coordinates: 43.6224959, -88.7243357

Location: Butler Inn of Pewaukee
Coordinates: 43.087706, -88.2774033

Location: Cantafio’s Buckhorn Steakhouse
Coordinates: 43.5139937, -88.8103179

Location: Clifford’s Supper Club
Coordinates: 42.9395025, -88.0440843

Location: Colony House **
Coordinates: 42.5062829, -88.1208666

Location: Copper Dock, The *†
Coordinates: 43.2407209, -88.2709137

Location: Corner House Supper Club
Coordinates: 42.731913, -87.7800798

Location: Diamond Jim’s Steakhouse *†
Coordinates: 42.9364464, -88.06108150000001

Location: Duck Inn, The *†
Coordinates: 42.7161862, -88.7403021

Location: Edgewater Sup

In [56]:
for i, club in enumerate(clubs):
    if club['latitude'] is None or club['longitude'] is None:
        print(i,club)

In [61]:
# Convert list of dictionaries to DataFrame
clubs_geocoded_df = pd.DataFrame(clubs)
clubs_geocoded_df.columns

Index(['region', 'name', 'url', 'website', 'address', 'city', 'state',
       'zip_code', 'phone', 'latitude', 'longitude', 'geocoding_status'],
      dtype='object')

In [63]:
# add id to each row
clubs_geocoded_df['id'] = clubs_geocoded_df.index
clubs_geocoded_df

Unnamed: 0,region,name,url,website,address,city,state,zip_code,phone,latitude,longitude,geocoding_status,id
0,southeast,5 O’Clock Club,https://wisconsinsupperclubs.com/supper-club/5...,https://the5oclockclubpewaukee.com,N28 W26658 Peterson Drive,Pewaukee,WI,53072,262-691-9960,43.072632,-88.277879,success,0
1,southeast,Alpine Retreat,https://wisconsinsupperclubs.com/supper-club/a...,https://www.alpineretreat.net/,1380 Friess Lake Road,"Hubertus,",WI,53033,262-628-3995,43.238967,-88.287799,success,1
2,southeast,Bass Bay Brewhouse,https://wisconsinsupperclubs.com/supper-club/b...,https://www.bassbaybrewhouse.com,S79 W15851 Aud Mar Drive,"Muskego,",WI,53150,414-377-9449,42.899874,-88.112510,success,2
3,southeast,Bayside Supper Club,https://wisconsinsupperclubs.com/supper-club/b...,https://www.restaurantji.com/wi/beaver-dam/bay...,W9231 County Road G,"Beaver Dam,",WI,53916,920-887-0505,43.443289,-88.860906,success,3
4,southeast,Blue Heron Supper Club,https://wisconsinsupperclubs.com/supper-club/b...,https://www.facebook.com/theblueheronsupperclub/,W229 S8300 Hwy 164,"Big Bend,",WI,53103,262-662-9985,42.883996,-88.208803,success,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,northwest,Turtle Club,https://wisconsinsupperclubs.com/supper-club/t...,https://www.restaurantji.com/wi/medford/turtle...,W7944 Perkinstown Avenue,"Medford,",WI,54451,715-748-2975,45.191535,-90.450728,success,299
300,northwest,Wason’s Supper Club,https://wisconsinsupperclubs.com/supper-club/w...,http://www.wasons.com,W2182 Hwy 54,"Galesville,",WI,54630,608-582-2763,44.076911,-91.356614,success,300
301,northwest,West Wind Supper Club,https://wisconsinsupperclubs.com/supper-club/w...,https://thewestwind.com,709 North Main Street,"River Falls,",WI,54022,715-425-8100,44.868471,-92.622308,success,301
302,northwest,Wildcat Supper Club,https://wisconsinsupperclubs.com/supper-club/w...,https://www.facebook.com/Wildcatsupperclub/,W8270 County Road B,"Neillsville,",WI,54456,715-743-2762,44.573328,-90.729321,success,302


In [66]:
def df_to_csv(
    df: pd.DataFrame,
    filename: str = None,
    output_dir: str = "output"
) -> str:
    """
    Export DataFrame to CSV file.
    
    Args:
        df: DataFrame to export
        filename: Name of output file (without .csv extension)
        output_dir: Directory to save the file
    
    Returns:
        str: Path to saved CSV file
    """
    try:
        # Create output directory if it doesn't exist
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        
        # Generate filename if not provided
        if filename is None:
            filename = "data_export"
        
        full_filename = f"{filename}.csv"
        file_path = os.path.join(output_dir, full_filename)
        
        # Export to CSV
        df.to_csv(
            file_path,
            encoding='utf-8',
            index=False
        )
        
        print(f"Successfully exported to: {file_path}")
        print(f"Rows: {len(df)}, Columns: {len(df.columns)}")
        
        return file_path
        
    except FileNotFoundError as e:
        print(f"Directory error: {str(e)}")
    except PermissionError as e:
        print(f"Permission error: {str(e)}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
    return None

In [69]:
df_to_csv(clubs_geocoded_df, filename="wi_supper_clubs_2023", output_dir="data/csv")

Successfully exported to: data/csv/wi_supper_clubs_2023.csv
Rows: 304, Columns: 13


'data/csv/wi_supper_clubs_2023.csv'