In [12]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Function to fetch and parse a URL
def fetch_and_parse(url, headers, timeout=30):
    """Download ``url`` and return the response body parsed as HTML.

    Args:
        url: Address to request with HTTP GET.
        headers: Mapping of HTTP request headers passed through to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response bytes with the
        built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``response.raise_for_status()``).
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    print(f"Fetching URL: {url}")  # Progress/debug trace of the URL being fetched
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    # The raw bytes are handed to BeautifulSoup, which detects the encoding itself.
    # NOTE(review): the recorded run shows "Some characters could not be decoded"
    # warnings for some pages; the page encoding may need to be pinned - confirm.
    return BeautifulSoup(response.content, 'html.parser')

# Function to extract tables from a BeautifulSoup object
def extract_tables_from_soup(soup):
    """Collect the text content of every usable ``<table>`` in ``soup``.

    For each table, the first ``<tr>`` supplies the header names and every
    following ``<tr>`` supplies one data row; cell text is whitespace-stripped.
    Tables whose first row has no cells, or that have no rows after the
    header, are left out.

    Args:
        soup: Parsed document exposing ``find_all`` (e.g. a BeautifulSoup tree).

    Returns:
        A list of ``(headers, rows)`` tuples, where ``headers`` is a list of
        strings and ``rows`` is a list of lists of strings.
    """
    def cell_texts(tr):
        # Header (<th>) and data (<td>) cells are treated alike.
        return [cell.text.strip() for cell in tr.find_all(['td', 'th'])]

    extracted = []
    for table in soup.find_all('table'):
        grid = [cell_texts(tr) for tr in table.find_all('tr')]
        header, body = (grid[0], grid[1:]) if grid else ([], [])
        if not header or not body:
            continue  # nothing usable: no header cells or no data rows
        extracted.append((header, body))
    return extracted

# Function to extract the table with the most rows
def get_table_with_most_rows(tables):
    """Pick the ``(headers, rows)`` pair that has the most data rows.

    Args:
        tables: Iterable of two-item ``(headers, rows)`` pairs.

    Returns:
        A ``(headers, rows)`` tuple for the largest table; when several tables
        tie, the earliest one wins. ``None`` if ``tables`` is empty or no
        table has at least one row.
    """
    candidates = [(headers, body) for headers, body in tables]
    if not candidates:
        return None
    # max() returns the first maximal element, matching a strict ">" scan.
    largest = max(candidates, key=lambda pair: len(pair[1]))
    return largest if len(largest[1]) > 0 else None

# Main URL and headers
main_url = 'https://www.checkee.info/'
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Column to exclude (user input). Scraped header names are whitespace-stripped,
# so a typed name with stray surrounding whitespace could never match; strip it.
column_to_exclude = input("Enter the column name to exclude from dataset: (Preferably 'Details') ").strip()

# Step 1: Fetch and parse the main page
main_soup = fetch_and_parse(main_url, request_headers)

# Step 2: Extract href links within the main page tables that match the specific pattern.
# find_all('table') also yields nested tables, so an anchor inside a nested table
# is visited once per enclosing table; dict.fromkeys drops the repeats while
# keeping first-seen order so no page is downloaded twice.
hrefs = list(dict.fromkeys(
    a['href']
    for table in main_soup.find_all('table')
    for a in table.find_all('a', href=True)
    if 'main.php?dispdate=' in a['href']
))

# Debug statement to print the extracted hrefs
print(f"Extracted hrefs: {hrefs}")

# Step 3: Fetch and parse each href link found in the main page tables
base_url = 'https://www.checkee.info/'
all_dfs = []

for href in hrefs:
    # urljoin resolves relative links such as './main.php?...' against base_url.
    # (str.lstrip('./') removes any leading run of '.' and '/' characters, not
    # the './' prefix, so it would corrupt links like '../x' or '.foo'.)
    url = urljoin(base_url, href)
    print(f"Fetching table from URL: {url}")  # Debug statement to print the URL being fetched
    soup = fetch_and_parse(url, request_headers)
    tables = extract_tables_from_soup(soup)
    best_table = get_table_with_most_rows(tables)
    if best_table:
        table_headers, rows = best_table
        # Keep only rows that line up with the header *before* removing the
        # column; popping first raised IndexError on short rows (e.g. rows
        # with merged cells) that the length check was meant to discard.
        filtered_rows = [row for row in rows if len(row) == len(table_headers)]
        if column_to_exclude in table_headers:
            col_index = table_headers.index(column_to_exclude)
            # Build new lists instead of mutating the parsed table in place.
            table_headers = table_headers[:col_index] + table_headers[col_index + 1:]
            filtered_rows = [row[:col_index] + row[col_index + 1:] for row in filtered_rows]
        df = pd.DataFrame(filtered_rows, columns=table_headers)
        all_dfs.append(df)

# Print the extracted tables with the specified column excluded
for i, df in enumerate(all_dfs):
    print(f"Table {i+1}")
    print(df)
    print("\n")



Enter the column name to exclude:  Details


Fetching URL: https://www.checkee.info/
Extracted hrefs: ['./main.php?dispdate=2024-05', './main.php?dispdate=2024-04', './main.php?dispdate=2024-03', './main.php?dispdate=2024-02', './main.php?dispdate=2024-01', './main.php?dispdate=2023-12', './main.php?dispdate=2023-11', './main.php?dispdate=2023-10', './main.php?dispdate=2023-09', './main.php?dispdate=2023-08', './main.php?dispdate=2023-07', './main.php?dispdate=2023-06', './main.php?dispdate=2023-05', './main.php?dispdate=2023-04', './main.php?dispdate=2023-03', './main.php?dispdate=2023-02', './main.php?dispdate=2023-01', './main.php?dispdate=2022-12', './main.php?dispdate=2022-11', './main.php?dispdate=2022-10', './main.php?dispdate=2022-09', './main.php?dispdate=2022-08', './main.php?dispdate=2022-07', './main.php?dispdate=2022-06', './main.php?dispdate=2022-05', './main.php?dispdate=2022-04', './main.php?dispdate=2022-03', './main.php?dispdate=2022-02', './main.php?dispdate=2022-01', './main.php?dispdate=2021-12', './main.php?

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2024-01
Fetching URL: https://www.checkee.info/main.php?dispdate=2024-01
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-12
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-12
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-11
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-11


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-10
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-10
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-09
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-09
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-08
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-08


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-07
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-07
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-06
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-06


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-05
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-05
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-04
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-04


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-03
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-03
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-02
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-02
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2023-01
Fetching URL: https://www.checkee.info/main.php?dispdate=2023-01
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-12
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-12


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-11
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-11


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-10
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-10
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-09
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-09
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-08
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-08


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-07
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-07


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-06
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-06


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-05
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-05
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-04
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-04
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-03
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-03
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-02
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-02
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2022-01
Fetching URL: https://www.checkee.info/main.php?dispdate=2022-01
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-12
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-12
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-11
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-11


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-10
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-10


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-09
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-09
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-08
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-08


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-07
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-07
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-06
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-06
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-05
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-05
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-04
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-04
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-03
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-03
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-02
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-02
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2021-01
Fetching URL: https://www.checkee.info/main.php?dispdate=2021-01
Fetching tabl

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-09
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-09


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-08
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-08
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-07
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-07
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-06
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-06


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-05
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-05
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-04
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-04
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-03
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-03
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-02
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-02
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2019-01
Fetching URL: https://www.checkee.info/main.php?dispdate=2019-01


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-12
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-12


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-11
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-11
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-10
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-10


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-09
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-09
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-08
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-08
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-07
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-07
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-06
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-06


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-05
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-05
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-04
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-04
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-03
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-03
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-02
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-02
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2018-01
Fetching URL: https://www.checkee.info/main.php?dispdate=2018-01
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2017-12
Fetching URL: https://www.checkee.info/main.php?dispdate=2017-12
Fetching table from URL: https://www.checkee.info/main.php?dispdate=2017-11
Fetching URL: https://www.checkee.info/main.php?dispdate=2017-11
Fetching tabl

In [13]:
# Build filtered_dfs: one frame per entry of all_dfs, with rows whose
# 'Complete Date' equals the placeholder '0000-00-00' removed. Frames that
# have no 'Complete Date' column are passed through unchanged.
filtered_dfs = []

for df in all_dfs:
    if 'Complete Date' not in df.columns:
        filtered_dfs.append(df)
        continue
    placeholder_date = df['Complete Date'].eq('0000-00-00')
    df = df.loc[~placeholder_date]
    filtered_dfs.append(df)