In [13]:
import pandas as pd
from pathlib import Path

current_dir = Path.cwd()

df = pd.read_csv(current_dir.parent/'house_data'/'extracted_houses_housesigma.csv')


In [14]:
import pandas as pd
from tqdm import tqdm
import requests

tqdm.pandas()  # Enable progress bar for Pandas apply



In [None]:
def get_coordinates(address, api_key):
    """
    Fetch latitude and longitude for a given address using Google Geocoding API.
    
    Parameters:
        address (str): The address to geocode.
        api_key (str): API key for Google Geocoding API.
    
    Returns:
        tuple: Latitude and Longitude as (lat, long). Returns (None, None) if an error occurs.
    """
    try:
        # Construct the Geocoding API URL
        base_url = "https://maps.googleapis.com/maps/api/geocode/json"
        params = {
            "address": address,
            "key": api_key
        }

        # Send the request to the API
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        # Parse the JSON response
        data = response.json()

        if data["status"] == "OK":
            # Extract latitude and longitude from the response
            location = data["results"][0]["geometry"]["location"]
            return location["lat"], location["lng"]
        else:
            print(f"Geocoding API error for address '{address}': {data['status']}")
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"Request error for address '{address}': {e}")
        return None, None

def process_dataframe_in_chunks(df, chunk_size, api_key, output_csv):
    """
    Processes a DataFrame in chunks to find latitude and longitude for each row.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame containing the 'address' column.
        chunk_size (int): Number of rows to process in each chunk.
        api_key (str): API key for the geocoding service.
        output_csv (str): Filepath to append the results.

    Returns:
        None
    """
    # Check if the output CSV already exists
    try:
        processed_rows = pd.read_csv(output_csv).shape[0]
        print(f"Resuming from row {processed_rows}.")
    except FileNotFoundError:
        # Initialize the output file if it doesn't exist
        processed_rows = 0
        pd.DataFrame().to_csv(output_csv, index=False)

    # Process the DataFrame in chunks
    for start_idx in range(processed_rows, len(df), chunk_size):
        chunk = df.iloc[start_idx:start_idx + chunk_size].copy()  # Copy the chunk

        # Process each row in the chunk to get lat and long
        chunk[['lat', 'long']] = chunk['address'].progress_apply(
            lambda addr: pd.Series(get_coordinates(addr, api_key))
        )

        # Append the results to the CSV
        chunk.to_csv(output_csv, mode='a', header=False, index=False)
        print(f"Processed rows {start_idx} to {start_idx + len(chunk)} and saved to {output_csv}.")




In [15]:
from pathlib import Path
current_dir = Path.cwd().parent
realtor_data = pd.read_csv(current_dir/"data_cleaning"/"realtor_dropped_duplicates.csv")
api_key = "AIzaSyC3v8fY2GaJdCXQxIapLHn5ocflMuBIiVY"
output_csv = current_dir/"house_data"/"realtor_with_coords.csv"
process_dataframe_in_chunks(realtor_data, chunk_size=20, api_key=api_key, output_csv=output_csv)


Resuming from row 0.


100%|██████████| 20/20 [00:04<00:00,  4.03it/s]


Processed rows 0 to 20 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:04<00:00,  4.16it/s]


Processed rows 20 to 40 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.64it/s]


Processed rows 40 to 60 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.67it/s]


Processed rows 60 to 80 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


 40%|████      | 8/20 [00:10<00:38,  3.17s/it]

Request error for address '115 - 360 RIDELLE AVENUEToronto (Briar Hill-Belgravia), Ontario M6B1K1': ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


Processed rows 80 to 100 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.80it/s]


Processed rows 100 to 120 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:04<00:00,  4.12it/s]


Processed rows 120 to 140 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.49it/s]


Processed rows 140 to 160 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.60it/s]


Processed rows 160 to 180 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.64it/s]


Processed rows 180 to 200 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.82it/s]


Processed rows 200 to 220 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.44it/s]


Processed rows 220 to 240 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:08<00:00,  2.35it/s]


Processed rows 240 to 260 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.69it/s]


Processed rows 260 to 280 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:06<00:00,  3.31it/s]


Processed rows 280 to 300 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.56it/s]


Processed rows 300 to 320 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.74it/s]


Processed rows 320 to 340 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:04<00:00,  4.02it/s]


Processed rows 340 to 360 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.67it/s]


Processed rows 360 to 380 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.51it/s]


Processed rows 380 to 400 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.98it/s]


Processed rows 400 to 420 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.84it/s]


Processed rows 420 to 440 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.97it/s]


Processed rows 440 to 460 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.64it/s]


Processed rows 460 to 480 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.59it/s]


Processed rows 480 to 500 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:06<00:00,  3.28it/s]


Processed rows 500 to 520 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:06<00:00,  3.28it/s]


Processed rows 520 to 540 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:12<00:00,  1.65it/s]


Processed rows 540 to 560 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


 50%|█████     | 10/20 [00:03<00:02,  3.48it/s]

Geocoding API error for address 'Address not available': ZERO_RESULTS


100%|██████████| 20/20 [00:06<00:00,  3.00it/s]


Processed rows 560 to 580 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:06<00:00,  3.19it/s]


Processed rows 580 to 600 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.89it/s]


Processed rows 600 to 620 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.96it/s]


Processed rows 620 to 640 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.85it/s]


Processed rows 640 to 660 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.64it/s]


Processed rows 660 to 680 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.95it/s]


Processed rows 680 to 700 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:06<00:00,  3.32it/s]


Processed rows 700 to 720 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:09<00:00,  2.13it/s]


Processed rows 720 to 740 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.38it/s]


Processed rows 740 to 760 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:06<00:00,  3.31it/s]


Processed rows 760 to 780 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.76it/s]


Processed rows 780 to 800 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.75it/s]


Processed rows 800 to 820 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.69it/s]


Processed rows 820 to 840 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:06<00:00,  3.15it/s]


Processed rows 840 to 860 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.56it/s]


Processed rows 860 to 880 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.76it/s]


Processed rows 880 to 900 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.69it/s]


Processed rows 900 to 920 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.62it/s]


Processed rows 920 to 940 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.74it/s]


Processed rows 940 to 960 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.72it/s]


Processed rows 960 to 980 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.46it/s]


Processed rows 980 to 1000 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.73it/s]


Processed rows 1000 to 1020 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.67it/s]


Processed rows 1020 to 1040 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 20/20 [00:05<00:00,  3.88it/s]


Processed rows 1040 to 1060 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.


100%|██████████| 10/10 [00:02<00:00,  4.38it/s]

Processed rows 1060 to 1070 and saved to c:\Users\Jensu\OneDrive\Documents\GitHub\CME358-Final_project\house_data\realtor_with_coords.csv.





In [None]:
realtor_with_coords = pd.read_csv(output_csv)
realtor_with_coords.reset_index(inplace=True)
realtor_with_coords_new_cols = list(realtor_data.columns)
realtor_with_coords_new_cols.append("lat")
realtor_with_coords_new_cols.append("long")

realtor_with_coords.columns = realtor_with_coords_new_cols
missing_coords_df = realtor_with_coords[
    realtor_with_coords['lat']== None
]
realtor_with_coords.isna().sum()
realtor_with_coords.dropna(subset=['lat','long'], inplace=True)
realtor_with_coords.isna().sum()

# realtor_with_coords.to_csv(output_csv)

price                 0
address               0
mls                   0
office_name           0
office_type          27
                   ... 
Age Of Building    1069
Business Type      1069
street address        0
lat                   2
long                  2
Length: 75, dtype: int64