In [19]:
import pandas as pd
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import requests
tqdm.pandas()  # Enable progress bar for Pandas apply
current_dir = Path.cwd()

In [35]:
# Main Files used
current_dir = Path.cwd()
parent_dir = current_dir.parent
realtor_csv = parent_dir / "2_data_cleaning" / "cleaned_csv"/ "realtor_dropped_duplicates.csv"
realtor_coordinate_csv = parent_dir / "2_data_cleaning" / "cleaned_csv"/ "realtor_with_coords1.csv"
api_key = "AIzaSyDXMBJRnzDdlWcJIqpCzAioAvEvzuGgOR0"


In [36]:
def get_coordinates(address, api_key):
    """
    Fetch latitude and longitude for a given address using Google Geocoding API.
    
    Parameters:
        address (str): The address to geocode.
        api_key (str): API key for Google Geocoding API.
    
    Returns:
        tuple: Latitude and Longitude as (lat, long). Returns (None, None) if an error occurs.
    """
    try:
        # Construct the Geocoding API URL
        base_url = "https://maps.googleapis.com/maps/api/geocode/json"
        params = {
            "address": address,
            "key": api_key
        }

        # Send the request to the API
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        # Parse the JSON response
        data = response.json()

        if data["status"] == "OK":
            # Extract latitude and longitude from the response
            location = data["results"][0]["geometry"]["location"]
            return location["lat"], location["lng"]
        else:
            print(f"Geocoding API error for address '{address}': {data['status']}")
            return None, None
    except requests.exceptions.RequestException as e:
        print(f"Request error for address '{address}': {e}")
        return None, None

def process_dataframe_in_chunks(df, chunk_size, api_key, output_csv):
    """
    Processes a DataFrame in chunks to find latitude and longitude for each row.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame containing the 'address' column.
        chunk_size (int): Number of rows to process in each chunk.
        api_key (str): API key for the geocoding service.
        output_csv (str): Filepath to append the results.

    Returns:
        None
    """
    # Check if the output CSV already exists
    try:
        processed_rows = pd.read_csv(output_csv).shape[0]
        print(f"Resuming from row {processed_rows}.")
    except FileNotFoundError:
        # Initialize the output file if it doesn't exist
        processed_rows = 0
        pd.DataFrame().to_csv(output_csv, index=False)

    # Process the DataFrame in chunks
    for start_idx in range(processed_rows, len(df), chunk_size):
        chunk = df.iloc[start_idx:start_idx + chunk_size].copy()  # Copy the chunk

        # Process each row in the chunk to get lat and long
        chunk[['lat', 'long']] = chunk['address'].progress_apply(
            lambda addr: pd.Series(get_coordinates(addr, api_key))
        )

        # Append the results to the CSV
        chunk.to_csv(output_csv, mode='a', header=False, index=False)
        print(f"Processed rows {start_idx} to {start_idx + len(chunk)} and saved to {output_csv}.")




In [None]:
realtor_data = pd.read_csv(realtor_csv)
process_dataframe_in_chunks(realtor_data,20, api_key, realtor_coordinate_csv)

In [78]:
def clean_and_update_coords(input_csv, output_csv):
    """
    Cleans and updates a coordinates dataset by resetting indices, renaming columns, 
    removing rows with missing coordinates, and saving the cleaned DataFrame.

    Parameters:
        input_csv (str): Path to the input CSV containing raw coordinates.
        output_csv (str): Path to save the cleaned and updated CSV.
        original_columns (list): Original column names from the source dataset.

    Returns:
        pd.DataFrame: The cleaned DataFrame with updated columns.
    """
    # Load the dataset
    coords_df = pd.read_csv(output_csv)
    coords_df.reset_index(drop=True, inplace=True)

    
    # print(coords_df)

    # Update column names
    new_columns = list(coords_df) + ["lat", "long"]
    coords_df.columns = new_columns



    #Do later in cleaning
    # # Drop rows with missing lat/long values
    # coords_df.dropna(subset=['lat', 'long'], inplace=True)
    # print(f"Remaining rows after dropping missing coordinates: {len(coords_df)}")

    # Save the cleaned DataFrame
    coords_df.to_csv(output_csv, index=False)

    return coords_df



In [79]:
original_columns = list(pd.read_csv(realtor_csv).columns)

clean_and_update_coords(realtor_csv, realtor_coordinate_csv).head()


ValueError: Length mismatch: Expected axis has 75 elements, new values have 77 elements

In [63]:
zolo_csv= current_dir.parent/'2_data_cleaning'/'cleaned_csv'/'cleaned_zolo_from_newest_scraping.csv'
zolo_data = pd.read_csv(zolo_csv)
output_csv = current_dir.parent/'2_data_cleaning'/'cleaned_csv'/"zolo_with_coords.csv"


In [67]:

process_dataframe_in_chunks(zolo_data, chunk_size=20, api_key=api_key, output_csv=output_csv)


Resuming from row 1707.


In [71]:
clean_and_update_coords(zolo_csv, output_csv).head()


ValueError: Length mismatch: Expected axis has 244 elements, new values have 223 elements

In [80]:
pd.read_csv(output_csv).columns


Index(['Unnamed: 0', 'price', 'address', 'rooms', 'room dimensions',
       'room_properties', 'Status', 'Type', 'Style', 'Size (sq ft)',
       ...
       'Area Influence', 'Com_cn_fee', 'Ceil Height (ft)', 'Ceiling Height',
       'Crane', 'Industrial Area', 'Industrial Area Units', 'Central Vac',
       'lat', 'long'],
      dtype='object', length=244)