In [128]:
import pandas as pd
import requests
import string
from fuzzywuzzy import fuzz

In [129]:
# Load the 'API_Result' sheet and discard rows whose 'Address' is missing
original_df = pd.read_excel(
    '../Address_Validation_API_Overall_(Confusion Matrix).xlsx',
    sheet_name='API_Result',
).dropna(subset=['Address'])

# Keep only the rows whose 'Address' value is an actual string
original_df = original_df[original_df['Address'].map(lambda value: isinstance(value, str))]

# Preview the cleaned data
original_df.head()

Unnamed: 0,Address,Area,City,Status,Address_validation,Result
0,H E 863 Fatihabad,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0
1,Cavalry ground street 1 villa 1A NA Home,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0
2,House No 152A Street No 6 Block A,Nishat Colony,Lahore Cantt,delivered,Yes,0
3,House no E96d Ali lane street no 7,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0
4,House no E96d Ali lane street no 7,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0


In [130]:
# Work on an independent (deep) copy so that original_df stays untouched
df = original_df.copy()

In [131]:
# Total size of the dataframe as (number of rows, number of columns)
df.shape

  df = df.iloc[:150].append(df.iloc[-50:])


(200, 6)

In [132]:
# # Remove extra spaces from the address Column
# df['Address'] = df['Address'].str.strip()
# df['Address'] = df['Address'].str.replace(' +', ' ')

# # Convert the addresses to lowercase
# df['Address'] = df['Address'].str.lower()

# Mapping from lowercase address abbreviations to their full forms, used to
# standardize free-text addresses. Each key appears exactly once (a repeated
# 'apt' entry was removed; the resulting mapping is unchanged).
# NOTE(review): the single-letter keys ('h', 'n', 's', 'e', 'w') would also
# match standalone block/plot letters (e.g. "block e") if the word-boundary
# replacement loop is re-enabled -- confirm that is intended before using them.
abbreviations = {
    # Street-level and building terms
    'st': 'street',
    'ave': 'avenue',
    'apt': 'apartment',
    'rd': 'road',
    'h': 'house',
    'hse': 'house',
    'blvd': 'boulevard',
    'bldg': 'building',
    'pl': 'place',
    'ln': 'lane',
    'hw': 'highway',
    'hwy': 'highway',
    'expy': 'expressway',
    'exp': 'expressway',
    'expw': 'expressway',
    'ext': 'extension',
    # Compass directions
    'n': 'north',
    's': 'south',
    'e': 'east',
    'w': 'west',
    # Cities, province and country
    'lhr': 'lahore',
    'khi': 'karachi',
    'isb': 'islamabad',
    'rwp': 'rawalpindi',
    'kar': 'karachi',
    'pak': 'pakistan',
    'pnjb': 'punjab',
    'pjb': 'punjab',
    'isl': 'islamabad'
}

# Replace abbreviations with full forms to standardize the data
# for abbrev, full_form in abbreviations.items():
#     df['Address'] = df['Address'].str.replace(r'\b{}\b'.format(abbrev), full_form)

# Build 'Full_Address' as "<Address>, <Area>, <City>"; a row with a missing
# part yields a missing Full_Address
df['Full_Address'] = df['Address'].str.cat([df['Area'], df['City']], sep=', ')

In [133]:
# Preview the first rows, now including the combined 'Full_Address' column
df.head()

Unnamed: 0,Address,Area,City,Status,Address_validation,Result,Full_Address
0,H E 863 Fatihabad,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0,"H E 863 Fatihabad , Cavalry Ground Extension, ..."
1,Cavalry ground street 1 villa 1A NA Home,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0,"Cavalry ground street 1 villa 1A NA Home, Cava..."
2,House No 152A Street No 6 Block A,Nishat Colony,Lahore Cantt,delivered,Yes,0,"House No 152A Street No 6 Block A, Nishat Colo..."
3,House no E96d Ali lane street no 7,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0,"House no E96d Ali lane street no 7, Cavalry Gr..."
4,House no E96d Ali lane street no 7,Cavalry Ground Extension,Lahore Cantt,delivered,Yes,0,"House no E96d Ali lane street no 7 , Cavalry G..."


In [134]:
# Google Geocoding API key and endpoint; both are read as globals by geocode_address().
# NOTE(review): API_KEY is None in this copy of the notebook -- supply a real key before
# running the geocoding cell, preferably loaded from an environment variable or a
# secrets store rather than written into the notebook.
API_KEY = None
GEOCODING_ENDPOINT = "https://maps.googleapis.com/maps/api/geocode/json"

def geocode_address(address, min_similarity=50, timeout=10):
    """Classify an address by geocoding it with the Google Geocoding API.

    Parameters
    ----------
    address : str
        Free-text address sent as the ``address`` query parameter. Any
        non-string value (e.g. a NaN produced by a missing Area/City) is
        classified as 'INVALID' without calling the API.
    min_similarity : int, optional
        Minimum ``fuzz.token_set_ratio`` score between ``address`` and the
        API's ``formatted_address`` for an imprecise result to still count
        as valid. Defaults to 50.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so that one
        stalled connection cannot hang a whole DataFrame ``apply``.
        Defaults to 10.

    Returns
    -------
    str
        'VALID'             - exactly one result that is either tagged with a
                              precise type (street address, premise, ...) or
                              is textually similar enough to the input.
        'INVALID'           - non-string input, zero results, more than one
                              result (ambiguous request), or a single
                              imprecise result below ``min_similarity``.
        'STATUS_CODE_ERROR' - the HTTP request failed or returned a status
                              code other than 200.
        Any other value is the API's own ``status`` string passed through
        unchanged (e.g. 'OVER_QUERY_LIMIT', 'REQUEST_DENIED').
    """
    valid_response = 'VALID'
    invalid_response = 'INVALID'
    status_code_error = 'STATUS_CODE_ERROR'

    # Result types treated as precise enough to accept without a text comparison
    precise_types = {
        'street_address',
        'subpremise',
        'premise',
        'street_number',
        'room',
        'floor',
    }

    # A missing / non-text address cannot be valid; do not spend an API call on it
    if not isinstance(address, str):
        return invalid_response

    params = {
        'address': address,
        'key': API_KEY,
    }

    try:
        response = requests.get(GEOCODING_ENDPOINT, params=params, timeout=timeout)
    except requests.RequestException:
        # Timeouts / connection failures: label this row instead of aborting the whole run
        return status_code_error

    if response.status_code != 200:
        return status_code_error

    geocoding_data = response.json()
    status = geocoding_data['status']

    if status == 'ZERO_RESULTS':
        return invalid_response
    if status != 'OK':
        # Pass every other API status (including 'OVER_QUERY_LIMIT') straight through
        return status

    results = geocoding_data['results']

    # More than one result means the request was ambiguous
    if len(results) > 1:
        return invalid_response

    result = results[0]

    # A precise result type is accepted as-is
    if precise_types.intersection(result['types']):
        return valid_response

    # Otherwise fall back to comparing the input with the API's formatted address
    similarity_score = fuzz.token_set_ratio(address, result['formatted_address'])
    if similarity_score >= min_similarity:
        return valid_response
    return invalid_response

# Geocode every combined address (one geocode_address call per row) and store the label
df['API_Result'] = df['Full_Address'].map(geocode_address)