<a href="https://colab.research.google.com/github/weedge/doraemon-nb/blob/main/redisxann_usearch_implement_Geo_Spatial_Indexing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run the redisxann-usearch server

In [None]:
# Downloads run_redisxann.sh from the RedisXANN repository and pipes it straight into bash.
# NOTE(review): the script is executed without being inspected or pinned to a commit — review it before running.
!curl "https://raw.githubusercontent.com/weedge/RedisXANN/main/scripts/run_redisxann.sh" | bash

In [82]:
# List running processes whose command line mentions "redis" (excluding the grep itself).
!ps -ef | grep redis | grep -v grep

root       83060       1  0 14:07 ?        00:00:00 redis-server *:6666


# Data
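1. https://www.kaggle.com/datasets/liewyousheng/geolocation
2. https://github.com/dr5hn/countries-states-cities-database

Note: the download cell below uses a signed, time-limited URL (`X-Goog-Date=20231020`, `X-Goog-Expires=259200` seconds, i.e. three days). If it no longer works, download the archive from source 1 and unzip it into `./data`.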

Total Regions : 6

Total Sub Regions : 22

Total Countries : 250

Total States/Regions/Municipalities : 5,081

Total Cities/Towns/Districts : 150,540

Last Updated On : 3rd Sept 2023



In [None]:
!wget "https://storage.googleapis.com/kaggle-data-sets/1991032/3288213/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20231020%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20231020T102623Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=53c07f605a39c58a922da41f77b81a9707880f50f8280c52886c68826f290296bbb8b46254ed1764ca67ea4e2d440c53829140401f617618a62c9c6cdbd72a6cdb51f0ca37346afae4f105d1e9cb8cce8a160ec60f3a2aac0f7755a405a778d9a01b0a83446da2a59dad5b792aa36993e471b49f6ad5a817bdfebb966a230621018b89d27d4fbc7b2c30d33b264b54fb93b7be2a8f001c9ca23d2060d805d5bb8c4e2aab9a1539d53a2267f42c3f88d3e46554305e031f8b2abdff5a333cff409a3778b8b164b0d0fc520d820e79acfeb35bcd1cedb1fbb6fd7befa0e643ac592fe9d9da40114119d6a7777b5be2c5c84f407dedd0686239ca7a452c60d12c0c" -O data.zip \
  && unzip data.zip -d ./data

## Use local usearch to implement geospatial indexing

In [None]:
!pip install usearch

In [63]:
from usearch.index import Index

import pandas as pd
import numpy as np
import geocoder

# Query point: the lat/lng pair returned by geocoder's lookup for 'me'.
my_coordinates = np.array(geocoder.ip('me').latlng, dtype=np.float32)
print(my_coordinates)

# One (latitude, longitude) float32 row per CSV record; the row position
# is used as the key stored in the index.
df = pd.read_csv('./data/cities.csv')
coordinates = np.column_stack((
    df['latitude'].to_numpy(dtype=np.float32),
    df['longitude'].to_numpy(dtype=np.float32),
))
labels = np.array(range(len(df)), dtype=np.longlong)
print(len(coordinates), len(labels))

# Build a 2-dimensional index with the haversine metric and fill it in one call.
index = Index(metric='haversine', ndim=2)
index.add(labels, coordinates)

# Look up the 10 closest entries and map their keys back to DataFrame rows.
matches = index.search(my_coordinates, 10)
print(matches.keys, matches.distances)
print(df.iloc[matches.keys])


[ 32.8546 -79.9748]
148061 148061
[142691 142613 142544 142545 142680 142603 142632 142629 142744 142663] [0.         0.00109083 0.00142492 0.00187566 0.00213352 0.00236609
 0.00236659 0.00295878 0.00320069 0.00328491]
            id               name  state_id state_code      state_name  \
142691  123005   North Charleston      1443         SC  South Carolina   
142613  117942            Hanahan      1443         SC  South Carolina   
142544  113765         Charleston      1443         SC  South Carolina   
142545  113769  Charleston County      1443         SC  South Carolina   
142680  122376     Mount Pleasant      1443         SC  South Carolina   
142603  117394        Goose Creek      1443         SC  South Carolina   
142632  119117       James Island      1443         SC  South Carolina   
142629  119032      Isle of Palms      1443         SC  South Carolina   
142744  127192   Sullivans Island      1443         SC  South Carolina   
142663  120553       Lincolnville      14

## Use redisxann-usearch to implement geospatial indexing

In [25]:
!pip install redisx

Collecting redisx
  Downloading redisx-0.1.7-py3-none-any.whl (7.3 kB)
Collecting redis==5.0.0 (from redisx)
  Downloading redis-5.0.0-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.1/250.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, redisx
Successfully installed redis-5.0.0 redisx-0.1.7


In [30]:
from random import random
from redisx.client import Client
import numpy as np


# change the following configuration for your redis.
# Port 6666 matches the redis-server process shown listening on *:6666 in
# the `ps` output earlier in this notebook.
REDIS_HOST = "localhost"
REDIS_PORT = 6666
REDIS_DB = 0
# Username and password are left empty here; get_client() passes them to Client unchanged.
REDIS_USERNAME = ""
REDIS_PASSWORD = ""


def get_client() -> Client:
    """Build a redisx ``Client`` from the module-level ``REDIS_*`` settings."""
    connection_settings = {
        "host": REDIS_HOST,
        "port": REDIS_PORT,
        "db": REDIS_DB,
        "username": REDIS_USERNAME,
        "password": REDIS_PASSWORD,
    }
    return Client(**connection_settings)


In [43]:
from typing import Union

from redis import ResponseError
from redisx.define import UsearchQuantizationType,DistanceMetric
from redisx.ann_usearch import VectorType

cli = get_client()

def create_index(index_name: str, dim:int):
    """Create an index with the Haversine metric and F32 quantization.

    Returns the result of ``cli.create_index``. If the call raises a redis
    ``ResponseError``, the error is printed and ``None`` is returned.
    """
    index_options = {
        "metric": DistanceMetric.Haversine,
        "quantization": UsearchQuantizationType.F32,
    }
    try:
        created = cli.create_index(index_name, dim, **index_options)
    except ResponseError as err:
        print(err)
        return None
    return created

def get_index(index_name: str):
    """Return ``cli.get_index(index_name)``.

    A redis ``ResponseError`` is printed and turned into ``None``.
    """
    try:
        info = cli.get_index(index_name)
    except ResponseError as err:
        print(err)
        info = None
    return info

def delete_index(index_name: str):
    """Delete an index via ``cli.del_index`` and return its result.

    A redis ``ResponseError`` is printed and turned into ``False``.
    """
    try:
        outcome = cli.del_index(index_name)
    except ResponseError as err:
        print(err)
        return False
    else:
        return outcome

def add_vector(index_name: str, name: str, vector: Union[VectorType, str]):
    """Add ``vector`` under the string key ``name`` via ``cli.add_vector``.

    Returns the client's result. A redis ``ResponseError`` is printed and
    turned into ``None``.
    """
    try:
        added = cli.add_vector(index_name, name, vector)
    except ResponseError as err:
        print(err)
        return None
    return added

def add_vector_id(index_name: str, id: int, vector: Union[VectorType, str]):
    """Add ``vector`` under the integer key ``id`` via ``cli.add_vector_id``.

    Returns the client's result. A redis ``ResponseError`` is printed and
    turned into ``None``.
    """
    try:
        added = cli.add_vector_id(index_name, id, vector)
    except ResponseError as err:
        print(err)
        added = None
    return added

def get_vector(index_name: str, name: str):
    """Return ``cli.get_vector(index_name, name)``.

    A redis ``ResponseError`` is printed and turned into ``None``.
    """
    try:
        stored = cli.get_vector(index_name, name)
    except ResponseError as err:
        print(err)
        return None
    else:
        return stored

def del_vector(index_name: str, name: str):
    """Delete the vector stored under ``name`` via ``cli.del_vector``.

    Returns the client's result. A redis ``ResponseError`` is printed and
    turned into ``False``.
    """
    try:
        outcome = cli.del_vector(index_name, name)
    except ResponseError as err:
        print(err)
        outcome = False
    return outcome

def kann_search(index_name: str, k: int, query_vector: Union[VectorType, str]):
    """Run ``cli.kann_search`` for ``k`` results around ``query_vector``.

    Returns the client's result. A redis ``ResponseError`` is printed and
    turned into ``False`` (callers test for ``False`` to detect failure).
    """
    try:
        hits = cli.kann_search(index_name, k, query_vector)
    except ResponseError as err:
        print(err)
        return False
    return hits


In [83]:
# Delete any existing "gis_index" before it is re-created in the next cell.
delete_index("gis_index")

1

In [84]:
# Create a 2-dimensional index, then display its metadata as the cell output.
create_index("gis_index",2)
get_index("gis_index")

{'name': 'usearch.gis_index',
 'dimensions': 2,
 'metric': 'Haversine',
 'quantization': 'F32',
 'connectivity': 10,
 'expansion_add': 128,
 'expansion_search': 3,
 'serialization_file_path': '/content/0.usearch.gis_index.idx',
 'serialized_length': 112,
 'index_size': 0,
 'index_capacity': 10,
 'index_mem_usage': 336}

In [86]:
import pandas as pd
import numpy as np
import geocoder
import ctypes


def load_city_coordinates(path: str, index_name: str = "gis_index"):
    """Read a cities CSV and add each row's coordinates to a vector index.

    Args:
        path: CSV file containing ``latitude`` and ``longitude`` columns.
        index_name: Name of the index to add to. Defaults to ``"gis_index"``.

    Each row is added through ``add_vector_id`` with its zero-based row
    position as the id and a float32 ``[latitude, longitude]`` pair as the
    vector. The function prints the label and coordinate counts and returns
    nothing. Per-row failures are handled inside ``add_vector_id`` and do not
    stop the loop.
    """
    df = pd.read_csv(path)
    coordinates = np.zeros((df.shape[0], 2), dtype=np.float32)
    coordinates[:, 0] = df['latitude'].to_numpy(dtype=np.float32)
    coordinates[:, 1] = df['longitude'].to_numpy(dtype=np.float32)
    labels = np.array(range(df.shape[0]), dtype=np.longlong)
    print(len(labels),len(coordinates))
    for label, coordinate in zip(labels, coordinates):
        # int() converts the numpy integer to a plain Python int of any size;
        # ctypes.c_long is only 32 bits wide on some platforms (e.g. Windows).
        add_vector_id(index_name, int(label), coordinate)

def do_search(coordinate: Union[VectorType, str],k: int):
    """Search "gis_index" for ``k`` results around ``coordinate`` and print them.

    The ``id`` of every entry in the result's ``"vals"`` list is used as a
    row position into the module-level DataFrame ``df``, which must already
    exist in the notebook namespace. Nothing is printed when ``kann_search``
    reports failure by returning ``False``.
    """
    response = kann_search("gis_index", k, coordinate)
    if response is False:
        return
    row_positions = [hit["id"] for hit in response["vals"]]
    print(df.iloc[row_positions])


In [85]:
# Add every row of the cities CSV to "gis_index"; this makes one add_vector_id call per row.
load_city_coordinates('./data/cities.csv')

148061
148061


In [87]:
# Display the index metadata again now that the vectors have been added.
get_index("gis_index")

{'name': 'usearch.gis_index',
 'dimensions': 2,
 'metric': 'Haversine',
 'quantization': 'F32',
 'connectivity': 10,
 'expansion_add': 128,
 'expansion_search': 3,
 'serialization_file_path': '/content/0.usearch.gis_index.idx',
 'serialized_length': 15398632,
 'index_size': 148061,
 'index_capacity': 296120,
 'index_mem_usage': 40117824}

In [88]:
# Query point: the lat/lng pair from geocoder's lookup for 'me', as a float32 array.
my_coordinate = np.array(geocoder.ip('me').latlng, dtype=np.float32)
my_coordinate

array([ 32.8546, -79.9748], dtype=float32)

In [89]:
# Print the DataFrame rows for the 10 results returned for the query point.
do_search(my_coordinate, 10)

            id               name  state_id state_code      state_name  \
142691  123005   North Charleston      1443         SC  South Carolina   
142613  117942            Hanahan      1443         SC  South Carolina   
142544  113765         Charleston      1443         SC  South Carolina   
142545  113769  Charleston County      1443         SC  South Carolina   
142680  122376     Mount Pleasant      1443         SC  South Carolina   
142632  119117       James Island      1443         SC  South Carolina   
142603  117394        Goose Creek      1443         SC  South Carolina   
142744  127192   Sullivans Island      1443         SC  South Carolina   
142629  119032      Isle of Palms      1443         SC  South Carolina   
142642  119827             Ladson      1443         SC  South Carolina   

        country_id country_code   country_name  latitude  longitude wikiDataId  
142691         233           US  United States  32.85462  -79.97481    Q847538  
142613         233     

# Reference
1. https://ashvardanian.com/posts/abusing-vector-search/