In [100]:
import polars as pl
import pandas as pd
import json

In [101]:
# Current store listing (JSON), loaded with polars; this frame feeds the "new" pipeline below.
df_data = pl.read_json('data.json')
# pandas copy of the same JSON file. NOTE(review): not referenced by any other
# visible cell - confirm whether it is still needed.
df2 = pd.read_json('data.json')
# Previous store listing kept in the Backup folder. Forward slashes are accepted
# on Windows as well as on Linux/macOS, whereas the raw backslash form only
# resolved to a file inside "Backup" on Windows.
df_old_data = pl.read_excel('Backup/Lojas Assaí.xlsx')

In [102]:
def start_pipeline(dataf):
    """Return ``dataf.clone()``.

    Both pipelines call this first, so every later step works on the clone
    rather than on the frame that was loaded from disk.
    """
    snapshot = dataf.clone()
    return snapshot

def drop_columns_new(dataf):
    """Drop the fields of the JSON-sourced frame that the report does not keep.

    Returns the result of ``dataf.drop`` called with the column names below.
    """
    unused_columns = (
        'url', 'subRegiao', 'subRegiaoTid', 'telefone', 'televendas',
        'horario', 'email', 'ico_sust', 'voce_encontra', 'destaques',
        'eslug', 'eid', 'e', 'tid', 'whatsapp',
        'mapa', 'complemento', 'nid', 'cep', 'loja_id',
    )
    return dataf.drop(*unused_columns)

def drop_columns_old(dataf):
    """Drop the two columns of the Excel-sourced frame that the comparison does not keep.

    Returns the result of ``dataf.drop`` called with those column names.
    """
    unused_columns = ('Unnamed: 4', 'Código Município')
    return dataf.drop(*unused_columns)

def strip_chars_new(dataf):
    """Apply ``str.strip_chars()`` to column "n" and return the updated frame.

    Column "n" is the one later renamed to "Unidade" by ``rename_columns``.
    """
    trimmed_name = pl.col("n").str.strip_chars()
    return dataf.with_columns(trimmed_name)

def strip_chars_old(dataf):
    """Apply ``str.strip_chars()`` to column "Unidade" and return the updated frame."""
    trimmed_name = pl.col("Unidade").str.strip_chars()
    return dataf.with_columns(trimmed_name)

def rename_columns(dataf):
    """Rename the short source keys to the report headers, then cast the coordinates.

    After the rename, "LAT" and "LONG" are cast to ``pl.Float64``.
    """
    header_map = {
        'n': 'Unidade',
        'c': 'Município',
        'uf': 'UF',
        'lat': 'LAT',
        'lon': 'LONG',
        'logradouro': 'Endereço',
    }
    coordinate_types = {"LAT": pl.Float64, "LONG": pl.Float64}
    return dataf.rename(header_map).cast(coordinate_types)

def sort_columns(dataf):
    """Select the report columns in their final left-to-right order.

    Despite the name, no rows are sorted: this is a ``select`` that fixes which
    columns are returned and in what order.
    """
    ordered_headers = ['Unidade', 'Endereço', 'Município', 'UF', 'LAT', 'LONG']
    return dataf.select(ordered_headers)




In [103]:
# JSON listing: clone, drop unused fields, trim names, rename/cast, then fix the column order.
df = df_data.pipe(start_pipeline)
for _step in (drop_columns_new, strip_chars_new, rename_columns, sort_columns):
    df = df.pipe(_step)

# Excel backup listing: clone, trim names, then drop the columns that are not kept.
df_old = df_old_data.pipe(start_pipeline)
for _step in (strip_chars_old, drop_columns_old):
    df_old = df_old.pipe(_step)

In [104]:

# Plain aliases of the two frames (no lowercasing is applied to them).
# NOTE(review): neither alias is referenced by any other visible cell.
df_lower = df
df_old_lower = df_old

# Store names are compared case-insensitively: build the lowercase keys once.
name_key = pl.col("Unidade").str.to_lowercase()
site_keys = df["Unidade"].str.to_lowercase()
backup_keys = df_old['Unidade'].str.to_lowercase()
in_backup = name_key.is_in(backup_keys)

# Stores present in df but missing from df_old -> "Nova".
only_in_df1 = df.filter(~in_backup).with_columns(pl.lit("Nova").alias("status"))

# Stores present in df_old but missing from df -> "Fechou".
only_in_df2 = df_old.filter(~name_key.is_in(site_keys)).with_columns(pl.lit("Fechou").alias("status"))

# Stores present in both (rows taken from df) -> "Antigas".
common_in_both = df.filter(in_backup).with_columns(pl.lit("Antigas").alias("status"))

# Stack the three groups in the order Nova, Fechou, Antigas, exposing the
# store name under the column "value".
df_diff = pl.concat([
    group.rename({"Unidade": "value"})
    for group in (only_in_df1, only_in_df2, common_in_both)
])



In [105]:
# Preview the combined comparison table (one row per store, with its status).
df_diff

value,Endereço,Município,UF,LAT,LONG,status
str,str,str,str,f64,f64,str
"""Assaí Zona Norte""","""Rua Tancredo Neves, 528""","""Macapá""","""AP""",0.06762,-51.05688,"""Nova"""
"""Assaí Manaus Bola da Suframa""","""Rua Francisco Pereira da Silva…","""Manaus""","""AM""",-3.131969,-59.985704,"""Nova"""
"""Assaí Salvador Paralela""","""Avenida Governador Luis Viana …","""Salvador""","""BA""",-12.96494,-38.43848,"""Nova"""
"""Assaí Cais do Porto""","""Av. José Sabóia, 521""","""Fortaleza""","""CE""",-3.71718,-38.46671,"""Nova"""
"""Assaí Montese""","""Avenida Dos Expedicionários, 4…","""Fortaleza""","""CE""",-3.753169,-38.537952,"""Nova"""
…,…,…,…,…,…,…
"""Assaí Taboão da Serra""","""Rodovia Regis Bittencourt, 340""","""Taboão da Serra""","""SP""",-23.613221,-46.781066,"""Antigas"""
"""Assaí Marginal Tietê - Tatuapé""","""Rua Ulisses Cruz, nº 993""","""Tatuapé""","""SP""",-23.529796,-46.578482,"""Antigas"""
"""Assaí Taubaté""","""Avenida Dom Pedro I, 630 E,""","""Taubaté""","""SP""",-23.024431,-45.55644,"""Antigas"""
"""Assaí Palmas""","""Avenida Joaquim Teotônio Segur…","""Palmas""","""TO""",-10.250802,-48.333348,"""Antigas"""


In [106]:
# Export the cleaned listing and the comparison table to separate workbooks,
# both using the same worksheet name.
sheet_name = 'lojasSite'
df.write_excel("lojasAssai.xlsx", worksheet=sheet_name)
df_diff.write_excel("lojasAssaiDiff.xlsx", worksheet=sheet_name)


<xlsxwriter.workbook.Workbook at 0x19be891a6f0>

In [107]:
import polars as pl
from geopy.distance import great_circle

def _nearest_store(origin, candidates):
    """Return ``(name, km)`` for the candidate closest to ``origin``.

    ``origin`` and each candidate are row dicts with "value", "LAT" and "LONG"
    keys. Candidates that share the origin's name (the store itself) or that
    lack a coordinate are ignored. Returns ``(None, None)`` when the origin has
    no coordinates or when no candidate qualifies. On ties the first candidate
    encountered wins, matching ``min()`` over the candidates in order.
    """
    if origin["LAT"] is None or origin["LONG"] is None:
        return None, None
    origin_coords = (origin["LAT"], origin["LONG"])
    best_name, best_km = None, None
    for candidate in candidates:
        if candidate["value"] == origin["value"]:
            continue  # skip the store itself
        if candidate["LAT"] is None or candidate["LONG"] is None:
            continue  # great_circle cannot work with a missing coordinate
        km = great_circle(origin_coords, (candidate["LAT"], candidate["LONG"])).kilometers
        if best_km is None or km < best_km:
            best_name, best_km = candidate["value"], km
    return best_name, best_km


# Subsets by status. nova_df is no longer needed for the computation below but
# is still defined because it was part of this cell's namespace.
nova_df = df_diff.filter(pl.col("status") == "Nova")
antigo_df = df_diff.filter(pl.col("status") == "Antigas")

# Materialise the rows once instead of re-iterating the frames for every store.
all_rows = list(df_diff.iter_rows(named=True))
antigo_rows = list(antigo_df.iter_rows(named=True))

closest_values_old, closest_distances_old = [], []
closest_values_all, closest_distances_all = [], []

# Walk df_diff itself so every result lands on the row it belongs to,
# regardless of where the "Nova" rows sit in the frame.
for row in all_rows:
    # Nearest pre-existing ("Antigas") store: only computed for "Nova" rows.
    if row["status"] == "Nova":
        name_old, km_old = _nearest_store(row, antigo_rows)
    else:
        name_old, km_old = None, None
    closest_values_old.append(name_old)
    closest_distances_old.append(km_old)

    # Nearest other store of any status: computed for every row.
    name_all, km_all = _nearest_store(row, all_rows)
    closest_values_all.append(name_all)
    closest_distances_all.append(km_all)

# Attach the four result columns to the comparison table.
df_diff = df_diff.with_columns([
    pl.Series("closest_value_old", closest_values_old),
    pl.Series("closest_distance_old", closest_distances_old),
    pl.Series("closest_value_all", closest_values_all),
    pl.Series("closest_distance_all", closest_distances_all),
])

df_diff.write_excel("lojasAssaiMenorDistancia_Test.xlsx", worksheet='lojasSite')



<xlsxwriter.workbook.Workbook at 0x19be88e42f0>

In [108]:
import pandas as pd
import plotly.express as px

# Constant-valued column passed as `size` below so every marker is drawn at the same size.
df_diff = df_diff.with_columns(
    dummy_column_for_size = 1.
)

# One marker per store, coloured by comparison status; the store name is shown on hover.
# The figure height is set once, in the layout update below.
fig = px.scatter_mapbox(
    df_diff,
    lat="LAT",
    lon="LONG",
    hover_name="value",
    color='status',
    size='dummy_column_for_size',
    size_max=10,
    zoom=4,
)

# Single layout update. Previously the style was set to "open-street-map" and
# immediately replaced by "carto-positron", and the height was set to 600 and
# then to 1080; only the final values are kept, so the figure is unchanged.
fig.update_layout(
    mapbox_style="carto-positron",
    autosize=True,   # let the width follow the output area
    height=1080,     # fixed height in pixels
    margin={"r":0,"t":0,"l":0,"b":0},
)

# Show the interactive map
fig.show()
# Export the map as an HTML file
fig.write_html("localizacoesAssai.html")

# Optionally, export as an image (png, jpg, svg)
# fig.write_image("brazil_map_image.png", scale=3)