In [None]:
from glob import glob
import pandas as pd
import json

In [None]:
def get_car_data(id_car, df):
    """Return the rows of ``df`` whose ``ID_Auto`` value equals ``id_car``.

    Args:
        id_car: Value compared against the ``ID_Auto`` column.
        df: DataFrame that contains an ``ID_Auto`` column.

    Returns:
        The matching rows of ``df`` (empty when nothing matches).
    """
    is_requested_car = df['ID_Auto'] == id_car
    return df.loc[is_requested_car]

def get_last_file(path: str, ext: str) -> str:
    """Return the lexicographically last file in ``path`` whose name ends with ``ext``.

    Args:
        path: Directory to search (not recursive).
        ext: File-name suffix to match, e.g. ``'.csv'``.

    Returns:
        The matching path that sorts last as a string.

    Raises:
        FileNotFoundError: If no file in ``path`` ends with ``ext``.
    """
    filenames = glob(f"{path}/*{ext}")
    if not filenames:
        # Fail with the searched location instead of an opaque IndexError.
        raise FileNotFoundError(f"No files matching '*{ext}' were found in '{path}'")
    # NOTE(review): "last" is by string order, not modification time; this picks
    # the newest file only if names embed a sortable timestamp — confirm.
    return max(filenames)

In [None]:
def _split_car_name(name):
    """Split a car name into [Marca, Modelo, Version, Tipo, Año] values."""
    if not isinstance(name, str):
        # IDs without an entry in the lookup arrive as NaN: no name parts.
        return [None] * 5
    tokens = name.split()
    if len(tokens) <= 5:
        # Fill the columns left to right; anything missing stays empty.
        return tokens + [None] * (5 - len(tokens))
    # NOTE(review): assumes the first two tokens are brand and model, the last
    # two are body type and year, and every token in between belongs to the
    # version. Multi-word brands would be split wrongly — confirm against real
    # slugs. Names with exactly five tokens are unaffected by this assumption.
    return [tokens[0], tokens[1], ' '.join(tokens[2:-2]), tokens[-2], tokens[-1]]


def add_cars_names(df_fin, kavak_cards_json_path):
    """Add ``Marca``, ``Modelo``, ``Version``, ``Tipo`` and ``Año`` parsed from car slugs.

    Reads the JSON Lines file at ``kavak_cards_json_path`` (one JSON object per
    line, each with ``id`` and ``slug`` keys), turns the last ``/``-separated
    segment of every slug into a title-cased, space-separated name, and looks
    that name up for each ``ID_Auto`` in ``df_fin``.

    Args:
        df_fin: DataFrame with an ``ID_Auto`` column. It is not modified.
        kavak_cards_json_path: Path of the JSON Lines file to read.

    Returns:
        A copy of ``df_fin`` with the five columns added. Values are strings;
        rows whose ID is not in the file get missing values in all five.
    """
    # Lookup table {car id as str: car name}.
    autos_dict = {}
    with open(kavak_cards_json_path, 'r', encoding='utf-8') as f:  # TODO: change the paths
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            car = json.loads(line)
            name = car['slug'].split('/')[-1].replace('-', ' ').title()
            # The lookup below uses string IDs, so the keys must be strings as
            # well; otherwise numeric ids in the JSON would never match.
            autos_dict[str(car['id'])] = name

    # VLOOKUP by ID.
    names = df_fin['ID_Auto'].astype(str).map(autos_dict)
    split_rows = [_split_car_name(name) for name in names]

    # Work on a copy so the caller's frame does not gain columns as a side effect.
    df_named = df_fin.copy()
    for position, column in enumerate(['Marca', 'Modelo', 'Version', 'Tipo', 'Año']):
        df_named[column] = [row[position] for row in split_rows]

    return df_named

In [None]:
def financial_data_calculation(df_financial):
    """Append derived financing columns to ``df_financial`` and return it.

    The frame is modified in place. Columns added, in this order:

    * ``Total_a_Pagar``: ``Mensualidad * Plazo + Enganche_Simulado``.
    * ``Interes``: ``Total_a_Pagar - Precio`` (extra cost over the price).
    * ``Interes_%``: ``Interes`` as a percentage of ``Precio``.
    * ``Enganche_Min_%`` / ``Enganche_Max_%``: the down-payment bounds as a
      percentage of ``Precio``.

    Args:
        df_financial: DataFrame with ``Precio``, ``Mensualidad``, ``Plazo``,
            ``Enganche_Simulado``, ``Enganche_Min`` and ``Enganche_Max``.

    Returns:
        The same ``df_financial`` object, with the new columns.
    """
    precio = df_financial['Precio']

    # Everything paid over the loan: all instalments plus the down payment.
    cuotas = df_financial['Mensualidad'] * df_financial['Plazo']
    df_financial['Total_a_Pagar'] = cuotas + df_financial['Enganche_Simulado']

    # Extra cost over the price, absolute and relative to the price.
    df_financial['Interes'] = df_financial['Total_a_Pagar'] - precio
    df_financial['Interes_%'] = (df_financial['Interes'] * 100) / precio

    # Down-payment bounds expressed as a share of the price.
    for source, target in (
        ('Enganche_Min', 'Enganche_Min_%'),
        ('Enganche_Max', 'Enganche_Max_%'),
    ):
        df_financial[target] = (df_financial[source] * 100) / precio

    return df_financial

In [None]:
def missing_data_integration(json_kavak_cards, df, df_prod):
    """Merge product details (km, city, gearbox, offer, year) into the financial data.

    Args:
        json_kavak_cards: Unused; kept so existing calls keep working.
        df: Financial DataFrame with ``ID_Auto`` and ``Año`` columns.
        df_prod: Product DataFrame with ``id``, ``slug``, ``price``, ``details``,
            ``year``, ``city``, ``gear``, ``km`` and ``discount_offer`` columns.
            It is not modified.

    Returns:
        The inner join of ``df`` and the product data on ``ID_Auto``. When the
        year agrees on every row, a single ``Año`` column (taken from the
        product data) is kept. Otherwise both ``Año_fin`` and ``Año_prod`` are
        left in the result and no ``Año`` column exists.
    """
    # Build the product view on a derived frame so the caller's df_prod keeps
    # its original 'id' dtype and columns.
    product_info = (
        df_prod
        .drop(columns=['slug', 'price', 'details'])  # not needed downstream
        .assign(id=lambda frame: frame['id'].astype('string'))
        .rename(columns={
            'id': 'ID_Auto',
            'year': 'Año',
            'city': 'Ciudad',
            'gear': 'Caja',
            'km': 'Km',
            'discount_offer': 'Oferta',
        })
    )

    df_merged = df.merge(
        product_info,
        how='inner',
        on='ID_Auto',
        suffixes=('_fin', '_prod'),
    )

    # Collapse the two year columns only when they agree on every row.
    year_mismatches = df_merged[df_merged['Año_fin'] != df_merged['Año_prod']]
    if year_mismatches.empty:
        df_merged = (
            df_merged
            .rename(columns={'Año_prod': 'Año'})
            .drop(columns=['Año_fin'])
        )

    return df_merged

In [None]:
def main():
    """Build the cleaned CSV from the latest financial CSV and car-cards JSONL.

    Steps: locate the last ``.csv`` / ``.jsonl`` input files, load them, add the
    car-name columns and derived financial columns, merge in the product
    details, cast column dtypes and write the result to
    ``../data/csv/cleaned_final_csv_scrap_completo.csv``.

    Raises:
        Exception: Whatever ``get_last_file`` raised when an input file could
            not be located; it is re-raised after the message is printed.
    """
    try:
        financial_csv_path = get_last_file('../data/processed/csv/financial_data/', '.csv')
        kavak_cards_json_path = get_last_file('../data/processed/json/', '.jsonl')
    except Exception:
        print("No se lograron localizar los archivos necesarios")
        # Stop here: continuing would only fail later with a NameError on the
        # paths that were never assigned.
        raise

    # The cards file is JSON Lines (one object per line), so lines=True is needed.
    df_auto_cards = pd.read_json(kavak_cards_json_path, encoding='utf-8', lines=True)

    # CSV columns to load and the dtype of each:
    # ID_Auto,Precio,Tasa_Servicio,Plazo,Mensualidad,Tasa_Interes,Seguro,Enganche_Simulado,Enganche_Min,Enganche_Max
    dtypes = {
        'ID_Auto': 'string',
        'Precio': 'float32',
        'Tasa_Servicio': 'float32',
        'Plazo': 'Int16',
        'Mensualidad': 'Int32',
        'Tasa_Interes': 'float32',
        'Seguro': 'float32',
        'Enganche_Simulado': 'float32',
        'Enganche_Min': 'float32',
        'Enganche_Max': 'float32',
    }

    financial_cols_to_load = list(dtypes.keys())

    # Financial DataFrame produced by the Enricher.
    df_financial = pd.read_csv(
        financial_csv_path,
        encoding='utf-8',
        usecols=lambda c: c in financial_cols_to_load,
        dtype=dtypes,
    )

    # add_cars_names opens the JSONL file itself, so it needs the path rather
    # than the already-loaded DataFrame.
    df_financial = add_cars_names(df_financial, kavak_cards_json_path)
    df_financial = financial_data_calculation(df_financial)

    # Cast the car-name columns.
    df_financial = df_financial.astype({
        'Marca': 'category',
        'Modelo': 'string',
        'Version': 'string',
        'Año': 'Int64',
        'Tipo': 'category',
    })

    df_merged = missing_data_integration(kavak_cards_json_path, df_financial, df_auto_cards)

    # Cast the columns that came from the product data.
    df_merged = df_merged.astype({
        'Ciudad': 'category',
        'Año': 'Int64',
        'Caja': 'category',
        'Km': 'Int64',
        'Oferta': 'category',
    })

    df_merged.to_csv('../data/csv/cleaned_final_csv_scrap_completo.csv', encoding='utf-8', index=False)

In [None]:
# Run the pipeline only when this code is executed directly, i.e. when
# ``__name__`` is ``'__main__'``.
if __name__ == '__main__':
    main()