In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from pathlib import Path


# En Jupyter, usar el directorio de trabajo actual
BASE_DIR = Path.cwd().parent
# Para archivos en el mismo directorio
account = BASE_DIR / "data.csv"

# Para archivos en subdirectorios
data_file = BASE_DIR / "data" / "raw" / "accounts.csv"


def read_data():

    accounts_test = pd.read_csv(f'{BASE_DIR}/data/accounts_test.csv',sep=',')
    accounts_train = pd.read_csv(f'{BASE_DIR}/data/accounts_train.csv',sep=',')
    
    quotes_test = pd.read_csv(f'{BASE_DIR}/data/quotes_test.csv',sep=',')
    quotes_train = pd.read_csv(f'{BASE_DIR}/data/quotes_train.csv',sep=',')

    return accounts_train,accounts_test, quotes_train, accounts_train

accounts_train,accounts_test, quotes_train, accounts_train = read_data()

In [17]:
duplicated = quotes_train.groupby('account_uuid').agg(counts=('account_uuid','count')).reset_index()
duplicated_uuid = list(duplicated[duplicated['counts']>1]['account_uuid'].values)

In [18]:
quotes_agg = quotes_train.groupby('account_uuid').agg({
    'premium': lambda x: x[quotes_train.loc[x.index, 'convert'] == 1].sum(),
    'product': 'count',  # total de productos cotizados
    'convert': 'sum'     # total de productos comprados
})

quotes_agg.reset_index(inplace=True)
quotes_agg.columns = ['account_uuid','account_value', 'total_quoted_products', 'total_converted_products']

In [19]:
data = quotes_agg.merge(accounts_train, on='account_uuid', how='left')

In [20]:
# Crear features de alto nivel de industry
def simplify_industry(industry):
    if pd.isna(industry):
        return 'Unknown'
    elif 'retail' in industry.lower():
        return 'Retail'
    elif 'professional' in industry.lower() or 'technical' in industry.lower():
        return 'Professional_Services'
    elif 'food' in industry.lower() or 'restaurant' in industry.lower():
        return 'Food_Service'
    elif 'construction' in industry.lower() or 'contractor' in industry.lower():
        return 'Construction'
    elif 'manufacturing' in industry.lower():
        return 'Manufacturing'
    else:
        return 'Other'


In [21]:
def preprocess_categorical_features(data):
    """
    Preprocessing completo de features categóricas
    """
    data = data.copy()
    
    # 1. Simplificar industry
    data['industry_simplified'] = data['industry'].apply(simplify_industry)
    
    # 2. Simplificar business_structure
    top_structures = data['business_structure'].value_counts().head(4).index
    data['business_structure_simplified'] = data['business_structure'].apply(
        lambda x: x if x in top_structures else 'Other'
    )
    
    # 3. One-hot encoding
    categorical_cols = ['state', 'industry_simplified', 'business_structure_simplified']
    
    for col in categorical_cols:
        dummies = pd.get_dummies(data[col], prefix=col, drop_first=True)
        data = pd.concat([data, dummies], axis=1)
    
    # 4. Target encoding para subindustry (solo si tienes target)
    if 'account_value' in data.columns:
        subindustry_means = data.groupby('subindustry')['account_value'].mean()
        data['subindustry_target_encoded'] = data['subindustry'].map(subindustry_means)
        data['subindustry_target_encoded'].fillna(data['account_value'].mean(), inplace=True)
    
    # 5. Features derivadas
    data['business_age'] = 2024 - data['year_established']
    data['revenue_per_employee'] = data['annual_revenue'] / (data['num_employees'] + 1)
    data['payroll_ratio'] = data['total_payroll'] / (data['annual_revenue'] + 1)
    
    # 6. Features booleanas
    data['is_mature_business'] = (data['business_age'] >= 5).astype(int)
    data['has_employees'] = (data['num_employees'] > 0).astype(int)
    data['has_payroll'] = (data['total_payroll'] > 0).astype(int)
    
    # 7. Eliminar columnas originales categóricas
    cols_to_drop = ['industry', 'subindustry', 'business_structure'] + categorical_cols
    data.drop(columns=[col for col in cols_to_drop if col in data.columns], inplace=True)
    
    return data

# Aplicar preprocessing
data_processed = preprocess_categorical_features(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['subindustry_target_encoded'].fillna(data['account_value'].mean(), inplace=True)


In [22]:
# Opción 3: Función de preprocessing
def convert_booleans_to_int(df):
    """Convierte todas las columnas booleanas a int"""
    df = df.copy()
    bool_cols = df.select_dtypes(include=['bool']).columns
    for col in bool_cols:
        df[col] = df[col].astype(int)
    return df

data_processed = convert_booleans_to_int(data_processed )

In [23]:
data_processed

Unnamed: 0,account_uuid,account_value,total_quoted_products,total_converted_products,year_established,annual_revenue,total_payroll,num_employees,state_AL,state_AR,...,business_structure_simplified_Limited Liability Company,business_structure_simplified_Non-Profit,business_structure_simplified_Other,subindustry_target_encoded,business_age,revenue_per_employee,payroll_ratio,is_mature_business,has_employees,has_payroll
0,000f56d0-da32c-68f2-e611e-5de77846f8,3296.84,4,2,2010.0,100000.0,0.0,2.0,0,0,...,1,0,0,3296.840000,14.0,33333.333333,0.000000,1,1,0
1,00118ffe-d450f-c2ca-4ba15-d1eec8e54a,475.00,1,1,2017.0,25000.0,25000.0,0.0,0,0,...,1,0,0,1298.212857,7.0,25000.000000,0.999960,1,0,1
2,0012a66e-7171-4f07-ac42-029f88fb4d2c,2022.37,4,1,2016.0,150000.0,100000.0,3.0,0,0,...,1,0,0,777.421250,8.0,37500.000000,0.666662,1,1,1
3,00287d05-bb1b4-d1ab-08441-d6d09641c4,799.07,1,1,2015.0,75000.0,14400.0,1.0,0,0,...,1,0,0,1673.321805,9.0,37500.000000,0.191997,1,1,1
4,0051ff1a-8b63-4a6f-b96c-81a21ba815de,452.50,1,1,2007.0,30000.0,2000.0,1.0,0,0,...,0,0,0,1618.555000,17.0,15000.000000,0.066664,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5704,ffe20bd9-e1e48-45d9-ccbbd-22b89b3ff4,650.00,1,1,2017.0,50000.0,0.0,0.0,0,0,...,0,0,0,482.348333,7.0,50000.000000,0.000000,1,0,0
5705,ffe981b3-db743-1ff3-1217a-88fd042cbc,862.24,1,1,2017.0,65000.0,5000.0,1.0,0,0,...,0,0,0,1718.106277,7.0,32500.000000,0.076922,1,1,1
5706,ffecef7a-db75-44ed-a07d-61847c69a6e5,150.00,2,1,2018.0,18000.0,0.0,0.0,0,0,...,1,0,0,1081.946093,6.0,18000.000000,0.000000,1,0,0
5707,fff98577-def8-4ccb-9c9c-29a54b294739,616.00,1,1,2013.0,50000.0,0.0,0.0,0,0,...,0,0,0,1168.837500,11.0,50000.000000,0.000000,1,0,0


In [15]:
data_processed.to_csv(f'{BASE_DIR}/data/features.csv', index=False)