In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from pathlib import Path


# En Jupyter, usar el directorio de trabajo actual
BASE_DIR = Path.cwd().parent
# Para archivos en el mismo directorio
account = BASE_DIR / "data.csv"

# Para archivos en subdirectorios
data_file = BASE_DIR / "data" / "raw" / "accounts.csv"


def read_data():

    accounts_test = pd.read_csv(f'{BASE_DIR}/data/accounts_test.csv',sep=',')
    accounts_train = pd.read_csv(f'{BASE_DIR}/data/accounts_train.csv',sep=',')
    
    quotes_test = pd.read_csv(f'{BASE_DIR}/data/quotes_test.csv',sep=',')
    quotes_train = pd.read_csv(f'{BASE_DIR}/data/quotes_train.csv',sep=',')

    return accounts_train,accounts_test, quotes_train, accounts_train

accounts_train,accounts_test, quotes_train, accounts_train = read_data()

In [4]:
quotes_agg = quotes_train.groupby('account_uuid').agg({
    'premium': lambda x: x[quotes_train.loc[x.index, 'convert'] == 1].sum(),
    'product': 'count',  # total de productos cotizados
})

quotes_agg.reset_index(inplace=True)
quotes_agg.columns = ['account_uuid','account_value', 'total_quoted_products']

data = accounts_train.merge(quotes_agg, on='account_uuid', how='left')

In [5]:
dic_states = {    'California': 'CA',
    'New York': 'NY',
    'PA - Pennsylvania': 'PA',
    'Washington DC': 'DC',
    'Oregon': 'OR',
    'Florida': 'FL'}


data['state'] = data['state'].map(dic_states).fillna(data['state'])

In [9]:
data['state'].unique()

array(['PA', 'TX', 'NY', 'CA', 'AZ', 'FL', 'GA', 'NC', 'VA', 'OH', 'AR',
       'IN', 'AL', 'MI', 'NJ', 'MO', 'MD', 'UT', 'KY', 'LA', 'WA', 'CO',
       'SC', 'IL', 'MS', 'WI', 'DE', 'OK', 'MA', 'ME', 'NH', 'ID', nan,
       'MT', 'WV', 'TN', 'OR', 'CT', 'MN', 'RI', 'NV', 'KS', 'NE', 'NM',
       'DC', 'VT', 'SD', 'IA', 'WY', 'ND', 'HI', 'AK'], dtype=object)

In [10]:
data['industry'].unique()

array(['Retail Trade', 'Contractors', nan,
       'Professional, Scientific and Technical Services', 'Non Profits',
       'Transportation and Warehousing', 'Wholesale Trade', 'Education',
       'Healthcare', 'Other Services', 'Consultants',
       'Administrative Services and Building Maintenance',
       'Food and Accommodation', 'Real Estate', 'Manufacturing',
       'Construction', 'Finance and Insurance',
       'Technology, Media and Telecommunications',
       'Sports, Arts, Entertainment, and Recreation',
       'Agriculture, Forestry, Fishing and Hunting',
       'Rentals and Leasing', 'Home Based Business'], dtype=object)

In [11]:
data['business_structure'].unique()

array(['Limited Liability Company', 'Corporation', 'Individual',
       'Non-Profit', 'Partnership', 'Limited Partnership', 'Other',
       'Not sure yet', 'Trust', nan], dtype=object)

In [None]:

# Let's fill state, and business_structure with the mode. We have low number (<1%) of nulls here and the impact would be minimal no matter what approach we take.
# Besides, we don't have any other information to make a better assumption, and in this case a percentage this low can be due to error in the data provided so we are not introducing too much bias in the data by imputing the mode.


data['state'].fillna(data['state'].mode()[0], inplace=True) 

# We have a category called "Not sure yet", we can imput this value
data['business_structure'].fillna('Not sure yet', inplace=True)

# Now for the industry and subindustry, since the perentage of nulls is higher (around 3.4%) we can create a new category 'Unknown' to fill the nulls, so we don't lose the information that this data was missing and we don't introduce too much bias in the data.
data['industry'].fillna('Unknown', inplace=True)
data['subindustry'].fillna('Unknown', inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['state'].fillna(data['state'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['business_structure'].fillna(data['business_structure'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because th

In [15]:
# Now let's analyze the num_employees. In this case, depending on the business_structure we can have a better idea of the number of employees. For example, if the business_structure is 'Individual' 
# it is very likely that the number of employees is 0 or 1. On the other hand, if the business_structure is 'Corporation' it is more likely that the number of employees is higher.
median_by_structure = data.groupby(['business_structure'])['num_employees'].median()


median_by_structure

business_structure
Corporation                  1.0
Individual                   0.0
Limited Liability Company    0.0
Limited Partnership          0.0
Non-Profit                   0.0
Not sure yet                 0.0
Other                        0.0
Partnership                  0.0
Trust                        0.0
Name: num_employees, dtype: float64

In [25]:
# Let's also consider the industry, since some industries are more likely to have a higher number of employees than others.
# For example, a 'Retail' business is more likely to have a higher number of employees than a 'Consulting' business, even if both are 'Corporation'.

# Calculate global median first


cols = ['num_employees', 'total_payroll', 'annual_revenue','year_established']

for col in cols:
    global_median = data[col].median()
    data[col] = data.groupby(['state','business_structure','industry'])[col].transform(
        lambda x: x.fillna(x.median() if not pd.isna(x.median()) else global_median)
    )



  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [26]:
null_stats = pd.DataFrame({

    'Nulos': data.isnull().sum(),
    '% Nulos': (data.isnull().sum() / len(data)) * 100,
    'Tipo': data.dtypes,
    'Únicos': data.nunique()
})

null_stats
# We got null values in every comlumn except account_uuid, account_value and total_quoted_products.

Unnamed: 0,Nulos,% Nulos,Tipo,Únicos
account_uuid,0,0.0,object,5709
state,0,0.0,object,57
industry,0,0.0,object,22
subindustry,0,0.0,object,483
year_established,0,0.0,float64,69
annual_revenue,0,0.0,float64,313
total_payroll,0,0.0,float64,268
business_structure,0,0.0,object,9
num_employees,0,0.0,float64,37
account_value,0,0.0,float64,2664


In [28]:
data = data.drop(columns=['total_quoted_products','account_value'])

data.to_csv(f'{BASE_DIR}/data/accounts_train_proccessed.csv', index=False)