In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def get_weights(var, mean, curve_maximum=4, growth_rate=4, mean_importance=10):
        # This is adapted from the logistic function
        # https://en.wikipedia.org/wiki/Logistic_function
    return (curve_maximum * np.exp(mean_importance * mean)) / (1 + np.exp(-growth_rate * (1 - var)))

In [3]:
def normalize_col(df_col):
    return (df_col-df_col.min())/(df_col.max()-df_col.min())

In [4]:
def merge_portolio_ids_to_base(portfolio_ids, base):
    return portfolio_ids.merge(base, on='id', how='inner')

In [5]:
def get_leads(portfolio_ids, base, max_leads=None):
    
    portfolio = merge_portolio_ids_to_base(portfolio_ids, base)
    
    # Apply a inverse variance weight on each column before finding the similarities.
    # This way, we emphasize the features with little variance, which means our
    # client probably looks at those features to choose their next clients.
    # It's important to take the means into account. If there's some feature that is mostly
    # zero but have few occurences, it would have low variance, inflating the weight.
    weights = get_weights(normalize_col(portfolio.var()), normalize_col(portfolio.mean()))
    portfolio = portfolio.mul(weights)
    base = base.mul(weights)
    
    # 1 for equal vectors. -1 for completely different
    similarities = cosine_similarity(base, portfolio)
    
    # Free up some memory
    del portfolio
    
    # argsort with axis=None will return the numeric indexes of the flattened array in ascending order. 
    # The unravel will rebuild the indexes of the flattened array.
    ind = np.unravel_index(np.argsort(similarities, axis=None)[::-1], similarities.shape)
    
    # Since the shape of similarities is (n_rows_in_base, n_rows_in_portfolio),
    # it's sufficient to look at the first axis.
    
    # Sorting the indices by cosine_similarity, we'll check 10*n_rows in portfolio.
    # From those, group the duplicates and order by counts. That means we'll order
    # the indices by how many rows from the portfolio elected them as the best similarity.
    # The lesser leads we return, the more similar to the portfolio those leads are.
    unique, counts = np.unique(ind[0][:portfolio_ids.shape[0] * 10], return_counts=True)
    leads_indices = list()
    counts_total = 0
    for count, idx in sorted(zip(counts, unique), reverse=True):
        leads_indices.append(idx)
        counts_total += count
        if counts_total >= portfolio_ids.shape[0] * 10:
            break

    leads = base.index.values[leads_indices]
    leads = leads[np.isin(leads, portfolio_ids.index.values, invert=True)]
    
    leads = pd.DataFrame({'id': leads}).set_index('id')
    if max_leads:
        return leads[:max_leads]
    
    return leads

In [6]:
def get_leads_metrics(leads, portfolio):
    # Do a split in the original portfolio and pass the validation portion here
    
    # How many (%) leads are actually in the portfolio?
    precision = np.isin(portfolio.index.values, leads.index.values).sum() / portfolio.index.values.shape[0]
    recall = np.isin(portfolio.index.values, leads.index.values).sum() / leads.index.values.shape[0]
    f1_score = 2*precision*recall / (precision + recall)
    return precision, recall, f1_score

In [7]:
def get_portfolio_ids(filename):
    return pd.read_csv(filename, index_col='id', usecols=['id'])

In [8]:
def test_portfolio(filename):
    port = get_portfolio_ids(filename)
    port_train, port_valid = train_test_split(port, test_size=0.3)
    port_leads = get_leads(port_train, df)
    precision, recall, f1_score = get_leads_metrics(port_leads, port_valid)
    print(f'{precision*100:.2f}% dos leads estão no conjunto de teste no {filename}.')
    print(f'recall: {recall*100:.2f}% e f1-score: {f1_score*100:.2f}%.')
    print()
    return port_leads

In [9]:
df = pd.read_csv('data/estaticos_market_clean.csv', index_col='id')
df.head()

Unnamed: 0_level_0,fl_matriz,idade_empresa_anos,fl_me,fl_sa,fl_epp,fl_mei,fl_ltda,fl_st_especial,fl_rm,fl_spa,...,de_ramo-SIDERURGICA-METALURGIA,de_ramo-TELECOM,de_ramo-TEXTEIS,"de_ramo-TRANSPORTE, ARMAZENAGEM E CORREIO",setor-AGROPECUARIA,setor-COMERCIO,setor-CONSTRUÇÃO CIVIL,setor-INDUSTRIA,setor-OUTROS,setor-SERVIÇO
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a6984c3ae395090e3bee8ad63c3758b110de096d5d819583a784a113726db849,1,0.584005,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
6178f41ade1365e44bc2c46654c2c8c0eaae27dcb476c47fdef50b33f4f56f05,1,0.189909,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4a7e5069a397f12fdd7fd57111d6dc5d3ba558958efc02edc5147bc2a2535b08,1,0.445164,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
3348900fe63216a439d2e5238c79ddd46ede454df7b9d8c24ac33eb21d4b21ef,1,0.429185,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1f9bcabc9d3173c1fe769899e4fac14b053037b953a1e4b102c769f7611ab29f,1,0.304422,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# curve max = 4, growth = 4, mean_importance = 10
leads1_weighted = test_portfolio('data/estaticos_portfolio1.csv');
leads2_weighted = test_portfolio('data/estaticos_portfolio2.csv');
leads3_weighted = test_portfolio('data/estaticos_portfolio3.csv');

print('Quantidade de leads de cada portfólio:', len(leads1_weighted), len(leads2_weighted), len(leads3_weighted))

2.40% do conjunto de teste já está presente no portfólio data/estaticos_portfolio1.csv.
recall: 0.12% e f1-score: 0.22%.
63.53% do conjunto de teste já está presente no portfólio data/estaticos_portfolio2.csv.
recall: 11.27% e f1-score: 19.15%.
65.00% do conjunto de teste já está presente no portfólio data/estaticos_portfolio3.csv.
recall: 11.87% e f1-score: 20.08%.
Quantidade de leads de cada portfólio: 3403 958 438


Gerando os leads com o portfólio completo para posterior visualização.

In [12]:
filename_base = 'data/estaticos_portfolio{}.csv'
for i in range(1, 4):
    port = get_portfolio_ids(filename_base.format(i))
    leads = get_leads(port, df)
    
    port.to_csv(f'data/port_ids{i}.csv')
    leads.to_csv(f'data/leads_ids{i}.csv')

Comparando com a previsão sem aplicar pesos "personalizados":

In [13]:
def get_weights(var, mean, curve_maximum=2, growth_rate=0, mean_importance=0):
        # This is adapted from the logistic function
        # https://en.wikipedia.org/wiki/Logistic_function
    return (curve_maximum * np.exp(mean_importance * mean)) / (1 + np.exp(-growth_rate * (1-var)))

In [14]:
# curve max = 2, growth = 0, mean_importance = 0
# equivalent to no weights
leads1_notweighted = test_portfolio('data/estaticos_portfolio1.csv');
leads2_notweighted = test_portfolio('data/estaticos_portfolio2.csv');
leads3_notweighted = test_portfolio('data/estaticos_portfolio3.csv');

print('Quantidade de leads de cada portfólio:', len(leads1_notweighted), len(leads2_notweighted), len(leads3_notweighted))

2.99% do conjunto de teste já está presente no portfólio data/estaticos_portfolio1.csv.
recall: 0.15% e f1-score: 0.28%.
56.47% do conjunto de teste já está presente no portfólio data/estaticos_portfolio2.csv.
recall: 6.21% e f1-score: 11.18%.
50.00% do conjunto de teste já está presente no portfólio data/estaticos_portfolio3.csv.
recall: 8.93% e f1-score: 15.15%.
Quantidade de leads de cada portfólio: 3440 1547 448


## Visualizando os leads

Seguindo a ideia do preprocessamento, uma vez gerados os leads podemos trabalhar com eles em [outro notebook](visualization.ipynb)