In [0]:
import pandas as pd 
import numpy as np
from tqdm import tqdm_notebook
import gc
import pickle
from catboost import CatBoostClassifier

## Features from categorical variables

In [0]:
# Identifier-like columns are forced to str so pandas keeps them as text.
cat_cols = dict.fromkeys(['main_okved', 'region_code', 'id'], str)
# The first column of vertices.csv is consumed as the row index.
vertices = pd.read_csv('vertices.csv', dtype=cat_cols, index_col=0)
edges = pd.read_csv('edges.csv', dtype=dict(id_1=str, id_2=str))
ids = pd.read_csv('ids.csv', dtype=dict(id=str))

In [0]:
def prepare_okved(x):
    """Pad an OKVED code so it has a non-empty piece after the first dot.

    A code without any dot gets '.-1' appended; a code whose second
    dot-separated piece is empty gets '-1' appended; any other code is
    returned unchanged.
    """
    pieces = x.split('.')
    if len(pieces) == 1:
        suffix = '.-1'
    elif pieces[1] == '':
        suffix = '-1'
    else:
        suffix = ''
    return x + suffix

# Normalise every OKVED code so later splits on '.' always find a second piece.
vertices['main_okved'] = vertices['main_okved'].map(prepare_okved)

In [0]:
# Piece of the OKVED code that precedes the first dot.
vertices['okved_first'] = vertices['main_okved'].str.split('.').str[0]

In [0]:
# First piece of the code plus the first character of the second piece.
vertices['okved_second'] = vertices['main_okved'].map(
    lambda code: '.'.join([code.split('.')[0], code.split('.')[1][0]])
)

In [0]:
# Cross every OKVED granularity with region_code and company_type, pairwise
# and as a three-way combination.
cols = ['region_code', 'company_type']
bases = ['main_okved', 'okved_first', 'okved_second']
for base in bases:
    # The three-way string does not depend on the inner loop variable, so it
    # is built once per base.
    combined = vertices[base] + '_' + vertices[cols[0]] + '_' + vertices[cols[1]]
    for col in cols:
        vertices['_'.join([base, col])] = vertices[base] + '_' + vertices[col]
        # Assigned on every inner pass, as before, so the resulting column
        # order of `vertices` is unchanged.
        vertices['_'.join([cols[0], cols[1], base])] = combined

In [0]:
# Pairwise cross of the two non-OKVED categorical columns.
vertices['_'.join(['region_code', 'company_type'])] = vertices[cols[0]] + '_' + vertices[cols[1]]

In [0]:
# Persist the vertices with the derived categorical columns. The frame's index
# is written too (index=False is not passed), so it comes back as an extra
# leading column unless the reader uses index_col=0.
vertices.to_csv('new_vertices.csv')

## Combining the SVD reconstructions

In [0]:
# Load the saved (transformed, components) factor pairs of the three SVD runs.
transformed450 = np.load('transformed450R20_2.npy')
components450 = np.load('components450R20_2.npy')
transformed400 = np.load('transformed400.npy')
# Fix: this file used to be assigned to `transformed400`, which overwrote the
# array loaded on the previous line and left `components400` undefined.
components400 = np.load('components400.npy')
transformed475 = np.load('transformed475R25.npy')
components475 = np.load('components475R25.npy')

In [0]:
def transform_svd_matr(transformed, components, node_ids=None):
    """Rebuild the rows of ``transformed @ components`` for the requested ids.

    Parameters
    ----------
    transformed : ndarray, shape (n_rows, k)
        Left factor; the row used for an id is ``int(id) - 1``.
    components : ndarray, shape (k, n_cols)
        Right factor.
    node_ids : iterable, optional
        Ids (anything ``int()`` accepts) whose rows are rebuilt, in order.
        Defaults to the notebook-level ``ids.id``.

    Returns
    -------
    ndarray of float64, shape (len(node_ids), n_cols)
        Row ``j`` equals ``transformed[int(node_ids[j]) - 1] @ components``.

    Notes
    -----
    The output used to be pre-allocated as ``(100, len(vertices))``. It is now
    sized from the inputs, so the function no longer requires exactly 100 ids
    and no longer reads the global ``vertices``. It also no longer raises when
    ``components`` has a different number of columns than ``vertices`` has
    rows.
    """
    if node_ids is None:
        node_ids = ids.id
    # Ids are 1-based labels; matrix rows are 0-based.
    row_idx = np.fromiter((int(node_id) - 1 for node_id in node_ids), dtype=np.intp)
    return np.asarray(transformed[row_idx] @ components, dtype=np.float64)

In [0]:
# Rebuild the rows of each factorisation for the ids of interest.
svd450, svd475, svd400 = [
    transform_svd_matr(scores, basis)
    for scores, basis in (
        (transformed450, components450),
        (transformed475, components475),
        (transformed400, components400),
    )
]

In [0]:
# Element-wise sum of the three reconstructions, saved for later reuse.
svd_sum = np.add(np.add(svd400, svd450), svd475)
np.save('svd_sum.npy', svd_sum)

## Mean encoding of categorical features

In [0]:
# Average value per transaction on every edge.
edges['value_on_n_trans'] = edges['value'] / edges['n_transactions']
# Stack the edge list on top of a copy with its endpoints swapped, so every
# edge is listed once per direction.
edges_concat = pd.concat([
    edges,
    edges[['id_2', 'id_1', 'value', 'n_transactions', 'value_on_n_trans']]
    .rename(columns={'id_2': 'id_1', 'id_1': 'id_2'}),
])

In [0]:
# Every object-dtype column except `id` is treated as a categorical feature.
# The builtin `object` replaces the `np.object` alias, which newer NumPy
# releases no longer provide.
vertex_attrs = vertices.set_index('id')
cat_features = vertex_attrs.columns[vertex_attrs.dtypes == object]
# One row per (vertex, incident edge): vertex attributes next to the edge stats.
merged_data = vertices.rename({'id': 'id_1'}, axis=1).merge(edges_concat, on='id_1')
# Global averages, used as the smoothing prior and as the fill value below.
global_means = merged_data[['value', 'n_transactions', 'value_on_n_trans']].mean()

In [0]:
# Smoothed mean encoding of every categorical feature against the three edge
# statistics. For a category with `count` rows and mean `m`, the encoded value
# is (m * count + global_mean * alpha) / (count + alpha), where
# alpha = nrows / (10 * number of distinct categories).
target_cols = ['value', 'n_transactions', 'value_on_n_trans']
nrows = len(merged_data)
for feature in cat_features:
    # Fix: `continue` used to sit at the same indent as its `if`, which is an
    # IndentationError.
    if feature == 'okved_info':
        continue
    per_category = merged_data[[feature] + target_cols].groupby(feature)
    # The non-null count of `value` is the weight for all three targets.
    count = per_category['value'].count().values
    encoded = per_category.agg('mean')
    alpha = nrows / (10 * merged_data[feature].nunique())
    for col in target_cols:
        encoded[col] = (encoded[col].values * count + global_means[col] * alpha) / (count + alpha)

    encoded = encoded.rename(lambda x: feature + '_' + x + '_mean', axis=1).reset_index()
    vertices = vertices.merge(encoded, on=feature, how='left')
    # Vertices whose category never appears in merged_data get the global mean.
    for col in target_cols:
        encoded_name = feature + '_' + col + '_mean'
        vertices[encoded_name] = vertices[encoded_name].fillna(global_means[col])


In [0]:
# Persist the vertices with the mean-encoded columns. The index is written as
# well, so the file gains an unnamed leading column.
vertices.to_csv('me_vertices.csv')

## Adding features from Node2Vec embeddings

In [0]:
# Load the precomputed node embeddings; the object is used below as a mapping
# looked up by vertex id.
# NOTE(review): pickle.load executes arbitrary code from the file, so only
# open embs.pickle if it comes from a trusted source.
with open('embs.pickle', 'rb') as f:
    node_dict = pickle.load(f)

In [0]:
# One embedding row per row of `vertices`, in the same order; rows stay NaN
# for ids that have no embedding in node_dict.
# Rows are filled by position rather than by int(id) - 1, because the matrix
# is later attached to `vertices` column-wise. The two only agree when the
# ids are exactly 1..N in row order, and int(id) - 1 raises or wraps around
# for any id outside that range.
vertices_nodes = np.full((len(vertices), 128), np.nan)
for row, node_id in enumerate(vertices.id):
    if node_id in node_dict:
        vertices_nodes[row] = node_dict[node_id]

In [0]:
# pd.concat only accepts pandas objects, so the raw ndarray is wrapped in a
# DataFrame first. The `node_f_` prefix is the one the training section uses
# to select these columns.
# NOTE(review): row k of vertices_nodes is attached to row k of `vertices`;
# confirm the matrix was filled in that order.
vertices = pd.concat(
    [
        vertices,
        pd.DataFrame(
            vertices_nodes,
            index=vertices.index,
            columns=['node_f_' + str(k) for k in range(vertices_nodes.shape[1])],
        ),
    ],
    axis=1,
)

In [0]:
# Persist the vertices with the embedding columns attached (index included).
vertices.to_csv('vertices_nodes.csv')

## Training 
## Grouping variables (light)

In [0]:
# Per-vertex edge statistics: neighbour count plus mean and sum of each edge
# measure. Both edge directions are present in edges_concat.
grouped = edges_concat.groupby('id_1').agg(dict(
    id_2='count',
    value=['mean', 'sum'],
    n_transactions=['mean', 'sum'],
    value_on_n_trans=['mean', 'sum'],
))

In [0]:
# Flatten the two-level aggregate columns into short `g_`-prefixed names.
grouped.columns = ['g_count'] + [
    'g_{}_{}'.format(short, stat)
    for short in ('val', 'ntr', 'vot')
    for stat in ('mean', 'sum')
]

In [0]:
# Call the group key `id` so the frame can be merged onto `vertices`.
grouped = grouped.rename_axis('id')

## Stacking

In [0]:
# Left join: vertices without any edge keep NaN in the g_* columns.
vertices = pd.merge(vertices, grouped, on='id', how='left')

In [0]:
# Move `id` into the index so the column scans that follow see only features.
vertices = vertices.set_index('id')

In [0]:
# Split the feature columns into four families, one per first-level model.
# The builtin `object` replaces the `np.object` alias, which newer NumPy
# releases no longer provide.
cat_features = vertices.columns[vertices.dtypes == object]
# Embedding columns are recognised by their `node_f_` prefix.
node_features = [col for col in vertices if col.startswith('node_f_')]
# Mean-encoded columns end in `_mean`; the g_*_mean aggregates are excluded.
mn_enc_features = [col for col in vertices if col.endswith('_mean') and not col.startswith('g_')]
g_features = [col for col in vertices if col.startswith('g_')]

In [0]:
# Turn `id` back into a regular column.
vertices = vertices.reset_index()

In [0]:
def create_probas(vertices, cat_features=None):
    """Fit one CatBoost classifier per query id and return its in-sample scores.

    For every id in the notebook-level ``ids.id``, each row of ``vertices`` is
    labelled 1 when the notebook-level ``edges`` frame holds an edge between
    that row's id and the query id (either direction) and 0 otherwise. A
    classifier is fitted on all rows and scored on those same rows.

    Parameters
    ----------
    vertices : DataFrame
        Must contain an ``id`` column; every other column is a feature.
        Missing values are replaced by 0 before fitting.
    cat_features : optional
        Forwarded as ``CatBoostClassifier(cat_features=...)`` when given.

    Returns
    -------
    ndarray, shape (len(ids.id), len(vertices))
        Row ``j`` holds the class-1 probabilities for the ``j``-th query id.

    Notes
    -----
    ``DataFrame.append`` is replaced by ``pd.concat`` because newer pandas
    releases dropped it. The row count comes from ``ids`` instead of the
    literal 100.
    """
    probas = np.zeros((len(ids.id), len(vertices)))
    features = vertices.set_index('id')
    for j, i in enumerate(tqdm_notebook(ids.id)):
        outgoing = edges[edges['id_1'] == i]
        incoming = edges[edges['id_2'] == i]

        # Put the neighbour of `i` into column id_1 for both directions.
        df = pd.concat([
            outgoing[['id_2', 'id_1']].rename(columns={'id_1': 'id_2', 'id_2': 'id_1'}),
            incoming[['id_1', 'id_2']],
        ])
        df['target'] = 1

        df = features.join(df.set_index('id_1')['target']).fillna(0)
        X = df.drop(['target'], axis=1)
        y = df['target']

        # Class 1 is weighted by the negative-to-positive ratio.
        params = dict(iterations=100, verbose=False, task_type='GPU',
                      class_weights=[1, (y == 0).sum() / (y == 1).sum()])
        if cat_features is not None:
            params['cat_features'] = cat_features
        model = CatBoostClassifier(**params)

        model.fit(X, y)
        probas[j] = model.predict_proba(X)[:, 1]
    return probas

In [0]:
# One first-level model per feature family; each call receives `id` plus the
# columns of that family.
grouped_probas = create_probas(vertices[['id'] + list(g_features)])
cat_probas = create_probas(vertices[['id'] + list(cat_features)], cat_features)
node_probas = create_probas(vertices[['id'] + list(node_features)])
mn_enc_probas = create_probas(vertices[['id'] + list(mn_enc_features)])

In [0]:
# Second-level model: for every query id, fit a classifier on the four
# first-level probability vectors and keep the 800 highest-scoring vertices
# that are not already linked to it.
# `DataFrame.append` is replaced by `pd.concat` (newer pandas dropped it), and
# the per-id frames are concatenated once after the loop instead of growing
# `result` on every iteration.
stacking_chunks = []
vertices_by_id = vertices.set_index('id')
for j, i in enumerate(tqdm_notebook(ids.id)):

    df1 = edges[edges['id_1'] == i]
    df2 = edges[edges['id_2'] == i]

    # Put the neighbour of `i` into column id_1 for both directions.
    df = pd.concat([
        df1[['id_2', 'id_1']].rename(columns={'id_1': 'id_2', 'id_2': 'id_1'}),
        df2[['id_1', 'id_2']],
    ])
    df['target'] = 1

    df = vertices_by_id.join(df.set_index('id_1')['target']).fillna(0)
    X = pd.DataFrame(index=df.index)

    X['cat_probas'] = cat_probas[j]
    X['node_probas'] = node_probas[j]
    X['mn_enc_probas'] = mn_enc_probas[j]
    X['grouped_probas'] = grouped_probas[j]

    y = df['target']
    # Class 1 is weighted by the negative-to-positive ratio.
    model = CatBoostClassifier(iterations=50, verbose=False, task_type='GPU',
                               class_weights=[1, (y == 0).sum() / (y == 1).sum()],
                               )

    model.fit(X, y)

    preds = model.predict_proba(X)[:, 1]
    tmp = pd.DataFrame({
        'id_1': [i] * len(preds),
        'id_2': vertices.id,
        'proba': preds
    })

    # Drop known neighbours, then keep the best 800 candidates.
    tmp = tmp[df.reset_index().target != 1].sort_values(by='proba', ascending=False)[:800]
    stacking_chunks.append(tmp)

    gc.collect()

result = (pd.concat(stacking_chunks) if stacking_chunks
          else pd.DataFrame(columns=['id_1', 'id_2', 'proba']))

In [0]:
# Key of every known edge, in both directions.
edges_concat['ss'] = edges_concat.id_1 + '_' + edges_concat.id_2

# Order-independent pair key: the smaller id (string comparison) goes first.
idx = result.id_1 < result.id_2
result['ss'] = np.where(idx,
                        result.id_1 + '_' + result.id_2,
                        result.id_2 + '_' + result.id_1)
# Keep one row per unordered pair, then drop pairs that are existing edges.
result = result.drop_duplicates(subset='ss')
idx = result['ss'].isin(edges_concat.ss)
result = result[~idx]

In [0]:
# Persist the stacking candidates (index included).
result.to_csv('result_stacking.csv')

## Getting edges from SVD

In [0]:
# For every query id, rank all vertices by the summed SVD reconstruction and
# keep the 2300 highest scores. The per-id frames are concatenated once after
# the loop instead of growing `result` on every iteration.
svd_chunks = []
for j, i in enumerate(tqdm_notebook(ids.id)):
    preds = svd_sum[j]

    tmp = pd.DataFrame({
        'id_1': [i] * len(preds),
        'id_2': vertices.id,
        'proba': preds
    })

    svd_chunks.append(tmp.sort_values(by='proba', ascending=False)[:2300])
    gc.collect()

result = (pd.concat(svd_chunks) if svd_chunks
          else pd.DataFrame(columns=['id_1', 'id_2', 'proba']))

In [0]:
# Order-independent pair key: the smaller id (string comparison) goes first.
idx = result.id_1 < result.id_2
result['ss'] = np.where(idx,
                        result.id_1 + '_' + result.id_2,
                        result.id_2 + '_' + result.id_1)
# Keep one row per unordered pair, then drop pairs that are existing edges.
result = result.drop_duplicates(subset='ss')
idx = result['ss'].isin(edges_concat.ss)
result = result[~idx]

In [0]:
# Persist the SVD candidates (index included).
result.to_csv('result_svd.csv')

## Getting edges for nodes with 0 connections

In [0]:
# Reload the mean-encoded vertices, keeping the code-like columns as strings.
cat_cols = dict.fromkeys(
    ['main_okved', 'region_code', 'okved_first', 'okved_second',
     'main_okved_region_code', 'okved_first_region_code',
     'okved_second_region_code', 'id'],
    str,
)
# me_vertices.csv was written with its index, so the first column is that
# saved index. Reading it with index_col=0 (as vertices.csv is read at the
# top) keeps it from reaching the model below as an `Unnamed: 0` feature.
vertices = pd.read_csv('me_vertices.csv', index_col=0, dtype=cat_cols)

In [0]:
# Ids that never appear as an endpoint in the bidirectional edge list.
lone_ids = vertices.loc[~vertices['id'].isin(edges_concat['id_2']), 'id']

In [0]:
# Every object-dtype column except `id` is a categorical feature. The builtin
# `object` replaces the `np.object` alias, which newer NumPy releases no
# longer provide.
cat_features = vertices.set_index('id').columns[vertices.set_index('id').dtypes == object]

In [0]:
# For every query id, fit a classifier on all vertex features and keep the
# 3000 highest-scoring candidates among the vertices that have no edges.
# `DataFrame.append` is replaced by `pd.concat` (newer pandas dropped it), the
# unused counter `j` is removed, and the per-id frames are concatenated once
# after the loop.
lone_chunks = []
vertices_by_id = vertices.set_index('id')

for i in tqdm_notebook(ids.id):

    df1 = edges[edges['id_1'] == i]
    df2 = edges[edges['id_2'] == i]

    # Put the neighbour of `i` into column id_1 for both directions.
    df = pd.concat([
        df1[['id_2', 'id_1']].rename(columns={'id_1': 'id_2', 'id_2': 'id_1'}),
        df2[['id_1', 'id_2']],
    ])
    df['target'] = 1

    df = vertices_by_id.join(df.set_index('id_1')['target']).fillna(0)
    X = df.drop(['target'], axis=1)
    y = df['target']
    # Class 1 is weighted by the negative-to-positive ratio.
    model = CatBoostClassifier(iterations=100, verbose=False, task_type='GPU',
                               class_weights=[1, (y == 0).sum() / (y == 1).sum()],
                               cat_features=cat_features)

    model.fit(X, y)
    preds = model.predict_proba(X)[:, 1]

    tmp = pd.DataFrame({
        'id_1': [i] * len(preds),
        'id_2': vertices.id,
        'proba': preds
    })
    # Only vertices without any edge are candidates here.
    tmp = tmp[tmp.id_2.isin(lone_ids)]
    lone_chunks.append(tmp.sort_values(by='proba', ascending=False)[:3000])

result = (pd.concat(lone_chunks) if lone_chunks
          else pd.DataFrame(columns=['id_1', 'id_2', 'proba']))

In [0]:
# Order-independent pair key: the smaller id (string comparison) goes first.
idx = result.id_1 < result.id_2
result['ss'] = np.where(idx,
                        result.id_1 + '_' + result.id_2,
                        result.id_2 + '_' + result.id_1)
# Keep one row per unordered pair, then drop pairs that are existing edges.
result = result.drop_duplicates(subset='ss')
idx = result['ss'].isin(edges_concat.ss)
result = result[~idx]

In [0]:
# Persist the candidates for edge-less vertices (index included).
result.to_csv('result_zeros.csv')

## Grouping features (hard)

In [0]:
# Reload the vertices saved before mean encoding, keeping the code-like
# columns as strings.
cat_cols = dict.fromkeys(
    ['main_okved', 'region_code', 'okved_first', 'okved_second',
     'main_okved_region_code', 'okved_first_region_code',
     'okved_second_region_code', 'id'],
    str,
)
vertices = pd.read_csv('new_vertices.csv', dtype=cat_cols)

In [0]:
# Richer per-vertex edge statistics: neighbour count plus sum, min, max,
# count and variance of each edge measure.
grouped_features = edges_concat.groupby('id_1').agg(dict(
    id_2='count',
    value=['sum', 'min', 'max', 'count', 'var'],
    n_transactions=['sum', 'min', 'max', 'count', 'var'],
    value_on_n_trans=['sum', 'min', 'max', 'count', 'var'],
))

In [0]:
# Flatten the (measure, statistic) column pairs into `g_<measure>_<statistic>`.
new_columns = ['g_' + col[0] + '_' + col[1] for col in grouped_features.columns]
grouped_features.columns = new_columns


In [0]:
# Append squared, square-root and natural-log versions of every aggregate.
grouped_features = pd.concat(
    [
        grouped_features,
        (grouped_features ** 2).add_suffix('_sqr'),
        (grouped_features ** 0.5).add_suffix('_sqrt'),
        np.log(grouped_features).add_suffix('_ln'),
    ],
    axis=1,
)

In [0]:
# Every object-dtype column except `id` is a categorical feature. The builtin
# `object` replaces the `np.object` alias, which newer NumPy releases no
# longer provide.
cat_features = vertices.set_index('id').columns[vertices.set_index('id').dtypes == object]

In [0]:
# Undefined aggregates get the sentinel -10000; the group key is then exposed
# as an `id` column.
grouped_features = (
    grouped_features
    .fillna(-10000)
    .reset_index()
    .rename(columns={'id_1': 'id'})
)
# Keep only the id and the categorical columns of the vertices; vertices
# without edges get 0 in every aggregate column.
vertices = (
    vertices[['id'] + list(cat_features)]
    .merge(grouped_features, on='id', how='left')
    .fillna(0)
)

In [0]:
# For every query id, fit a classifier on the categorical and grouped
# features and keep the 3000 highest-scoring vertices not already linked to it.
# `DataFrame.append` is replaced by `pd.concat` (newer pandas dropped it), and
# the per-id frames are concatenated once after the loop.
grouped_chunks = []
vertices_by_id = vertices.set_index('id')

for i in tqdm_notebook(ids.id):

    df1 = edges[edges['id_1'] == i]
    df2 = edges[edges['id_2'] == i]

    # Put the neighbour of `i` into column id_1 for both directions.
    df = pd.concat([
        df1[['id_2', 'id_1']].rename(columns={'id_1': 'id_2', 'id_2': 'id_1'}),
        df2[['id_1', 'id_2']],
    ])
    df['target'] = 1

    df = vertices_by_id.join(df.set_index('id_1')['target']).fillna(0)
    X = df.drop(['target'], axis=1)
    y = df['target']
    # Class 1 is weighted by the negative-to-positive ratio.
    model = CatBoostClassifier(iterations=100, verbose=False, task_type='GPU',
                               class_weights=[1, (y == 0).sum() / (y == 1).sum()],
                               cat_features=cat_features)

    model.fit(X, y)
    preds = model.predict_proba(X)[:, 1]

    tmp = pd.DataFrame({
        'id_1': [i] * len(preds),
        'id_2': vertices.id,
        'proba': preds
    })
    # Drop known neighbours, then keep the best 3000 candidates.
    tmp = tmp[df.reset_index().target != 1]
    grouped_chunks.append(tmp.sort_values(by='proba', ascending=False)[:3000])
    gc.collect()

result = (pd.concat(grouped_chunks) if grouped_chunks
          else pd.DataFrame(columns=['id_1', 'id_2', 'proba']))

In [0]:
# Order-independent pair key: the smaller id (string comparison) goes first.
idx = result.id_1 < result.id_2
result['ss'] = np.where(idx,
                        result.id_1 + '_' + result.id_2,
                        result.id_2 + '_' + result.id_1)
# Keep one row per unordered pair, then drop pairs that are existing edges.
result = result.drop_duplicates(subset='ss')
idx = result['ss'].isin(edges_concat.ss)
result = result[~idx]

In [0]:
# Persist the grouped-feature candidates (index included).
result.to_csv('grouped_result.csv')

## Blending all the models we got

In [0]:
# Reload every candidate list with the ids kept as strings.
result_svd = pd.read_csv('result_svd.csv', dtype={'id_1': str, 'id_2': str})
# The stacking section saves its output as 'result_stacking.csv'. This used
# to read 'result_cb.csv', which nothing in this notebook writes.
result_stacking = pd.read_csv('result_stacking.csv', dtype={'id_1': str, 'id_2': str})
result_zeros = pd.read_csv('result_zeros.csv', dtype={'id_1': str, 'id_2': str})
# node2vec_result.csv is not produced by this notebook.
node_result = pd.read_csv('node2vec_result.csv', dtype={'id_1': str, 'id_2': str})
grouped_result = pd.read_csv('grouped_result.csv', dtype={'id_1': str, 'id_2': str})

In [0]:
# Order-independent pair key: the smaller id (string comparison) goes first.
idx = node_result.id_1 < node_result.id_2
node_result['ss'] = np.where(idx,
                             node_result.id_1 + '_' + node_result.id_2,
                             node_result.id_2 + '_' + node_result.id_1)
# Keep one row per unordered pair, then drop pairs that are existing edges.
node_result = node_result.drop_duplicates(subset='ss')
idx = node_result['ss'].isin(edges_concat.ss)
node_result = node_result[~idx]
# Use the same score column name as the other result frames.
node_result = node_result.rename(columns={'preds': 'proba'})

In [0]:
# Keep the 150000 highest-scoring node2vec candidates.
node_result = node_result.sort_values(by='proba', ascending=False).iloc[:150000]

In [0]:
# Stack the candidate lists in priority order; duplicates across the pieces
# are removed afterwards via the `ss` pair key.
submit = pd.concat([
    # 1. Grouped-model pairs also proposed by stacking, SVD or node2vec.
    grouped_result[(grouped_result.ss.isin(result_stacking.ss)) | (grouped_result.ss.isin(result_svd.ss)) | (grouped_result.ss.isin(node_result.ss))],
    # 2. SVD pairs also proposed by node2vec or stacking, and absent from the grouped model.
    result_svd[((result_svd.ss.isin(node_result.ss)) | (result_svd.ss.isin(result_stacking.ss))) & (~result_svd.ss.isin(grouped_result.ss))],
    # 3. node2vec pairs also proposed by stacking, and absent from both SVD and the grouped model.
    node_result[(node_result.ss.isin(result_stacking.ss)) & (~node_result.ss.isin(result_svd.ss)) & (~node_result.ss.isin(grouped_result.ss))],
    # 4. The 35000 highest-scoring pairs for edge-less vertices.
    result_zeros.sort_values(by ='proba', ascending=False)[:35000],
    # 5. The first 100000 SVD pairs that are either not in node2vec/stacking or not in the grouped model.
    # NOTE(review): the mask is `~(A | B) | ~C`, so it also re-selects the rows
    # of piece 2; confirm `|` rather than `&` was intended here.
    result_svd[~((result_svd.ss.isin(node_result.ss)) | (result_svd.ss.isin(result_stacking.ss))) | (~result_svd.ss.isin(grouped_result.ss))][:100000]
])

In [0]:
# Number the stacked rows, keep the first occurrence of each pair in that
# order, and cut to the first 100000 pairs.
subm = (
    submit
    .reset_index(drop=True)
    .reset_index()
    .sort_values(by='index')
    .drop_duplicates('ss')
    .iloc[:100000]
)

In [0]:
# Only the two endpoint ids go into the submission.
subm = subm.loc[:, ['id_1', 'id_2']]

In [0]:
# Write the final submission without the index column.
subm.to_csv('submit.csv', index=False)