In [None]:
from google.colab import drive
import warnings
import json
import pandas as pd
import os
import numpy as np
from itertools import product
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by later cells.
warnings.filterwarnings("ignore")
# Mount Google Drive at /content/drive so the data files are reachable by path.
drive.mount('/content/drive')
# Data directory on the mounted Drive; later cells read from and write to it.
root = '/content/drive/MyDrive/Desys_Group/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preparation


## Naive Matrix

In [None]:
# Read the raw table from the data directory; the first CSV column is used as the index.
naive_matrix_path = os.path.join(root, "naive_matrix.csv")
naive_matrix = pd.read_csv(naive_matrix_path, index_col=0)

# Preview the first rows, then show (rows, columns).
display(naive_matrix.head(), naive_matrix.shape)

Unnamed: 0,asset_name,num_sales,contract_type,nft_version,safelist,collection_loyalty,collection_slug,from_addr,to_addr,asset_loyality,event_type,event_id,payment_type,price_decimal,eth_price,usd_price,tot_price,absolute_price,count
363104,Chum Chums #993,1.0,CHUMCHUMS,3.0,approved,600.0,chumchumsnft,0xc58a54ac5e910c818ccf40ccbbde0c6e5e1da27f,0x0000000035634b55f3d99b071b5a354f48e10bef,600.0,successful,4928390110,Ether,18.0,1.0,3019.07,8e+16,241.5266,1
363085,Froyo Kittens #1579,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999853,Ether,18.0,1.0,3019.07,2.9e+17,875.5313,1
363086,Froyo Kittens #2313,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999727,Ether,18.0,1.0,3019.07,2.842e+17,858.020694,1
363087,Froyo Kittens #7474,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999578,Ether,18.0,1.0,3019.07,2.9e+17,875.5313,1
363088,Froyo Kittens #7722,2.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999435,Ether,18.0,1.0,3019.07,2.846e+17,859.228322,1


(356427, 19)

## Collection Matrix

In [None]:
# Summarise naive_matrix into one row of features per collection_slug.
#
# Aggregations are named by string: recent pandas deprecates passing callables
# such as np.sum / np.median to .agg(), and the string forms yield the same
# column names ('sum', 'median', 'nunique') while using the optimized
# group-by implementations.
aggregation_functions = {
    'asset_name': 'nunique',
    'num_sales': ['median', 'sum'],
    'contract_type': 'last',
    'nft_version': 'last',
    # fraction of rows whose safelist value is anything but 'not_requested'
    'safelist': lambda gb: 1 - (gb == 'not_requested').mean(),
    'collection_loyalty': ['median', 'sum'],
    # fraction of rows whose event_type is 'successful'
    'event_type': lambda gb: (gb == 'successful').mean(),
    'payment_type': 'last',
    'absolute_price': ['median', 'sum'],
}

collection_groupby = naive_matrix.groupby('collection_slug').agg(aggregation_functions)

# Flatten the two-level column index ('num_sales', 'median') -> 'num_sales_median'.
# Work on a copy so collection_groupby is not rewritten through an alias.
collection_df = collection_groupby.copy()
collection_df.columns = ['_'.join(col).strip() for col in collection_df.columns.values]
# The two lambda aggregations come out as '<lambda>'; give them meaningful names.
collection_df = collection_df.rename(columns={'safelist_<lambda>': 'safelist_rate', 'event_type_<lambda>': 'successful_rate'})
collection_df = collection_df.reset_index()
display(collection_df.head())
display(collection_df.shape)

Unnamed: 0,collection_slug,asset_name_nunique,num_sales_median,num_sales_sum,contract_type_last,nft_version_last,safelist_rate,collection_loyalty_median,collection_loyalty_sum,successful_rate,payment_type_last,absolute_price_median,absolute_price_sum
0,-gutter-clones,3,1.5,6.0,GC,3.0,0.0,500.0,2000.0,1.0,Ether,951.54637,4182.23242
1,-nefturians,4,1.0,4.0,NFTR,3.0,0.0,1000.0,4000.0,1.0,Ether,79.21293,316.090475
2,0edit,38,1.0,63.0,E0,3.0,0.0,1000.0,41000.0,1.0,Ether,152.8395,8101.311969
3,0mni-punks,1,1.0,656.0,0xpunks,3.0,0.0,1000.0,563000.0,1.0,Ether,31.791304,21159.124535
4,0n1-force,71,4.0,324.0,0N1,3.0,1.0,500.0,40000.0,0.9,Ether,1528.381,179642.34849


(4427, 13)

## Collection-based Design Matrix



In [None]:
# Candidate pairs: every distinct to_addr crossed with every collection,
# with the per-collection features attached to each pair.
all_to_addr = naive_matrix['to_addr'].unique()
all_collection_slug = collection_df['collection_slug'].unique()
design_matrix = pd.DataFrame(list(product(all_to_addr, all_collection_slug)), columns=['to_addr', 'collection_slug'])
design_matrix = design_matrix.merge(collection_df, on=['collection_slug'], how='left')

# Positive labels: (to_addr, collection_slug) pairs that occur in naive_matrix.
# Built with dropna/drop_duplicates/assign so that no column is assigned on a
# slice of naive_matrix (the old `label_matrix['label'] = 1` did exactly that).
# dropna mirrors groupby's default of discarding rows with missing keys.
label_matrix = (
    naive_matrix[['to_addr', 'collection_slug']]
    .dropna()
    .drop_duplicates()
    .assign(label=1)
)
design_matrix = design_matrix.merge(label_matrix, on=['to_addr', 'collection_slug'], how='left')
design_matrix['label'] = design_matrix['label'].fillna(0)

# Remember which rows are positives while the label is still numeric, instead
# of comparing against the string '0.0' after the cast below.
is_positive = design_matrix['label'] == 1

# convert feature type: every non-numerical column (ids, categorical fields
# and the label) is stored as a string
ss = StandardScaler()
numerical_cols = ["num_sales_median", "num_sales_sum", "asset_name_nunique", "safelist_rate", "collection_loyalty_median",
                  "collection_loyalty_sum", "successful_rate", "absolute_price_median", "absolute_price_sum"]
for k in design_matrix.keys():
    if k not in numerical_cols:
        design_matrix[k] = design_matrix[k].astype(str)

# normalize: fit the scaler on the full candidate matrix (same statistics as
# before); it is applied to the sampled rows only, further down, so
# design_matrix itself keeps the raw feature values.
ss.fit(design_matrix[numerical_cols])

# sample zero labels: keep every positive and at most ZERO_THRES negatives per
# positive. The seed makes the sample reproducible; min() avoids a ValueError
# when fewer negatives exist than requested.
ZERO_THRES = 4
SAMPLE_SEED = 42
label_1 = design_matrix[is_positive]
zero_pool = design_matrix[~is_positive]
n_zero = min(len(zero_pool), label_1.shape[0] * ZERO_THRES)
label_0 = zero_pool.sample(n_zero, random_state=SAMPLE_SEED)
design_matrix_normalized = pd.concat([label_1, label_0])
design_matrix_normalized[numerical_cols] = pd.DataFrame(
    ss.transform(design_matrix_normalized[numerical_cols]),
    columns=numerical_cols,
    index=design_matrix_normalized.index,
)

display(design_matrix_normalized['label'].value_counts())
display(design_matrix_normalized.dtypes)
display(design_matrix_normalized.head())
display(design_matrix_normalized.shape)

0.0    32520
1.0     8130
Name: label, dtype: int64

to_addr                       object
collection_slug               object
asset_name_nunique           float64
num_sales_median             float64
num_sales_sum                float64
contract_type_last            object
nft_version_last              object
safelist_rate                float64
collection_loyalty_median    float64
collection_loyalty_sum       float64
successful_rate              float64
payment_type_last             object
absolute_price_median        float64
absolute_price_sum           float64
label                         object
dtype: object

Unnamed: 0,to_addr,collection_slug,asset_name_nunique,num_sales_median,num_sales_sum,contract_type_last,nft_version_last,safelist_rate,collection_loyalty_median,collection_loyalty_sum,successful_rate,payment_type_last,absolute_price_median,absolute_price_sum,label
3,0x0000000035634b55f3d99b071b5a354f48e10bef,0mni-punks,-0.209532,-0.651607,0.634057,0xpunks,3.0,-0.956016,1.230829,1.803643,0.203673,Ether,-0.131861,-0.039607,1.0
4,0x0000000035634b55f3d99b071b5a354f48e10bef,0n1-force,0.04564,3.56163,0.214048,0N1,3.0,1.046009,-0.732748,-0.051125,-1.436326,Ether,0.054574,0.024196,1.0
7,0x0000000035634b55f3d99b071b5a354f48e10bef,0xbakc,-0.184015,-0.651607,-0.184455,0xBAKC,3.0,-0.956016,-0.732748,-0.178795,0.203673,Ether,-0.134301,-0.048081,1.0
9,0x0000000035634b55f3d99b071b5a354f48e10bef,0xblack-og,-0.111108,-0.651607,-0.160419,0xBLACK,3.0,-0.956016,0.052683,-0.123471,0.203673,Ether,-0.124441,-0.047017,1.0
18,0x0000000035634b55f3d99b071b5a354f48e10bef,0xkarafuru,-0.209532,-0.651607,-0.142707,0xKarafuru,3.0,-0.956016,-0.732748,-0.122053,0.203673,Ether,-0.135441,-0.048089,1.0


(40650, 15)

# Train Test Split

## CSV

In [None]:
# Hold out 10% of the rows for test, then 0.111111 (about 1/9) of the remaining
# 90% for validation, i.e. roughly an 80/10/10 split.
# A fixed random_state makes the split identical on every run.
train, test = train_test_split(design_matrix_normalized, test_size=0.1, random_state=42)
train, valid = train_test_split(train, test_size=0.111111, random_state=42)

def write_df_to_csv(df, file_path):
  """Write ``df`` as CSV to ``file_path`` inside the global ``root`` directory.

  The file name and the frame's shape are printed before writing.

  Args:
    df: DataFrame to save.
    file_path: File name (or relative path) under ``root``.
  """
  print(file_path, df.shape)
  # Save the frame that was passed in. The previous version always wrote the
  # global ``train`` frame, so every output file contained the training rows.
  df.to_csv(os.path.join(root, file_path))

# Save the three splits, each under its own file name.
for split_df, split_file in [
  (train, 'train_collection.csv'),
  (valid, 'valid_collection.csv'),
  (test, 'test_collection.csv'),
]:
  write_df_to_csv(split_df, split_file)

train_collection.csv (32520, 15)
valid_collection.csv (4065, 15)
test_collection.csv (4065, 15)


## FFM

In [None]:
!pip install -q recommenders
from recommenders.datasets.pandas_df_utils import LibffmConverter

In [None]:
# Convert the sampled design matrix with LibffmConverter, using 'label' as the rating column.
converter = LibffmConverter()
df_out = converter.fit_transform(design_matrix_normalized, col_rating='label')

# Store the field/feature counts reported by the converter next to the data files.
meta_dict = dict(field_count=converter.field_count, feature_count=converter.feature_count)
meta_path = os.path.join(root, 'meta_collection.json')
with open(meta_path, 'w') as meta_file:
  json.dump(meta_dict, meta_file)

print(f"field_count={converter.field_count} feature_count={converter.feature_count}")

field_count=14 feature_count=8278


In [None]:
# Hold out 10% of the converted rows for test, then 0.111111 (about 1/9) of the
# remaining 90% for validation, i.e. roughly an 80/10/10 split.
# A fixed random_state makes the split identical on every run.
train_ffm, test_ffm = train_test_split(df_out, test_size=0.1, random_state=42)
train_ffm, valid_ffm = train_test_split(train_ffm, test_size=0.111111, random_state=42)

def write_df_to_ffm(df, file_path):
  """Save ``df`` as a space-delimited text file under the global ``root`` directory.

  The file name and ``df.shape`` are printed first; every cell is written
  with the ``%s`` format, one frame row per line.
  """
  print(file_path, df.shape)
  target = os.path.join(root, file_path)
  np.savetxt(target, df.values, fmt="%s", delimiter=" ")

# Save the three converted splits, each under its own file name.
for split_ffm, split_file in [
  (train_ffm, "train_collection.ffm"),
  (valid_ffm, "valid_collection.ffm"),
  (test_ffm, "test_collection.ffm"),
]:
  write_df_to_ffm(split_ffm, split_file)

train_collection.ffm (32520, 15)
valid_collection.ffm (4065, 15)
test_collection.ffm (4065, 15)
