In [None]:
from google.colab import drive
import warnings
import json
import pandas as pd
import os
import numpy as np
from itertools import product
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) raised by later cells.
warnings.filterwarnings("ignore")
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Base directory on Drive for every file this notebook reads or writes.
root = '/content/drive/MyDrive/Desys_Group/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preparation

## Naive Matrix

In [None]:
# Load the raw event table; the first CSV column holds the original row index.
naive_matrix_path = os.path.join(root, "naive_matrix.csv")
naive_matrix = pd.read_csv(naive_matrix_path, index_col=0)

# Preview the first rows, then report (rows, columns).
display(naive_matrix.head())
display(naive_matrix.shape)

Unnamed: 0,asset_name,num_sales,contract_type,nft_version,safelist,collection_loyalty,collection_slug,from_addr,to_addr,asset_loyality,event_type,event_id,payment_type,price_decimal,eth_price,usd_price,tot_price,absolute_price,image_url,count
363104,Chum Chums #993,1.0,CHUMCHUMS,3.0,approved,600.0,chumchumsnft,0xc58a54ac5e910c818ccf40ccbbde0c6e5e1da27f,0x0000000035634b55f3d99b071b5a354f48e10bef,600.0,successful,4928390110,Ether,18.0,1.0,3019.07,8e+16,241.5266,https://lh3.googleusercontent.com/IlCMJrHDR_oC...,1
363085,Froyo Kittens #1579,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999853,Ether,18.0,1.0,3019.07,2.9e+17,875.5313,https://lh3.googleusercontent.com/SsA4B7yPZUt2...,1
363086,Froyo Kittens #2313,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999727,Ether,18.0,1.0,3019.07,2.842e+17,858.020694,https://lh3.googleusercontent.com/uV0kmKNIfieD...,1
363087,Froyo Kittens #7474,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999578,Ether,18.0,1.0,3019.07,2.9e+17,875.5313,https://lh3.googleusercontent.com/EJSrHw2ui1NM...,1
363088,Froyo Kittens #7722,2.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999435,Ether,18.0,1.0,3019.07,2.846e+17,859.228322,https://lh3.googleusercontent.com/HuYJiPisbdju...,1


(356427, 20)

## Asset Matrix

In [None]:
# Build one feature row per asset by aggregating every event recorded for it.
#
# Named aggregation states each output column explicitly as
# `output_name=(source_column, aggregator)`, so the result no longer depends on
# flattening a MultiIndex or on renaming pandas' auto-generated "<lambda>" labels.
# String aggregators ('median', 'sum', 'nunique', 'last') use pandas' built-in
# group-wise implementations instead of numpy callables.
aggregation_functions = {
    'collection_slug_nunique': ('collection_slug', 'nunique'),
    'num_sales_median': ('num_sales', 'median'),
    'num_sales_sum': ('num_sales', 'sum'),
    'contract_type_last': ('contract_type', 'last'),
    'nft_version_last': ('nft_version', 'last'),
    # Fraction of the asset's rows whose safelist value is anything other than 'not_requested'.
    'safelist_rate': ('safelist', lambda values: 1 - (values == 'not_requested').mean()),
    'collection_loyalty_median': ('collection_loyalty', 'median'),
    'collection_loyalty_sum': ('collection_loyalty', 'sum'),
    # Fraction of the asset's rows whose event_type is 'successful'.
    'successful_rate': ('event_type', lambda values: (values == 'successful').mean()),
    'payment_type_last': ('payment_type', 'last'),
    'absolute_price_median': ('absolute_price', 'median'),
    'absolute_price_sum': ('absolute_price', 'sum'),
}

# Aggregated frame indexed by asset_name; asset_df exposes asset_name as a regular column.
asset_groupby = naive_matrix.groupby('asset_name').agg(**aggregation_functions)
asset_df = asset_groupby.reset_index()
display(asset_df.head())
display(asset_df.shape)

Unnamed: 0,asset_name,collection_slug_nunique,num_sales_median,num_sales_sum,contract_type_last,nft_version_last,safelist_rate,collection_loyalty_median,collection_loyalty_sum,successful_rate,payment_type_last,absolute_price_median,absolute_price_sum
0,"""ASTRIRM"" (CryptoSkull #2317) #2/10",1,1.0,1.0,B1UE,3.0,0.0,1000.0,1000.0,1.0,Ether,91.4353,91.4353
1,"""POTATO"" COMPLETE #15",1,3.0,3.0,JNK,3.0,1.0,500.0,500.0,1.0,Ether,983.8282,983.8282
2,"""POTATO"" COMPLETE #67",1,2.0,2.0,JNK,3.0,1.0,500.0,500.0,1.0,Ether,975.10744,975.10744
3,"""POTATO"" DMND #5",1,2.0,2.0,JNK,3.0,1.0,500.0,500.0,1.0,Ether,3352.592,3352.592
4,"""SAVIOR"" (CryptoSkull #9817) #1/5",1,2.0,2.0,B1UE,3.0,0.0,1000.0,1000.0,1.0,Ether,106.67435,106.67435


(250915, 13)

## Asset-based Design Matrix

In [None]:
# Candidate pairs: every distinct buyer address crossed with every distinct asset.
# np.repeat / np.tile produce the same row order as itertools.product
# (to_addr varies slowest) without materialising a Python list of tuples.
all_to_addr = naive_matrix['to_addr'].unique()
all_asset_name = asset_df['asset_name'].unique()
design_matrix = pd.DataFrame({
    'to_addr': np.repeat(all_to_addr, len(all_asset_name)),
    'asset_name': np.tile(all_asset_name, len(all_to_addr)),
})
design_matrix = design_matrix.merge(asset_df, on=['asset_name'], how='left')

# Positive labels: one row per (to_addr, asset_name) pair that occurs in the event table.
# Built as a fresh frame so nothing is assigned into a slice of naive_matrix.
# dropna() keeps the pairs identical to a groupby on both keys, which skips missing keys.
label_matrix = (
    naive_matrix[['to_addr', 'asset_name']]
    .dropna()
    .drop_duplicates()
    .assign(label=1)
)
design_matrix = design_matrix.merge(label_matrix, on=['to_addr', 'asset_name'], how='left')
design_matrix['label'] = design_matrix['label'].fillna(0)

# convert feature type
ss = StandardScaler()
numerical_cols = ["num_sales_median", "num_sales_sum", "collection_slug_nunique", "safelist_rate", "collection_loyalty_median", \
                  "collection_loyalty_sum", "successful_rate", "absolute_price_median", "absolute_price_sum"]
# Every non-numerical column is cast to str. This includes 'label', which therefore
# holds the strings '0.0' / '1.0' from here on (the filters below rely on that).
for k in design_matrix.keys():
    if k not in numerical_cols:
        design_matrix[k] = design_matrix[k].astype(str)

# normalize
# The scaling is written back into design_matrix itself, so no second copy of the
# full candidate table is created.
# NOTE(review): the scaler is fitted on all candidate rows, before any train/test split.
design_matrix[numerical_cols] = ss.fit_transform(design_matrix[numerical_cols])

# sample zero labels
ZERO_THRES = 10        # negatives kept per positive row
SAMPLE_SIZE = 100000   # upper bound on rows kept for modelling
SAMPLE_SEED = 1234
is_negative = design_matrix['label'] == '0.0'
label_1 = design_matrix[~is_negative]
label_0 = design_matrix[is_negative]
# min() guards against asking .sample() for more rows than exist (it would raise).
label_0 = label_0.sample(min(len(label_0), len(label_1) * ZERO_THRES), random_state=SAMPLE_SEED)
design_matrix_normalized = pd.concat([label_1, label_0])
design_matrix_normalized = design_matrix_normalized.sample(
    min(len(design_matrix_normalized), SAMPLE_SIZE), random_state=SAMPLE_SEED
)

display(design_matrix_normalized['label'].value_counts())
display(design_matrix_normalized.dtypes)
display(design_matrix_normalized.head())
display(design_matrix_normalized.shape)

0.0    90922
1.0     9078
Name: label, dtype: int64

to_addr                       object
asset_name                    object
collection_slug_nunique      float64
num_sales_median             float64
num_sales_sum                float64
contract_type_last            object
nft_version_last              object
safelist_rate                float64
collection_loyalty_median    float64
collection_loyalty_sum       float64
successful_rate              float64
payment_type_last             object
absolute_price_median        float64
absolute_price_sum           float64
label                         object
dtype: object

Unnamed: 0,to_addr,asset_name,collection_slug_nunique,num_sales_median,num_sales_sum,contract_type_last,nft_version_last,safelist_rate,collection_loyalty_median,collection_loyalty_sum,successful_rate,payment_type_last,absolute_price_median,absolute_price_sum,label
3659376,0x603d022611bfe6a101dcdab207d96c527f1d4d8e,MetaPirate #1409,-0.120201,0.165121,-0.029699,MP,3.0,0.681748,1.562743,0.005417,0.10234,Ether,-0.251913,-0.10711,0.0
1116911,0x0a267cf51ef038fc00e71801f5a524aec06e4f07,KENKYO: #110,-0.120201,-0.718454,-0.074197,KENKYO,3.0,-1.479222,0.14292,-0.02264,0.10234,Ether,0.180642,0.011337,0.0
2600719,0x6b58007b960016b2f559dbfd809ac4dcb1febdfe,Gen1 #14143,-0.120201,-0.718454,-0.074197,SQ,3.0,0.681748,0.379557,-0.017964,0.10234,Ether,-0.239575,-0.103731,0.0
813087,0x2af4b707e1dce8fc345f38cfeeaa2421e54976d5,Dealer 2559,-0.120201,-0.718454,-0.074197,DLR,3.0,-1.479222,0.379557,-0.017964,0.10234,Ether,-0.230964,-0.101373,0.0
351980,0x7f268357a8c2552623316e2562d90e642bb538e5,HAPE Community Badge,-0.120201,-0.718454,0.904764,HAPEBADGE,3.0,-1.479222,-0.803629,0.893904,0.10234,Ether,-0.252723,-0.096221,1.0


(100000, 15)

# Train Test Split

## CSV

In [None]:
# Hold out 10% of the rows for test, then split 0.111111 of the remainder off as validation.
# A fixed random_state makes the partition reproducible across kernel restarts.
train, test = train_test_split(design_matrix_normalized, test_size=0.1, random_state=1234)
train, valid = train_test_split(train, test_size=0.111111, random_state=1234)

def write_df_to_csv(df, file_path, base_dir=None):
  """Write `df` to a CSV file and print the file name with the frame's shape.

  Args:
    df: DataFrame to serialise.
    file_path: File name (or relative path) of the CSV to create.
    base_dir: Directory the file is written under. Defaults to the
      notebook-level `root` when omitted.
  """
  if base_dir is None:
    base_dir = root
  print(file_path, df.shape)
  # Write the frame that was passed in, not the global `train`.
  df.to_csv(os.path.join(base_dir, file_path))

# Persist each split under its own file name, in train / valid / test order.
csv_splits = (
  (train, 'train_asset.csv'),
  (valid, 'valid_asset.csv'),
  (test, 'test_asset.csv'),
)
for split_frame, split_file in csv_splits:
  write_df_to_csv(split_frame, split_file)

train_asset.csv (80000, 15)
valid_asset.csv (10000, 15)
test_asset.csv (10000, 15)


## FFM

In [None]:
!pip install -q recommenders
from recommenders.datasets.pandas_df_utils import LibffmConverter

In [None]:
# Convert a copy of the sampled design matrix with LibffmConverter, using 'label' as the rating column.
converter = LibffmConverter()
df_out = converter.fit_transform(design_matrix_normalized.copy(), col_rating='label')

# Record the converter's field and feature counts next to the data files.
meta_dict = dict(field_count=converter.field_count, feature_count=converter.feature_count)
meta_path = os.path.join(root, 'meta_asset.json')
with open(meta_path, 'w') as meta_file:
  json.dump(meta_dict, meta_file)

print(f"field_count={meta_dict['field_count']} feature_count={meta_dict['feature_count']}")

field_count=14 feature_count=85806


In [None]:
# Hold out 10% of the converted rows for test, then split 0.111111 of the remainder off as validation.
# A fixed random_state makes the partition reproducible across kernel restarts.
train_ffm, test_ffm = train_test_split(df_out, test_size=0.1, random_state=1234)
train_ffm, valid_ffm = train_test_split(train_ffm, test_size=0.111111, random_state=1234)

def write_df_to_ffm(df, file_path):
  """Dump the values of `df` as space-separated text under `root` and print the file name and shape."""
  destination = os.path.join(root, file_path)
  rows = df.values
  print(file_path, df.shape)
  # fmt="%s" writes each cell through its string form.
  np.savetxt(destination, rows, fmt="%s", delimiter=" ")

# Persist each converted split under its own file name, in train / valid / test order.
ffm_splits = (
  (train_ffm, "train_asset.ffm"),
  (valid_ffm, "valid_asset.ffm"),
  (test_ffm, "test_asset.ffm"),
)
for split_frame, split_file in ffm_splits:
  write_df_to_ffm(split_frame, split_file)

train_asset.ffm (80000, 15)
valid_asset.ffm (10000, 15)
test_asset.ffm (10000, 15)
