In [None]:
!pip install -q scrapbook
!pip install -q recommenders

In [None]:
from google.colab import drive
import warnings
import json
import pandas as pd
import os
import numpy as np
from itertools import product
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import scrapbook as sb
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages
from recommenders.utils.constants import SEED
from recommenders.models.deeprec.deeprec_utils import (
    download_deeprec_resources, prepare_hparams
)
from recommenders.models.deeprec.models.xDeepFM import XDeepFMModel
from recommenders.models.deeprec.io.iterator import FFMTextIterator
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" can also hide useful pandas warnings
# (e.g. SettingWithCopyWarning) — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Mount Google Drive (Colab only) so the data directory below is reachable.
drive.mount('/content/drive')
# Base directory for the data files read and written by the cells below.
# The trailing slash matters wherever a path is built by string concatenation.
root = '/content/drive/MyDrive/Desys_Group/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Preparation

## Naive Matrix

In [None]:
# Read naive_matrix.csv from the data directory; the first CSV column is used
# as the row index.
naive_matrix_path = os.path.join(root, "naive_matrix.csv")
naive_matrix = pd.read_csv(naive_matrix_path, index_col=0)
display(naive_matrix.head(), naive_matrix.shape)

Unnamed: 0,asset_name,num_sales,contract_type,nft_version,safelist,collection_loyalty,collection_slug,from_addr,to_addr,asset_loyality,event_type,event_id,payment_type,price_decimal,eth_price,usd_price,tot_price,absolute_price,image_url,count
363104,Chum Chums #993,1.0,CHUMCHUMS,3.0,approved,600.0,chumchumsnft,0xc58a54ac5e910c818ccf40ccbbde0c6e5e1da27f,0x0000000035634b55f3d99b071b5a354f48e10bef,600.0,successful,4928390110,Ether,18.0,1.0,3019.07,8e+16,241.5266,https://lh3.googleusercontent.com/IlCMJrHDR_oC...,1
363085,Froyo Kittens #1579,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999853,Ether,18.0,1.0,3019.07,2.9e+17,875.5313,https://lh3.googleusercontent.com/SsA4B7yPZUt2...,1
363086,Froyo Kittens #2313,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999727,Ether,18.0,1.0,3019.07,2.842e+17,858.020694,https://lh3.googleusercontent.com/uV0kmKNIfieD...,1
363087,Froyo Kittens #7474,1.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999578,Ether,18.0,1.0,3019.07,2.9e+17,875.5313,https://lh3.googleusercontent.com/EJSrHw2ui1NM...,1
363088,Froyo Kittens #7722,2.0,FroyoKitten,3.0,approved,500.0,froyokittenscollection,0x326ef9fa575a92090d8dea0b1f053afca64fb19b,0x0000000035634b55f3d99b071b5a354f48e10bef,500.0,successful,4928999435,Ether,18.0,1.0,3019.07,2.846e+17,859.228322,https://lh3.googleusercontent.com/HuYJiPisbdju...,1


(356427, 20)

In [None]:
# Lookup tables keyed by asset name, built with pandas' `last` aggregation over
# each asset's rows: one for the image URL, one for the collection slug.
_last_per_asset = naive_matrix.groupby('asset_name')[['image_url', 'collection_slug']].last()
asset_url_map = _last_per_asset['image_url'].to_dict()
asset_collection_map = _last_per_asset['collection_slug'].to_dict()

## Asset Matrix

In [None]:
# Per-asset feature table: one row per asset_name, aggregated over all of that
# asset's rows in naive_matrix.
#
# Aggregations are given by name ('median', 'sum', 'nunique') instead of as
# numpy/pandas callables: the resulting column labels are the same, while
# avoiding the callable form that newer pandas versions warn about.
aggregation_functions = {
    'collection_slug': 'nunique',
    'num_sales': ['median', 'sum'],
    'contract_type': 'last',
    'nft_version': 'last',
    # share of rows whose safelist status is anything other than 'not_requested'
    'safelist': lambda gb: 1 - (gb == 'not_requested').mean(),
    'collection_loyalty': ['median', 'sum'],
    # share of rows whose event type is 'successful'
    'event_type': lambda gb: (gb == 'successful').mean(),
    'payment_type': 'last',
    'absolute_price': ['median', 'sum'],
}

asset_groupby = naive_matrix.groupby('asset_name').agg(aggregation_functions)
# Flatten the (column, aggregation) MultiIndex into "<column>_<aggregation>" names.
asset_groupby.columns = ['_'.join(col).strip() for col in asset_groupby.columns.values]

asset_df = (
    asset_groupby
    .rename(columns={'safelist_<lambda>': 'safelist_rate', 'event_type_<lambda>': 'successful_rate'})
    .reset_index()
)
display(asset_df.head())
display(asset_df.shape)

Unnamed: 0,asset_name,collection_slug_nunique,num_sales_median,num_sales_sum,contract_type_last,nft_version_last,safelist_rate,collection_loyalty_median,collection_loyalty_sum,successful_rate,payment_type_last,absolute_price_median,absolute_price_sum
0,"""ASTRIRM"" (CryptoSkull #2317) #2/10",1,1.0,1.0,B1UE,3.0,0.0,1000.0,1000.0,1.0,Ether,91.4353,91.4353
1,"""POTATO"" COMPLETE #15",1,3.0,3.0,JNK,3.0,1.0,500.0,500.0,1.0,Ether,983.8282,983.8282
2,"""POTATO"" COMPLETE #67",1,2.0,2.0,JNK,3.0,1.0,500.0,500.0,1.0,Ether,975.10744,975.10744
3,"""POTATO"" DMND #5",1,2.0,2.0,JNK,3.0,1.0,500.0,500.0,1.0,Ether,3352.592,3352.592
4,"""SAVIOR"" (CryptoSkull #9817) #1/5",1,2.0,2.0,B1UE,3.0,0.0,1000.0,1000.0,1.0,Ether,106.67435,106.67435


(250915, 13)

## Asset-based Design Matrix

In [None]:
# Build the (to_addr x asset) design matrix, label the pairs observed in
# naive_matrix, scale the numeric features and down-sample the unobserved pairs.
ZERO_THRES = 10        # unobserved (label 0) pairs sampled per observed (label 1) pair
SAMPLE_SIZE = 100000   # rows kept after mixing positives and sampled negatives
SAMPLING_SEED = 1234   # random_state used for both sampling steps

all_to_addr = naive_matrix['to_addr'].unique()
all_asset_name = asset_df['asset_name'].unique()
# MultiIndex.from_product enumerates the same pairs, in the same order, as
# itertools.product, without first materialising a Python list of tuples.
design_matrix = pd.MultiIndex.from_product(
    [all_to_addr, all_asset_name], names=['to_addr', 'asset_name']
).to_frame(index=False)
design_matrix = design_matrix.merge(asset_df, on=['asset_name'], how='left')

# One row per observed (to_addr, asset_name) pair. Building the frame with
# dropna/drop_duplicates/assign avoids writing a new column into a slice of
# naive_matrix (chained assignment).
label_matrix = (
    naive_matrix[['to_addr', 'asset_name']]
    .dropna()
    .drop_duplicates()
    .assign(label=1)
)
design_matrix = design_matrix.merge(label_matrix, on=['to_addr', 'asset_name'], how='left')
design_matrix['label'] = design_matrix['label'].fillna(0)

# convert feature type: everything that is not a numeric feature (including the
# label) becomes a string column, so labels read '0.0' / '1.0' from here on
ss = StandardScaler()
numerical_cols = ["num_sales_median", "num_sales_sum", "collection_slug_nunique", "safelist_rate", "collection_loyalty_median",
                  "collection_loyalty_sum", "successful_rate", "absolute_price_median", "absolute_price_sum"]
for col in design_matrix.columns:
    if col not in numerical_cols:
        design_matrix[col] = design_matrix[col].astype(str)

# normalize
# NOTE: design_matrix_normalized is the same object as design_matrix here, so
# the scaling below also changes design_matrix (no second copy is allocated).
design_matrix_normalized = design_matrix
design_matrix_normalized[numerical_cols] = pd.DataFrame(
    ss.fit_transform(design_matrix_normalized[numerical_cols]),
    columns=numerical_cols,
    index=design_matrix_normalized.index,
)

# sample zero labels
is_positive = design_matrix_normalized['label'] != '0.0'
label_1 = design_matrix_normalized[is_positive]
negatives = design_matrix_normalized[~is_positive]
# Cap the requested sizes at what is available so .sample cannot raise on small data.
n_negatives = min(len(negatives), label_1.shape[0] * ZERO_THRES)
label_0 = negatives.sample(n_negatives, random_state=SAMPLING_SEED)
design_matrix_normalized = pd.concat([label_1, label_0])
design_matrix_normalized = design_matrix_normalized.sample(
    min(len(design_matrix_normalized), SAMPLE_SIZE), random_state=SAMPLING_SEED
)

display(design_matrix_normalized['label'].value_counts())
display(design_matrix_normalized.dtypes)
display(design_matrix_normalized.head())
display(design_matrix_normalized.shape)

0.0    90922
1.0     9078
Name: label, dtype: int64

to_addr                       object
asset_name                    object
collection_slug_nunique      float64
num_sales_median             float64
num_sales_sum                float64
contract_type_last            object
nft_version_last              object
safelist_rate                float64
collection_loyalty_median    float64
collection_loyalty_sum       float64
successful_rate              float64
payment_type_last             object
absolute_price_median        float64
absolute_price_sum           float64
label                         object
dtype: object

Unnamed: 0,to_addr,asset_name,collection_slug_nunique,num_sales_median,num_sales_sum,contract_type_last,nft_version_last,safelist_rate,collection_loyalty_median,collection_loyalty_sum,successful_rate,payment_type_last,absolute_price_median,absolute_price_sum,label
3659376,0x603d022611bfe6a101dcdab207d96c527f1d4d8e,MetaPirate #1409,-0.120201,0.165121,-0.029699,MP,3.0,0.681748,1.562743,0.005417,0.10234,Ether,-0.251913,-0.10711,0.0
1116911,0x0a267cf51ef038fc00e71801f5a524aec06e4f07,KENKYO: #110,-0.120201,-0.718454,-0.074197,KENKYO,3.0,-1.479222,0.14292,-0.02264,0.10234,Ether,0.180642,0.011337,0.0
2600719,0x6b58007b960016b2f559dbfd809ac4dcb1febdfe,Gen1 #14143,-0.120201,-0.718454,-0.074197,SQ,3.0,0.681748,0.379557,-0.017964,0.10234,Ether,-0.239575,-0.103731,0.0
813087,0x2af4b707e1dce8fc345f38cfeeaa2421e54976d5,Dealer 2559,-0.120201,-0.718454,-0.074197,DLR,3.0,-1.479222,0.379557,-0.017964,0.10234,Ether,-0.230964,-0.101373,0.0
351980,0x7f268357a8c2552623316e2562d90e642bb538e5,HAPE Community Badge,-0.120201,-0.718454,0.904764,HAPEBADGE,3.0,-1.479222,-0.803629,0.893904,0.10234,Ether,-0.252723,-0.096221,1.0


(100000, 15)

# Train Test Split

## FFM

In [None]:
from recommenders.datasets.pandas_df_utils import LibffmConverter

In [None]:
# Convert the sampled design matrix with LibffmConverter (rating column: 'label')
# and store the resulting field/feature counts as JSON in the data directory.
converter = LibffmConverter()
df_out = converter.fit_transform(design_matrix_normalized.copy(), col_rating='label')
meta_dict = dict(field_count=converter.field_count, feature_count=converter.feature_count)

meta_path = os.path.join(root, 'meta_asset.json')
with open(meta_path, 'w') as convert_file:
    json.dump(meta_dict, convert_file)

print(f"field_count={converter.field_count} feature_count={converter.feature_count}")

field_count=14 feature_count=85806


In [None]:
def write_df_to_ffm(df, file_path):
    """Write `df` to `file_path` inside the data directory, one space-separated row per line.

    Args:
        df (pandas.DataFrame): frame whose values are written as-is (`%s` formatting).
        file_path (str): file name, relative to `root`.
    """
    print(file_path, df.shape)
    np.savetxt(os.path.join(root, file_path), df.values, delimiter=" ", fmt="%s")


# Split 90/10, then split the 90% again so that validation is ~10% of the total.
# Both splits are seeded so the written train/valid files are identical on re-runs.
train_ffm, test_ffm = train_test_split(df_out, test_size=0.1, random_state=SEED)
train_ffm, valid_ffm = train_test_split(train_ffm, test_size=0.111111, random_state=SEED)
# NOTE(review): test_ffm is never written to disk; the later cells evaluate on
# mine_asset.ffm, which is the full converted frame (training rows included).
mine_ffm = df_out

write_df_to_ffm(train_ffm, "train_asset.ffm")
write_df_to_ffm(valid_ffm, "valid_asset.ffm")
write_df_to_ffm(mine_ffm, "mine_asset.ffm")

train_asset.ffm (80000, 15)
valid_asset.ffm (10000, 15)
mine_asset.ffm (100000, 15)


In [None]:
# Run configuration. The field/feature counts are read back from meta_asset.json
# in the data directory.
meta_path = os.path.join(root, 'meta_asset.json')
with open(meta_path) as meta_f:
    meta = json.load(meta_f)

EPOCHS_FOR_SYNTHETIC_RUN, EPOCHS_FOR_CRITEO_RUN, EPOCHS_FOR_OPENSEA_RUN = 15, 10, 5
BATCH_SIZE_SYNTHETIC, BATCH_SIZE_CRITEO, BATCH_SIZE_OPENSEA = 128, 4096, 256
RANDOM_SEED = SEED  # set to None for non-deterministic result
FIELD_COUNT, FEATURE_COUNT = meta['field_count'], meta['feature_count']

(FIELD_COUNT, FEATURE_COUNT)

(14, 85806)

In [None]:
import time
from os.path import join

"""
Override function to collect metrics
"""
class MyXDeepFMModel(XDeepFMModel):
    """XDeepFMModel whose `fit` also records per-epoch metrics into caller-supplied lists."""

    def fit(self, train_file, valid_file, steps, train_loss, eval_loss, eval_auc):
        """Fit the model with `train_file` and evaluate on `valid_file` after every epoch.

        One entry is appended to each of the four metric lists per epoch, so the
        caller can inspect or plot the training history afterwards.

        Args:
            train_file (str): training data set.
            valid_file (str): validation set.
            steps (list): receives the epoch number of every finished epoch.
            train_loss (list): receives the mean training loss of every epoch
                (NaN if the training file produced no batches).
            eval_loss (list): receives `logloss` from the validation result.
            eval_auc (list): receives `auc` from the validation result.

        Returns:
            object: An instance of self.
        """
        print("Method override")

        if self.hparams.write_tfevents:
            self.writer = tf.compat.v1.summary.FileWriter(
                self.hparams.SUMMARIES_DIR, self.sess.graph
            )

        train_sess = self.sess
        for epoch in range(1, self.hparams.epochs + 1):
            step = 0
            self.hparams.current_epoch = epoch

            epoch_loss = 0
            train_start = time.time()
            for (
                batch_data_input,
                impression,
                data_size,
            ) in self.iterator.load_data_from_file(train_file):
                step_result = self.train(train_sess, batch_data_input)
                (_, _, step_loss, step_data_loss, summary) = step_result
                if self.hparams.write_tfevents:
                    self.writer.add_summary(summary, step)
                epoch_loss += step_loss
                step += 1
                if step % self.hparams.show_step == 0:
                    print(
                        "step {0:d} , total_loss: {1:.4f}, data_loss: {2:.4f}".format(
                            step, step_loss, step_data_loss
                        )
                    )

            train_time = time.time() - train_start

            # Mean loss over the batches of this epoch; an empty training file
            # yields NaN instead of a ZeroDivisionError.
            mean_train_loss = epoch_loss / step if step else float("nan")

            if self.hparams.save_model:
                os.makedirs(self.hparams.MODEL_DIR, exist_ok=True)
                if epoch % self.hparams.save_epoch == 0:
                    save_path_str = join(self.hparams.MODEL_DIR, "epoch_" + str(epoch))
                    self.saver.save(sess=train_sess, save_path=save_path_str)

            eval_start = time.time()
            eval_res = self.run_eval(valid_file)
            train_info = "logloss loss:" + str(mean_train_loss)
            # Metrics are listed alphabetically by name.
            eval_info = ", ".join(
                str(name) + ":" + str(value)
                for name, value in sorted(eval_res.items(), key=lambda kv: kv[0])
            )
            eval_time = time.time() - eval_start

            print(
                "at epoch {0:d}".format(epoch)
                + "\ntrain info: "
                + train_info
                + "\neval info: "
                + eval_info
            )
            print(
                "at epoch {0:d} , train time: {1:.1f} eval time: {2:.1f}".format(
                    epoch, train_time, eval_time
                )
            )

            # Record this epoch's metrics in the caller-supplied lists.
            steps.append(epoch)
            train_loss.append(mean_train_loss)
            eval_loss.append(eval_res['logloss'])
            eval_auc.append(eval_res['auc'])

        if self.hparams.write_tfevents:
            self.writer.close()

        return self

In [None]:
# Model config, FFM data files and prediction output, all inside the data directory.
yaml_file, train_file, valid_file, test_file, output_file = (
    os.path.join(root, file_name)
    for file_name in (
        'xDeepFM.yaml',     # yaml_file
        'train_asset.ffm',  # train_file
        'valid_asset.ffm',  # valid_file
        'mine_asset.ffm',   # test_file
        'output_mine.txt',  # output_file
    )
)

In [None]:
yaml_file = os.path.join(root, 'xDeepFM.yaml')

# Keyword overrides passed to prepare_hparams together with the YAML file.
hparam_overrides = dict(
    FEATURE_COUNT=FEATURE_COUNT,
    FIELD_COUNT=FIELD_COUNT,
    cross_l2=0.001,
    embed_l2=0.001,
    layer_l2=0.001,
    learning_rate=0.0001,
    batch_size=BATCH_SIZE_OPENSEA,
    epochs=EPOCHS_FOR_OPENSEA_RUN,
    cross_layer_sizes=[20, 10],
    init_value=0.1,
    layer_sizes=[20, 20],
    use_Linear_part=True,
    use_CIN_part=True,
    use_DNN_part=True,
    user_dropout=True,
    dropout=[0.2, 0.2],
    # load_saved_model=True,
    # load_model_name="epoch_4"
)
hparams = prepare_hparams(yaml_file, **hparam_overrides)

input_creator = FFMTextIterator

## sometimes we don't want to train a model from scratch
## then we can load a pre-trained model like this:
## model.load_model(r'your_model_path')
model = MyXDeepFMModel(hparams, input_creator, seed=RANDOM_SEED)

# Metrics on test_file before any training has happened in this cell.
print(model.run_eval(test_file))

# Per-epoch history, filled in by MyXDeepFMModel.fit.
steps, train_loss, eval_loss, eval_auc = ([] for _ in range(4))
model.fit(train_file, valid_file, steps, train_loss, eval_loss, eval_auc)

# Metrics on test_file after training; also recorded with scrapbook.
res_syn = model.run_eval(test_file)
print(res_syn)
sb.glue("res_syn", res_syn)
model.predict(test_file, output_file)

Add linear part.
Add CIN part.
Add DNN part.
{'auc': 0.4593, 'logloss': 0.7781}
Method override
at epoch 1
train info: logloss loss:3.49217805466332
eval info: auc:0.6406, logloss:0.5738
at epoch 1 , train time: 23.4 eval time: 1.8
at epoch 2
train info: logloss loss:1.9939758286308558
eval info: auc:0.9427, logloss:0.2498
at epoch 2 , train time: 22.4 eval time: 1.8
at epoch 3
train info: logloss loss:1.100544926838372
eval info: auc:0.9825, logloss:0.139
at epoch 3 , train time: 23.2 eval time: 1.8
at epoch 4
train info: logloss loss:0.6121821909095533
eval info: auc:0.9913, logloss:0.0825
at epoch 4 , train time: 22.6 eval time: 1.7
at epoch 5
train info: logloss loss:0.35079016776892324
eval info: auc:0.9923, logloss:0.0681
at epoch 5 , train time: 22.4 eval time: 1.6
{'auc': 0.991, 'logloss': 0.0684}


<__main__.MyXDeepFMModel at 0x7f8505eb0390>

In [None]:
# Attach the predicted probabilities to the (to_addr, asset_name) pairs that were scored.
# Assumes the prediction file holds one score per line, in the row order of
# design_matrix_normalized — TODO confirm against model.predict.
with open(output_file) as output:
    # Blank lines (e.g. a trailing newline at the end of the file) are skipped.
    results = [float(line) for line in output if line.strip()]

# .copy() gives an independent frame, so the new columns are not written into a
# slice of design_matrix_normalized.
result_matrix = design_matrix_normalized[['to_addr', 'asset_name']].copy()
assert len(results) == result_matrix.shape[0]
result_matrix['probability'] = results
result_matrix['image_url'] = result_matrix['asset_name'].map(asset_url_map)
result_matrix['collection_slug'] = result_matrix['asset_name'].map(asset_collection_map)
# os.path.join works whether or not `root` ends with a path separator.
result_matrix.to_csv(os.path.join(root, 'result_matrix.csv'))
result_matrix.head(10)

Unnamed: 0,to_addr,asset_name,probability,image_url,collection_slug
3659376,0x603d022611bfe6a101dcdab207d96c527f1d4d8e,MetaPirate #1409,0.00624,https://lh3.googleusercontent.com/d0omRc2Sx0SS...,meta-pirates-project
1116911,0x0a267cf51ef038fc00e71801f5a524aec06e4f07,KENKYO: #110,0.014545,https://lh3.googleusercontent.com/sL4Xq9ixBbhi...,officialkenkyo
2600719,0x6b58007b960016b2f559dbfd809ac4dcb1febdfe,Gen1 #14143,0.007435,https://lh3.googleusercontent.com/tA17OQrYKnaY...,shadow-quest
813087,0x2af4b707e1dce8fc345f38cfeeaa2421e54976d5,Dealer 2559,0.084278,https://lh3.googleusercontent.com/a0Gc5U27mLx-...,ppa-dealers
351980,0x7f268357a8c2552623316e2562d90e642bb538e5,HAPE Community Badge,0.914298,https://lh3.googleusercontent.com/_xMxy7S9sXll...,hapebadge
14621234,0xef802fae6eb5b2de438c008a73d7d18c4df049f2,Elysium Chest #324,0.010048,https://lh3.googleusercontent.com/3uQC19Nmij3R...,elysiumclubnft
12783648,0xf2a0aaecaeef1a00d1eb6178fb30de5695519abf,Zukibirds #648,0.00991,https://lh3.googleusercontent.com/TE_vj_uMAl5c...,zukibirds
497593,0x7f268357a8c2552623316e2562d90e642bb538e5,tiny cats #3382,0.864509,https://lh3.googleusercontent.com/sLTpW24TXFBi...,tiny-cats-eth
12500110,0x396b002ec9123610e1e5990d8766c2a08ede8e37,Super Ordinary Villains #831,0.009738,https://lh3.googleusercontent.com/54AANDcB4-a9...,super-ordinary-villains-genesis
37814,0x0000000035634b55f3d99b071b5a354f48e10bef,Chain Scout #138,0.766194,https://storage.opensea.io/files/5a8e7d694894d...,chain-scouts-genesis
