# Test the NCF module under folder [cf_ec2](../cf_ec2) with ml-1m dataset, save the best model (using integrated modules with compile and fit components, with gmf and mlp pretrain)

#### 4/27/2020, test with original paper's dataset, troubleshoot xyin code, figure out the cause of the discrepancy

In [1]:
import numpy as np 
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import (
    Adam,
    Adamax,
    Adagrad,
    SGD,
    RMSprop
)
from tensorflow.keras.layers import (
    Embedding, 
    Input,
    Flatten, 
    Multiply, 
    Concatenate,
    Dense
)

import sys
sys.path.append('../')
from cf_ec2 import (
    GMF,
    MLP,
    NCF,
    Data,
    evaluation,
    evaluation_grouped
)

#### check original paper's dataset

In [2]:
!tree ../metadata/original_dataset/

[01;34m../metadata/original_dataset/[00m
├── item_input
├── labels
├── testNegatives
├── testRatings
├── train
└── user_input

0 directories, 6 files


In [3]:
import pickle

In [4]:
# Load the original dataset's training matrix, test positives and test negatives.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files when they come from a trusted source.
# encoding='latin1' is presumably needed because the files were pickled under
# Python 2 -- TODO confirm.
with open('../metadata/original_dataset/train','rb') as fp:
    train = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/testRatings','rb') as fp:
    testRatings = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/testNegatives','rb') as fp:
    testNegatives = pickle.load(fp, encoding='latin1')

In [5]:
type(train), type(testRatings), type(testNegatives)

(scipy.sparse.dok.dok_matrix, list, list)

In [6]:
testRatings[:5]

[[0, 25], [1, 133], [2, 207], [3, 208], [4, 222]]

In [7]:
len(testRatings), len(testNegatives)

(6040, 6040)

In [8]:
# Load the pre-built training inputs (users, items, labels) from the original dataset folder.
# NOTE(review): same pickle caveat as any unpickling -- trusted files only.
with open('../metadata/original_dataset/user_input','rb') as fp:
    user_input = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/item_input','rb') as fp:
    item_input = pickle.load(fp, encoding='latin1')
with open('../metadata/original_dataset/labels','rb') as fp:
    labels = pickle.load(fp, encoding='latin1')    

In [9]:
type(user_input), type(item_input), type(labels)

(list, list, list)

#### reformat the test dataset

In [10]:
# Preview the first five test positives next to the head of their sampled negatives.
for idx, value in enumerate(testRatings[:5]):
    print('{}: {} //{}'.format(idx, value, testNegatives[idx][:5]))

0: [0, 25] //[1064, 174, 2791, 3373, 269]
1: [1, 133] //[1072, 3154, 3368, 3644, 549]
2: [2, 207] //[2216, 209, 2347, 3, 1652]
3: [3, 208] //[3023, 1489, 1916, 1706, 1221]
4: [4, 222] //[1794, 3535, 108, 593, 466]


In [11]:
%%time
# Flatten the per-user test data into three parallel lists. Each user
# contributes the positive item first (label 1), followed by that user's
# negatives (label 0).
user_test, item_test, labels_test = [], [], []
for idx, rating in enumerate(testRatings):
    negatives = testNegatives[idx]
    n_rows = len(negatives) + 1
    user_test += [rating[0]] * n_rows
    item_test.extend([rating[1], *negatives])
    labels_test.extend([1] + [0] * len(negatives))

CPU times: user 20.3 ms, sys: 8.7 ms, total: 29 ms
Wall time: 28 ms


In [12]:
len(user_test)

604000

In [13]:
model4 = keras.models.load_model('../metadata/ncf4/ncf-best.hdf5')

In [14]:
# Ground-truth frame: one row per (user, candidate item) with a 1/0 label.
rating_true = pd.DataFrame(
    dict(userID=user_test, itemID=item_test, rating=labels_test)
)
# Sanity check: the first ten candidate items stored for user 3.
rating_true.loc[rating_true['userID'] == 3, 'itemID'].head(10)

300     208
301    3023
302    1489
303    1916
304    1706
305    1221
306    1191
307    2671
308      81
309    2483
Name: itemID, dtype: int64

In [15]:
testNegatives[3][:10]

[3023, 1489, 1916, 1706, 1221, 1191, 2671, 81, 2483, 941]

In [16]:
testRatings[3]

[3, 208]

In [17]:
rating_true.loc[(rating_true.userID==3)&(rating_true.rating==1),'itemID'].values[0]

208

In [18]:
#### try the paper's code with xyin data

from tqdm.notebook import trange
import math
import heapq # for retrieval topK
import multiprocessing
import numpy as np
from time import time
#from numba import jit, autojit

# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None

def evaluate_model(model, testRatings, testNegatives, K, num_thread):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation.

    The arguments are published through the module-level globals ``_model``,
    ``_testRatings``, ``_testNegatives`` and ``_K`` because
    ``eval_one_rating`` only receives an index and reads everything else
    from those globals.

    Args:
        model: object passed on to ``eval_one_rating`` through ``_model``.
        testRatings: sequence with one entry per test rating; only its
            length is used here.
        testNegatives: stored in ``_testNegatives`` for ``eval_one_rating``.
        K: cut-off of the ranked list, stored in ``_K``.
        num_thread: number of worker *processes*; values <= 1 evaluate
            sequentially in the current process with a progress bar.

    Return: score of each test rating, as a ``(hits, ndcgs)`` tuple of lists.
    """
    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K

    if num_thread > 1:  # Multi-process
        # The context manager tears the pool down even when a worker raises,
        # so no worker processes are left behind on failure.
        with multiprocessing.Pool(processes=num_thread) as pool:
            res = pool.map(eval_one_rating, range(len(_testRatings)))
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (hits, ndcgs)

    # Single process
    hits, ndcgs = [], []
    for idx in trange(len(_testRatings)):
        (hr, ndcg) = eval_one_rating(idx)
        hits.append(hr)
        ndcgs.append(ndcg)
    return (hits, ndcgs)

def eval_one_rating(idx):
    """
    Score one test user from the rows of the ``rating_true`` frame.

    Reads the globals ``_model`` and ``_K`` plus the notebook-level
    ``rating_true`` frame. ``idx`` is used both as the ``userID`` to filter
    on and as the user id fed to the model.

    Returns:
        ``(hr, ndcg)`` for this user's ground-truth item within the top
        ``_K`` ranked candidates.
    """
    # Select this user's rows once instead of masking the whole frame twice.
    user_rows = rating_true.loc[rating_true.userID == idx]
    negatives = user_rows.loc[user_rows.rating == 0, 'itemID'].tolist()
    gtItem = user_rows.loc[user_rows.rating == 1, 'itemID'].values[0]
    u = idx

    # Candidates are the negatives followed by the ground-truth item.
    items = negatives + [gtItem]

    # Get prediction scores
    users = np.full(len(items), u, dtype='int32')
    predictions = _model.predict([users, np.array(items)],
                                 batch_size=100, verbose=0)
    # Keyed by item id: an item that is listed twice (e.g. the ground-truth
    # item also present among the negatives) collapses into a single entry.
    map_item_score = dict(zip(items, predictions))

    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    """Return 1 when ``gtItem`` occurs in ``ranklist``, otherwise 0."""
    return 1 if any(candidate == gtItem for candidate in ranklist) else 0

def getNDCG(ranklist, gtItem):
    """
    Return the NDCG contribution of ``gtItem`` in ``ranklist``.

    The value is log(2) / log(position + 2) for the first 0-based position
    at which ``gtItem`` occurs, or 0 when it is not in the list.
    """
    for position, candidate in enumerate(ranklist):
        if candidate == gtItem:
            return math.log(2) / math.log(position + 2)
    return 0

In [19]:
%%time

hrs,ndcgs = evaluate_model(model4, testRatings, testNegatives, 10, 1)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


CPU times: user 2min 18s, sys: 3.43 s, total: 2min 21s
Wall time: 2min 18s


In [20]:
hr, ndcg = np.array(hrs).mean(), np.array(ndcgs).mean()
hr, ndcg

(0.6504966887417218, 0.37691241349812515)

In [21]:
#### the result is the same as the paper reported!! nothing is wrong with dataframe rating_true

#### test xyin code again

In [22]:
%%time

from tqdm.notebook import trange

## create placeholder for the flat list of predictions
list_preds = []

## index each user's candidate items once, instead of re-scanning the whole
## frame with a boolean mask on every iteration
items_by_user = rating_true.groupby('userID', sort=False)['itemID']

## get predictions for each user-item pair
for idx in trange(len(testRatings)):
    item = items_by_user.get_group(idx).tolist()
    user = [idx]*len(item)
    list_preds.extend(
        model4.predict(
            x=[
                np.array(user),
                np.array(item)
            ]
        ).flatten()
    )
## create a pandas dataframe
## NOTE: list_preds is filled user by user for idx = 0, 1, 2, ..., so it only
## lines up with rating_true row by row when rating_true holds each user's
## rows contiguously in ascending userID order.
all_predictions_test = pd.DataFrame(data={
    'userID': rating_true.userID,
    'itemID': rating_true.itemID,
    'prediction': list_preds
})

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


CPU times: user 2min 35s, sys: 5.9 s, total: 2min 41s
Wall time: 2min 33s


In [23]:
rating_true.shape

(604000, 3)

In [24]:
all_predictions_test.head(3)

Unnamed: 0,userID,itemID,prediction
0,0,25,0.787151
1,0,1064,0.399748
2,0,174,0.732144


In [25]:
all_predictions_test.shape

(604000, 3)

In [26]:
%%time

k = 10
# For every user keep the k rows with the highest predicted score.
# Rows are not de-duplicated here: a (userID, itemID) pair that occurs twice
# in all_predictions_test can occupy two of the k slots.
topKItems_test = (
    all_predictions_test
    .groupby('userID', as_index=False)
    .apply(lambda user_rows: user_rows.nlargest(k, 'prediction'))
    .reset_index(drop=True)
)
## append rank
topKItems_test['rnk'] = topKItems_test.groupby('userID', sort=False).cumcount() + 1

CPU times: user 9.69 s, sys: 185 ms, total: 9.88 s
Wall time: 9.88 s


In [27]:
topKItems_test.head(20)

Unnamed: 0,userID,itemID,prediction,rnk
0,0,128,0.95447,1
1,0,25,0.787151,2
2,0,174,0.732144,3
3,0,273,0.725887,4
4,0,464,0.595037,5
5,0,175,0.496194,6
6,0,1064,0.399748,7
7,0,1182,0.390909,8
8,0,487,0.38291,9
9,0,1331,0.293911,10


In [28]:
# Keep only the first three columns of the top-k frame (i.e. drop the rank column).
rating_pred = topKItems_test.iloc[:, :3].copy()
rating_pred.head(3)

Unnamed: 0,userID,itemID,prediction
0,0,128,0.95447
1,0,25,0.787151
2,0,174,0.732144


In [29]:
# Inner-join the positive ground-truth rows with the top-k predictions;
# every surviving row is a hit.
overlap = rating_true.loc[rating_true['rating'] > 0].merge(
    rating_pred,
    on=['userID', 'itemID'],
)

In [30]:
overlap.shape

(4029, 4)

In [31]:
overlap.head(3)

Unnamed: 0,userID,itemID,rating,prediction
0,0,25,1,0.787151
1,1,133,1,0.538144
2,2,207,1,0.29041


In [32]:
overlap.shape[0]/rating_true.loc[rating_true.rating>0].shape[0] #### slightly different from author's code result above

0.6670529801324503

In [33]:
# NDCG at k=10 from the project's evaluation module; only the positive rows
# (rating > 0) of rating_true are passed as ground truth.
ndcg = evaluation.ndcg_at_k(
    rating_true.loc[rating_true.rating>0],
    rating_pred,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    col_pred='prediction',
    k=10
)

In [34]:
ndcg

0.3841584365970973

In [35]:
%%time

# Both metrics take the same column names and cut-off, so those are spelled
# out once. The positive-only ground truth is rebuilt for each call so the
# two calls never share a frame object.
_metric_kwargs = dict(
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    col_pred='prediction',
    k=10,
)

recall = evaluation.recall_at_k(
    rating_true.loc[rating_true.rating > 0], rating_pred, **_metric_kwargs
)

precision = evaluation.precision_at_k(
    rating_true.loc[rating_true.rating > 0], rating_pred, **_metric_kwargs
)

CPU times: user 23.8 s, sys: 458 ms, total: 24.3 s
Wall time: 24.1 s


In [36]:
recall, precision ## precision is 10 times lower?? (expected: precision divides each user's hits by k=10, recall divides by the number of actual positives, which is 1 per user here)

(0.6670529801324503, 0.06670529801324505)

#### troubleshoot xyin's evaluation code

In [37]:
%%time

# Call the helper from the evaluation module directly to inspect the
# intermediate results. It returns three values; judging by the names they
# are unpacked into, these are the per-hit rows, the per-user hit counts and
# the number of users -- verify against the module.
df_hit, df_hit_count, n_users = evaluation.mergeTrueAndPredWithRank(
    rating_true.loc[rating_true.rating>0],
    rating_pred,
    col_user='userID',
    col_item='itemID',
    col_rating='rating',
    col_pred='prediction',
    k=10
)

CPU times: user 11.3 s, sys: 1.3 s, total: 12.6 s
Wall time: 12.7 s


In [38]:
df_hit.head(3)

Unnamed: 0,userID,itemID,rnk
0,0,25,2
1,1,133,7
2,2,207,7


In [39]:
df_hit.shape, df_hit.userID.nunique() ## can one userID appear more than once??

((4029, 3), 3929)

In [40]:
df_hit_count.head(3)

Unnamed: 0,userID,hit,actual
0,0,1,1
1,1,1,1
2,2,1,1


In [41]:
df_hit_count.hit.unique(), df_hit_count.actual.unique(), df_hit_count.shape, df_hit_count.userID.nunique()

(array([1, 2]), array([1]), (3929, 3), 3929)

In [42]:
df_hit_count.loc[df_hit_count.hit==2]

Unnamed: 0,userID,hit,actual
9,12,2,1
11,14,2,1
16,23,2,1
20,30,2,1
81,116,2,1
...,...,...,...
3773,5774,2,1
3864,5925,2,1
3890,5972,2,1
3899,5987,2,1


In [43]:
df_hit.loc[df_hit.userID==12]

Unnamed: 0,userID,itemID,rnk
9,12,844,5
10,12,844,6


In [44]:
#### precision

(df_hit_count['hit']/10).sum() / n_users

0.06670529801324505

In [45]:
(df_hit_count['hit']).sum() / n_users

0.6670529801324503

#### troubleshoot the mergeTrueAndPredWithRank module
(line 48 - line 82)

In [46]:
rating_true.shape, rating_pred.shape 
## rating_true has all 100 items for each user, rating_pred only has top10

((604000, 3), (60400, 3))

In [48]:
rating_true.userID.nunique(), rating_pred.userID.nunique()

(6040, 6040)

In [51]:
## find shared users between pred and actual
## users that occur in both the ground truth and the predictions
common_users = set(rating_true[col_user].unique()) & set(rating_pred[col_user].unique())
n_users = len(common_users)
## restrict both frames to those shared users
rating_true_common = rating_true.loc[rating_true[col_user].isin(common_users)]
rating_pred_common = rating_pred.loc[rating_pred[col_user].isin(common_users)]
rating_true_common.shape, rating_pred_common.shape

((604000, 3), (60400, 3))

In [55]:
%%time

# Column holding the model score, and the cut-off for the ranked list.
col_pred = 'prediction'
topK = 10
## get ranked pred output
# rating_pred_common already holds only 10 rows per user (60400 rows for 6040 users).
topKItems = evaluation.getTopK(rating_pred_common,col_user,col_pred,topK)
topKItems.shape

CPU times: user 10.8 s, sys: 237 ms, total: 11 s
Wall time: 10.9 s


(60400, 4)

In [56]:
topKItems.head(3)

Unnamed: 0,userID,itemID,prediction,rnk
0,0,128,0.95447,1
1,0,25,0.787151,2
2,0,174,0.732144,3


In [58]:
## NOTE(review): both tuple elements are rating_true_common; the second was presumably meant to be rating_pred_common
rating_true_common.shape, rating_true_common.shape

((604000, 3), (604000, 3))

In [59]:
rating_true_common.head(3)

Unnamed: 0,userID,itemID,rating
0,0,25,1
1,0,1064,0
2,0,174,0


In [61]:
rating_true.loc[rating_true.rating>0,'userID'].agg(['count','nunique'])

count      6040
nunique    6040
Name: userID, dtype: int64

In [63]:
col_user, col_item = 'userID', 'itemID'

## match ranked pred with actual: inner-join the top-k rows with the positive
## ground-truth rows and keep the user, item and rank columns
df_hit = topKItems.merge(
    rating_true_common.loc[rating_true_common.rating > 0],
    on=[col_user, col_item],
).loc[:, [col_user, col_item, 'rnk']]

In [64]:
df_hit.shape, df_hit.userID.nunique()

((4029, 3), 3929)

In [66]:
rating_true_common.loc[rating_true_common.rating>0,'userID'].agg(['count','nunique'])

count      6040
nunique    6040
Name: userID, dtype: int64

In [67]:
rating_true_common.loc[rating_true_common.rating>0].head(3)

Unnamed: 0,userID,itemID,rating
0,0,25,1
100,1,133,1
200,2,207,1


In [68]:
topKItems.shape, topKItems.userID.nunique()

((60400, 4), 6040)

In [70]:
topKItems.loc[topKItems.userID==12]

Unnamed: 0,userID,itemID,prediction,rnk
120,12,1018,0.973504,1
121,12,538,0.943523,2
122,12,737,0.763229,3
123,12,713,0.61711,4
124,12,844,0.539554,5
125,12,844,0.539554,6
126,12,208,0.50931,7
127,12,115,0.506679,8
128,12,1693,0.503947,9
129,12,396,0.249573,10


In [71]:
rating_true_common.loc[(rating_true_common.rating>0)&(rating_true_common.userID==12)]

Unnamed: 0,userID,itemID,rating
1200,12,844,1


In [72]:
#### there are duplicates in the topK calculation????

In [73]:
rating_pred_common.loc[rating_pred_common.userID==12]

Unnamed: 0,userID,itemID,prediction
120,12,1018,0.973504
121,12,538,0.943523
122,12,737,0.763229
123,12,713,0.61711
124,12,844,0.539554
125,12,844,0.539554
126,12,208,0.50931
127,12,115,0.506679
128,12,1693,0.503947
129,12,396,0.249573


In [None]:
#### there are duplicates in rating_pred_common????

In [74]:
rating_pred.loc[rating_pred.userID==12]

Unnamed: 0,userID,itemID,prediction
120,12,1018,0.973504
121,12,538,0.943523
122,12,737,0.763229
123,12,713,0.61711
124,12,844,0.539554
125,12,844,0.539554
126,12,208,0.50931
127,12,115,0.506679
128,12,1693,0.503947
129,12,396,0.249573


In [75]:
#### there are duplicates in rating_pred

In [76]:
topKItems_test.loc[topKItems_test.userID==12]

Unnamed: 0,userID,itemID,prediction,rnk
120,12,1018,0.973504,1
121,12,538,0.943523,2
122,12,737,0.763229,3
123,12,713,0.61711,4
124,12,844,0.539554,5
125,12,844,0.539554,6
126,12,208,0.50931,7
127,12,115,0.506679,8
128,12,1693,0.503947,9
129,12,396,0.249573,10


In [None]:
#### there are duplicates in topKItems_test!!!

In [80]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

all_predictions_test.loc[
    (all_predictions_test.userID==12)&(all_predictions_test.itemID==844)
].sort_values(by='itemID')

Unnamed: 0,userID,itemID,prediction
1200,12,844,0.539554
1219,12,844,0.539554


In [81]:
#### there are duplicates in all_predictions_test!!!

In [84]:
rating_true.loc[(rating_true.userID==12)&(rating_true.itemID==844)]

Unnamed: 0,userID,itemID,rating
1200,12,844,1
1219,12,844,0


In [85]:
#### same userID and itemID, but different rating labels (1 and 0)????

In [89]:
testRatings[10:15], len(testRatings)

([[10, 820], [11, 829], [12, 844], [13, 105], [14, 519]], 6040)

In [110]:
sorted(testNegatives[12])[10:30]

[543,
 713,
 737,
 747,
 844,
 912,
 932,
 1018,
 1034,
 1083,
 1108,
 1126,
 1180,
 1230,
 1239,
 1283,
 1296,
 1343,
 1402,
 1433]

#### there is bad data in the negative samples: the same item can be both a positive and a negative sample for one user????

#### try the author's code, see if handles the bad data

In [111]:
import math
import heapq # for retrieval topK
import multiprocessing
import numpy as np
from time import time
#from numba import jit, autojit

# Global variables that are shared across processes
_model = model4
_testRatings = testRatings
_testNegatives = testNegatives
_K = 10

# User whose ground-truth item also appears among the sampled negatives.
idx = 12

rating = _testRatings[idx]
items = _testNegatives[idx]
u = rating[0]
gtItem = rating[1]
# Score a new list (negatives followed by the ground-truth item) instead of
# appending to and popping from testNegatives[idx] in place, so the shared
# test data stays untouched even when predict() fails part-way.
candidates = items + [gtItem]
# Get prediction scores
users = np.full(len(candidates), u, dtype='int32')
predictions = _model.predict([users, np.array(candidates)],
                             batch_size=100, verbose=0)
# Keyed by item id, so an item that is listed twice collapses into one entry.
map_item_score = dict(zip(candidates, predictions))

# Evaluate top rank list
ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
hr = getHitRatio(ranklist, gtItem)
ndcg = getNDCG(ranklist, gtItem)

print(hr, ndcg)

1 0.3868528072345416


In [114]:
#### in xyin's code
df_hit.loc[df_hit.userID==12]

Unnamed: 0,userID,itemID,rnk
9,12,844,5
10,12,844,6


In [115]:
df_hit_count[df_hit_count.userID==12]

Unnamed: 0,userID,hit,actual
9,12,2,1


##### the author's code doesn't double count the item even if it appears in both the positive and negative samples (LUCKY!!!)

#### fix xyin's stats by marking hit=2 as hit=1

In [118]:
# Users whose item was matched twice get their hit count reset to 1,
# then precision and recall are recomputed from the corrected counts.
df_hit_count.loc[df_hit_count['hit'].eq(2), 'hit'] = 1
precision = df_hit_count['hit'].div(k).sum() / n_users
recall = df_hit_count['hit'].div(df_hit_count['actual']).sum() / n_users
precision, recall #### recall matches the literature's hit rate

(0.0650496688741722, 0.6504966887417218)

In [119]:
df_hit.loc[df_hit.userID==12]

Unnamed: 0,userID,itemID,rnk
9,12,844,5
10,12,844,6


In [None]:
## NOTE(review): this cell has no execution count, and `df` and `col_rating`
## are not defined anywhere in the visible notebook, so it would raise a
## NameError if run as-is. Presumably a reference copy of the top-K ranking
## logic from the evaluation module -- TODO confirm.
topKItems = df.groupby(col_user, as_index=False)\
    .apply(lambda items: items.nlargest(k,col_rating))\
        .reset_index(drop=True)
## append rank
topKItems['rnk'] = topKItems.groupby(col_user,sort=False).cumcount()+1

In [122]:
df_hit['rnk2'] = df_hit.groupby('userID',sort=False).cumcount()+1

In [123]:
df_hit.loc[df_hit.userID==12]

Unnamed: 0,userID,itemID,rnk,rnk2
9,12,844,5,1
10,12,844,6,2


In [124]:


# Discounted gain for hit items, using only each user's first hit row
# (rnk2 == 1) so that a doubly listed item is counted once.
df_dcg = df_hit[df_hit["rnk2"] == 1].copy()
# every hit has relevance 1, so the gain is just the rank discount
df_dcg["dcg"] = 1 / np.log1p(df_dcg["rnk"])
# per-user discounted cumulative gain
df_dcg = df_dcg.groupby(col_user, as_index=False, sort=False).agg({"dcg": "sum"})
# ideal DCG: the best achievable gain given each user's number of actual items
df_ndcg = df_dcg.merge(df_hit_count, on=[col_user])
df_ndcg["idcg"] = [
    sum(1 / np.log1p(range(1, min(n_actual, k) + 1)))
    for n_actual in df_ndcg["actual"]
]

# normalized DCG = DCG / IDCG, averaged over all users
ndcg = df_ndcg["dcg"].div(df_ndcg["idcg"]).sum() / n_users
ndcg

0.3769124134981252

#### try the paper's code

In [51]:
import math
import heapq # for retrieval topK
import multiprocessing
import numpy as np
from time import time
#from numba import jit, autojit

# Global variables that are shared across processes
_model = None
_testRatings = None
_testNegatives = None
_K = None

def evaluate_model(model, testRatings, testNegatives, K, num_thread):
    """
    Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation.

    The arguments are copied into the module-level globals ``_model``,
    ``_testRatings``, ``_testNegatives`` and ``_K``, which is where
    ``eval_one_rating`` reads them from (it is only handed an index).

    Args:
        model: object stored in ``_model`` for ``eval_one_rating``.
        testRatings: sequence with one entry per test rating; only its
            length is used here.
        testNegatives: stored in ``_testNegatives`` for ``eval_one_rating``.
        K: cut-off of the ranked list, stored in ``_K``.
        num_thread: number of worker *processes*; values <= 1 evaluate
            sequentially in the current process with a progress bar.

    Return: score of each test rating, as a ``(hits, ndcgs)`` tuple of lists.
    """
    global _model
    global _testRatings
    global _testNegatives
    global _K
    _model = model
    _testRatings = testRatings
    _testNegatives = testNegatives
    _K = K

    if num_thread > 1:  # Multi-process
        # Using the pool as a context manager guarantees it is shut down
        # when a worker raises, instead of leaking worker processes.
        with multiprocessing.Pool(processes=num_thread) as pool:
            res = pool.map(eval_one_rating, range(len(_testRatings)))
        hits = [r[0] for r in res]
        ndcgs = [r[1] for r in res]
        return (hits, ndcgs)

    # Single process
    hits, ndcgs = [], []
    for idx in trange(len(_testRatings)):
        (hr, ndcg) = eval_one_rating(idx)
        hits.append(hr)
        ndcgs.append(ndcg)
    return (hits, ndcgs)

def eval_one_rating(idx):
    """
    Score the ``idx``-th test rating against its sampled negatives.

    Reads the globals ``_model``, ``_testRatings``, ``_testNegatives`` and
    ``_K``. The user id and ground-truth item come from
    ``_testRatings[idx]``; the other candidates from ``_testNegatives[idx]``.

    Returns:
        ``(hr, ndcg)`` for the ground-truth item within the top ``_K``
        ranked candidates.
    """
    rating = _testRatings[idx]
    u = rating[0]
    gtItem = rating[1]

    # Work on a copy: appending to and popping from the shared negatives
    # list would leave the test data modified if predict() raised in between.
    items = list(_testNegatives[idx])
    items.append(gtItem)

    # Get prediction scores
    users = np.full(len(items), u, dtype='int32')
    predictions = _model.predict([users, np.array(items)],
                                 batch_size=100, verbose=0)
    # Keyed by item id: when the ground-truth item also occurs among the
    # negatives, both occurrences collapse into a single entry.
    map_item_score = dict(zip(items, predictions))

    # Evaluate top rank list
    ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
    hr = getHitRatio(ranklist, gtItem)
    ndcg = getNDCG(ranklist, gtItem)
    return (hr, ndcg)

def getHitRatio(ranklist, gtItem):
    """Hit indicator: 1 if ``gtItem`` is among ``ranklist``, else 0."""
    found = any(entry == gtItem for entry in ranklist)
    return int(found)

def getNDCG(ranklist, gtItem):
    """
    NDCG of a single relevant item.

    Returns log(2) / log(position + 2) where ``position`` is the 0-based
    index of the first occurrence of ``gtItem`` in ``ranklist``; returns 0
    when the item is absent.
    """
    first_match = next(
        (pos for pos, entry in enumerate(ranklist) if entry == gtItem),
        None,
    )
    if first_match is None:
        return 0
    return math.log(2) / math.log(first_match + 2)

In [52]:
%%time

hrs,ndcgs = evaluate_model(model4, testRatings, testNegatives, 10, 1)

HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))


CPU times: user 1min 49s, sys: 3.03 s, total: 1min 52s
Wall time: 1min 49s


In [53]:
hrs[:10]

[1, 1, 1, 1, 0, 1, 1, 1, 1, 0]

In [54]:
ndcgs[:10]

[0.6309297535714574,
 0.33333333333333337,
 0.33333333333333337,
 0.6309297535714574,
 0,
 1.0,
 0.5,
 1.0,
 0.2890648263178878,
 0]

In [56]:
hr, ndcg = np.array(hrs).mean(), np.array(ndcgs).mean()

In [57]:
hr

0.6504966887417218

In [58]:
ndcg

0.37691241349812515

# Finally, we are able to reproduce the Paper's result with the author's dataset !!!!

## Next step will be to check the corresponding result with the xyin dataset