<a href="https://colab.research.google.com/github/ts01174755/DS_Project/blob/main/KG_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import math
import pprint
import csv

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import seaborn as sns

from collections import defaultdict, deque
from datetime import datetime, timedelta
from tqdm import tqdm
# from sklearn.utils import shuffle

In [None]:
# Colab line magic: ask the hosted runtime for a TensorFlow 1.x build before TensorFlow is imported.
# NOTE(review): presumably needed because the AmpliGraph release installed below targets TF 1.x — confirm,
# and confirm that the runtime in use still honours this magic.
%tensorflow_version 1.x

In [None]:
import tensorflow as tf 

# Report which TensorFlow build the runtime actually loaded.
tf_version_banner = 'TensorFlow  version: {}'.format(tf.__version__)
print(tf_version_banner)

# Ask TensorFlow for its GPU device name. The notebook refuses to continue
# unless that name is exactly '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
%%capture 
# Setup cell: %%capture keeps the installers' console output out of the notebook.
# NOTE(review): none of the packages below is version-pinned, so a fresh runtime may
# resolve to different releases than the ones this notebook was written against —
# confirm the intended versions and pin them if reproducibility matters.

# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

# Prepare Data

In [None]:
# Raw Data
# Purchases are listed per account and categories per product. Both tables are
# then expanded into [head, relation, tail] triples: all 'buy' triples first
# (account by account), followed by all 'category' triples.
purchases = [
    ('accountID1', ['productID1', 'productID2', 'productID3', 'productID4', 'productID5']),
    ('accountID2', ['productID1', 'productID2', 'productID3']),
    ('accountID3', ['productID3', 'productID4', 'productID5']),
]
product_categories = [
    ('productID1', 'cloth'),
    ('productID2', 'Pants'),
    ('productID3', 'coat'),
    ('productID4', 'ring'),
    ('productID5', 'hat'),
    ('productID6', 'cloth'),    # productID6 is the new product: no account has bought it
]

data = [
    [account, 'buy', product]
    for account, products in purchases
    for product in products
]
data += [[product, 'category', category] for product, category in product_categories]
print(data[:3])

In [None]:
# Knowledge-graph construction, top-down.
# Layout: kg_graph[head][relation] -> set of tail entities.
kg_graph = defaultdict(lambda: defaultdict(set))
for triple in tqdm(data):
    head, relation, tail = triple[0], triple[1], triple[2]
    # Any entity that appears as the head of a 'buy' triple is registered as an
    # account under the synthetic 'root' node.
    if relation == 'buy':
        kg_graph['root']['account'].add(head)
    kg_graph[str(head)][str(relation)].add(str(tail))

# Patch for the new product: productID6 is never the tail of a 'buy' triple, so a
# synthetic 'ADMIN' account is attached to 'root' and linked to it via 'require'.
# NOTE(review): presumably this exists so that a traversal starting at 'root'
# can reach productID6 — confirm.
kg_graph['root']['account'].add('ADMIN')
kg_graph['ADMIN']['require'].add('productID6')
print(kg_graph)

In [None]:
# Path-based inference:
#   account -buy-> product -category-> category   implies   account -require-> category
for account in tqdm(kg_graph['root']['account']):
    for product in kg_graph[account]['buy']:
        bought_categories = kg_graph[product]['category']
        # Only touch the 'require' set when there is something to add, so no
        # empty 'require' entry is created for the account.
        if bought_categories:
            kg_graph[account]['require'].update(bought_categories)
print()
print(kg_graph['accountID1'])

In [None]:
# Write FILE
# Export the graph as (Entity1, Relation, Entity2) rows, breadth-first from 'root'.
#  * Each entity is queued and expanded once. Since an entity's outgoing edges
#    live in sets, expanding it once already writes every edge exactly once, so
#    no separate "edge already written" bookkeeping is needed.
#  * Entities are looked up with .get() rather than kg_graph[...]: indexing the
#    defaultdict would insert an empty entry for every leaf entity, i.e. the
#    export would silently grow the graph.
# Rows and their order are the same as with the previous edge-deduplicating loop.
with open('./kg_data.csv', 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['Entity1', 'Relation', 'Entity2'])
    writer.writeheader()

    visited = {'root'}
    frontier = deque(['root'])
    while frontier:
        entity1 = frontier.popleft()
        for relation, targets in kg_graph.get(entity1, {}).items():
            for entity2 in targets:
                writer.writerow({
                    'Entity1': entity1,
                    'Relation': relation,
                    'Entity2': entity2
                })
                # Queue the target only the first time it is seen.
                if entity2 not in visited:
                    visited.add(entity2)
                    frontier.append(entity2)

# KG_Model

In [None]:
# Imports for the embedding-model section.
# NOTE(review): tensorflow is already imported in an earlier cell, and of the names
# imported here only ComplEx and save_model are used by the cells visible in this
# notebook (restore_model appears only in commented-out code) — confirm whether the
# remaining imports are still needed.
import tensorflow as tf
import ampligraph
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model
from sklearn.utils import shuffle

# Print the installed AmpliGraph version.
print('Ampligraph version: {}'.format(ampligraph.__version__))

In [None]:
# Load the exported triples.
# Every column is read verbatim as a string: with pandas' default inference an
# entity label such as 'NA' or 'null' would become NaN and an all-numeric ID
# column would become integers, and neither would match the string labels that
# the scoring cells pass to model.predict().
kg_data = pd.read_csv('./kg_data.csv', dtype=str, keep_default_na=False)
print(kg_data.head())

In [None]:
# KG model training
# Hyper-parameters are grouped by concern and handed to ComplEx in a single call.

# NOTE(review): 'margin' is normally a pairwise-loss setting — confirm that the
# 'multiclass_nll' loss actually reads it.
loss_settings = {
    'loss': 'multiclass_nll',
    'loss_params': {'margin': 1},
}
optimizer_settings = {
    'optimizer': 'sgd',
    'optimizer_params': {'lr': 3e-2, 'end_lr': 5e-5, 'decay_cycle': 30, 'decay_lr_rate': 1.5},
}
regularizer_settings = {
    'regularizer': 'LP',
    'regularizer_params': {'p': 3, 'lambda': 0.01},
}

model = ComplEx(
    batches_count=2, seed=17, epochs=100, k=50, eta=2, verbose=True,
    **loss_settings, **optimizer_settings, **regularizer_settings
)

# Fit on every row of the exported CSV, converted to a 2-D array of triples.
training_triples = kg_data.to_numpy()
model.fit(training_triples)

# Persist the fitted model under the file name 'ComplEx.pkl'.
save_model(model, 'ComplEx.pkl')

In [None]:
# Demand ('require') scores of one account for every category
target_account = 'accountID2'
candidate_categories = ['cloth', 'Pants', 'coat', 'ring', 'hat']

# Three parallel columns (head, relation, tail); column_stack turns them into
# one row per candidate triple.
person_category_triples = [
    [target_account] * len(candidate_categories),
    ['require'] * len(candidate_categories),
    candidate_categories,
]
hypothesis = np.column_stack(person_category_triples)

triple_score = model.predict(hypothesis)
print('Triple of interest:\n', hypothesis)
print('Triple Score:\n', triple_score)

In [None]:
# Category scores for the new product
# Score the 'category' relation between the new product and every known category.
new_product = 'productID6'
category_candidates = ['cloth', 'Pants', 'coat', 'ring', 'hat']

# Three parallel columns (head, relation, tail); column_stack turns them into
# one row per candidate triple.
product_category_triples = [
    [new_product] * len(category_candidates),
    ['category'] * len(category_candidates),
    category_candidates,
]
hypothesis = np.column_stack(product_category_triples)

triple_score = model.predict(hypothesis)
print('Triple of interest:\n', hypothesis)
print('Triple Score:\n', triple_score)

In [None]:
# Preference ('buy') scores of each account for the new product
candidate_accounts = ['accountID1', 'accountID2', 'accountID3']

# Three parallel columns (head, relation, tail); column_stack turns them into
# one row per candidate triple.
person_product_triples = [
    candidate_accounts,
    ['buy'] * len(candidate_accounts),
    ['productID6'] * len(candidate_accounts),
]
hypothesis = np.column_stack(person_product_triples)

triple_score = model.predict(hypothesis)
print('Triple of interest:\n', hypothesis)
print('Triple Score:\n', triple_score)

In [None]:
# from ampligraph.utils import create_tensorboard_visualizations

# model = restore_model('ComplEx.pkl')

# create_tensorboard_visualizations(model, 'embeddings_model')

# Only run if using jupyter notebook 
# ! tensorboard --logdir='./embeddings_model'

# Visualizing

In [None]:
# Enumerate the 'require' score of every (account, category) pair.
# The account and category lists are defined once and reused for both the
# triples and the table below, so the two can no longer drift apart.
radar_accounts = ['accountID1', 'accountID2', 'accountID3']
radar_categories = ['cloth', 'Pants', 'coat', 'ring', 'hat']

# Account-major order: all categories of the first account, then the second, ...
person_category_triples = [
    [account_, 'require', category_]
    for account_ in radar_accounts
    for category_ in radar_categories
]

# predict() receives an explicit ndarray, as in the other scoring cells.
# Negative scores are clamped to 0 in one vectorised step.
person_category_scores = np.maximum(
    np.asarray(model.predict(np.array(person_category_triples))), 0
)

print(person_category_scores)

# Build the table the radar chart reads: one row per account, a leading 'group'
# column with the account name, then one column per category. Because the
# triples are account-major, reshaping to (accounts, categories) lines the
# scores up with those rows and columns.
df = pd.DataFrame(
    person_category_scores.reshape(len(radar_accounts), len(radar_categories)),
    columns=radar_categories,
)
df.insert(0, 'group', radar_accounts)

print(df)

In [None]:
# ------- PART 1: Create background

# One axis per category: every column of df except the leading 'group' column
categories = list(df)[1:]
category_num = len(categories)

# Angle of each axis: the circle divided evenly between the categories. The
# first angle is repeated at the end so each plotted polygon closes on itself.
angles = [n / float(category_num) * 2 * math.pi for n in range(category_num)]
angles += angles[:1]

# Initialise the spider plot
ax = plt.subplot(111, polar=True)

# Put the first axis on top and lay the remaining axes out clockwise
ax.set_theta_offset(math.pi / 2)
ax.set_theta_direction(-1)

# Draw one axis per variable + add labels
plt.xticks(angles[:-1], categories)

# Draw ylabels; the radial limit is the largest score rounded up to the next integer
ax.set_rlabel_position(0)
plt.yticks([1, 2, 3], ["1", "2", "3"], color="grey", size=7)
plt.ylim(0, int(max(person_category_scores)) + 1)

# ------- PART 2: Add plots

# One polygon per row of df. Each legend entry is taken from the row's 'group'
# value, and the fill reuses the colour of its outline, so the legend, outline
# and fill of a series always agree.
# NOTE: the chart gets hard to read with more than about three rows.
for _, row in df.iterrows():
    values = row.drop('group').values.flatten().tolist()
    values += values[:1]
    outline, = ax.plot(angles, values, linewidth=1, linestyle='solid', label=row['group'])
    ax.fill(angles, values, color=outline.get_color(), alpha=0.1)

# Add legend
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

# Show the graph
plt.show()