In [1]:
import os
import warnings
from getpass import getpass

import numpy as np
import pandas as pd

from graphdatascience import GraphDataScience

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer

from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.problem_transform import LabelPowerset

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 100)


In [2]:
# Connection settings are read from the environment so that no secret is stored
# in the notebook. NEO4J_URI and NEO4J_USER fall back to the defaults below;
# the password has no default and is prompted for when NEO4J_PASSWORD is unset.
host = os.environ.get("NEO4J_URI", "bolt://3.84.61.96:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Client used for every Cypher query and GDS procedure call in this notebook.
gds = GraphDataScience(host, auth=(user, password))

In [3]:
# Project an in-memory graph named 'tags' containing Article and List nodes,
# connected by IN_LIST relationships that are treated as undirected.
G, metadata = gds.graph.project(
    'tags',
    ['Article', 'List'],
    {'IN_LIST': {'orientation': 'UNDIRECTED'}},
)

In [4]:
%%time
# Write a 256-dimensional FastRP embedding to each projected node as the
# `fastrp` property. A fixed randomSeed keeps the random projection, and so the
# features used for training below, reproducible across runs.
gds.fastRP.write(G, writeProperty='fastrp', embeddingDimension=256, randomSeed=42)

CPU times: user 31.9 ms, sys: 14.8 ms, total: 46.7 ms
Wall time: 4.32 s


nodeCount                                                                                                              59304
nodePropertiesWritten                                                                                                  59304
preProcessingMillis                                                                                                        0
computeMillis                                                                                                            319
writeMillis                                                                                                             3370
configuration            {'writeConcurrency': 4, 'nodeSelfInfluence': 0, 'propertyRatio': 0.0, 'concurrency': 4, 'jobId':...
Name: 0, dtype: object

In [5]:
# Drop the 'tags' in-memory projection; the embeddings were already written
# back to the nodes as the `fastrp` property.
G.drop()

graphName                                                                                                               tags
database                                                                                                               neo4j
memoryUsage                                                                                                                 
sizeInBytes                                                                                                               -1
nodeCount                                                                                                              59304
relationshipCount                                                                                                     174590
configuration            {'relationshipProjection': {'IN_LIST': {'orientation': 'UNDIRECTED', 'indexInverse': False, 'agg...
density                                                                                                              0.00005


In [6]:
# Add the :Target label to every Tag node that has more than
# MIN_INCOMING_RELS incoming relationships (of any type). Only :Target tags are
# used as classification labels when the training data is fetched.
MIN_INCOMING_RELS = 100
gds.run_cypher("""
MATCH (t:Tag)
WHERE count{(t)<--()} > $min_incoming
SET t:Target
""", params={"min_incoming": MIN_INCOMING_RELS})

In [7]:
# One row per article that has at least one :Target tag: its URL, its FastRP
# embedding and the list of :Target tag names attached to it.
TAGGED_ARTICLES_QUERY = """
MATCH (a:Article)-[:HAS_TAG]->(tag:Target)
RETURN a.url AS article, a.fastrp AS embedding, collect(tag.name) AS tags
"""
data = gds.run_cypher(TAGGED_ARTICLES_QUERY)

In [8]:
# Preview the first rows of the article / embedding / tags frame.
data.head()

Unnamed: 0,article,embedding,tags
0,https://medium.com/towards-data-science/the-data-scientist-of-the-future-according-to-google-4d8...,"[0.0003049038350582123, 0.04504062235355377, 0.0059824008494615555, -0.012860724702477455, 0.057...","[machine-learning, data-science, technology, artificial-intelligence, data]"
1,https://medium.com/towards-artificial-intelligence/how-i-prepared-for-my-amazon-data-scientist-i...,"[0.1175345927476883, 0.03312401473522186, -0.1383334845304489, 0.03454393148422241, 0.1424379050...","[machine-learning, data-science, interview]"
2,https://medium.com/towards-data-science/the-concept-of-transformers-and-training-a-transformers-...,"[0.07622800022363663, 0.050666384398937225, -0.06187001243233681, 0.04393763095140457, -0.103499...","[machine-learning, deep-learning, artificial-intelligence, nlp, naturallanguageprocessing]"
3,https://medium.com/@odsc/assessment-metrics-for-clustering-algorithms-4a902e00d92d,"[0.010879196226596832, 0.038660190999507904, -0.048736028373241425, -0.0357050783932209, -0.0263...","[machine-learning, data-science, technology, data, clustering]"
4,https://medium.com/@sebastian-orbell/discrete-latent-spaces-in-deep-generative-models-1c910e3b3907,"[0.03544364497065544, 0.058862850069999695, 0.05388859659433365, -0.059305962175130844, -0.03277...","[machine-learning, deep-learning]"


In [9]:
# Turn each article's tag list into a 0/1 indicator vector with one position
# per distinct tag; `mlb` is kept so predictions can be decoded back to names.
mlb = MultiLabelBinarizer()
tags_mlb = mlb.fit_transform(data['tags'])
# Store one indicator row per article alongside the original columns.
data['target'] = [indicator_row for indicator_row in tags_mlb]
data.head()

Unnamed: 0,article,embedding,tags,target
0,https://medium.com/towards-data-science/the-data-scientist-of-the-future-according-to-google-4d8...,"[0.0003049038350582123, 0.04504062235355377, 0.0059824008494615555, -0.012860724702477455, 0.057...","[machine-learning, data-science, technology, artificial-intelligence, data]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,https://medium.com/towards-artificial-intelligence/how-i-prepared-for-my-amazon-data-scientist-i...,"[0.1175345927476883, 0.03312401473522186, -0.1383334845304489, 0.03454393148422241, 0.1424379050...","[machine-learning, data-science, interview]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,https://medium.com/towards-data-science/the-concept-of-transformers-and-training-a-transformers-...,"[0.07622800022363663, 0.050666384398937225, -0.06187001243233681, 0.04393763095140457, -0.103499...","[machine-learning, deep-learning, artificial-intelligence, nlp, naturallanguageprocessing]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,https://medium.com/@odsc/assessment-metrics-for-clustering-algorithms-4a902e00d92d,"[0.010879196226596832, 0.038660190999507904, -0.048736028373241425, -0.0357050783932209, -0.0263...","[machine-learning, data-science, technology, data, clustering]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,..."
4,https://medium.com/@sebastian-orbell/discrete-latent-spaces-in-deep-generative-models-1c910e3b3907,"[0.03544364497065544, 0.058862850069999695, 0.05388859659433365, -0.059305962175130844, -0.03277...","[machine-learning, deep-learning]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [10]:
# Stack the per-article lists into 2-D arrays: one row per article.
X = np.array(data['embedding'].to_list())
y = np.array(data['target'].to_list())

# iterative_train_test_split exposes no random_state argument, so seed NumPy's
# global RNG right before the call to make the stratified split repeatable.
# NOTE(review): assumes the stratifier draws from the global NumPy RNG — confirm
# against the installed scikit-multilearn version.
np.random.seed(42)
x_train, y_train, x_test, y_test = iterative_train_test_split(X, y, test_size = 0.2)

print(x_train.shape)
print(y_train.shape)

(20280, 256)
(20280, 161)


In [11]:
%%time
# Label-powerset multi-label classifier wrapping a logistic regression.
# Background: https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff
classifier = LabelPowerset(classifier=LogisticRegression())
# Fit on the training split.
classifier.fit(x_train, y_train)
# Predict label indicators for the held-out split.
predictions = classifier.predict(x_test)


CPU times: user 18min 9s, sys: 6min 6s, total: 24min 16s
Wall time: 10min 38s


In [12]:
# For multi-label targets accuracy_score is exact-match (subset) accuracy:
# a sample counts only when its whole predicted label set equals the true one.
test_accuracy = accuracy_score(y_test, predictions)
print(f'Test accuracy is {test_accuracy}')

Test accuracy is 0.04196519959058342


In [13]:
#https://www.kaggle.com/code/kmkarakaya/multi-label-model-evaluation
# Tag names in the column order of the indicator matrices.
classes = mlb.classes_

# Densify the predictions once instead of once per label.
y_pred = predictions.toarray()
# average='macro' is the unweighted mean of the per-label precisions, i.e. the
# same quantity as summing precision per column and dividing by the label count.
macro_precision = precision_score(y_test, y_pred, average='macro')
print("Macro Precision: {:.2f}".format(macro_precision))

Macro Precision: 0.38


In [14]:
# Densify the predictions once instead of once per label.
y_pred = predictions.toarray()
# average='weighted' averages the per-label precisions weighted by each label's
# support (number of true positives-labelled samples in y_test), matching the
# manual sum(p * support) / sum(support) computation.
weighted_precision = precision_score(y_test, y_pred, average='weighted')
print("Weighted Precision: {:.2f}".format(weighted_precision))

Weighted Precision: 0.41


In [15]:
# Up to 15 articles that have no HAS_TAG relationship at all, with their title
# and FastRP embedding; these are the articles to predict tags for.
UNTAGGED_ARTICLES_QUERY = """
MATCH (a:Article)
WHERE NOT EXISTS {(a)-[:HAS_TAG]->()}
RETURN a.title AS title, a.fastrp AS embedding
LIMIT 15
"""
preds = gds.run_cypher(UNTAGGED_ARTICLES_QUERY)

In [16]:
# Predict label indicators for the untagged articles from their embeddings.
new_preds = classifier.predict(np.array(preds['embedding'].to_list()))
# Decode the whole indicator matrix in one call (one tuple of tag names per
# row) instead of invoking inverse_transform separately for every row.
preds['tags'] = [list(tag_names) for tag_names in mlb.inverse_transform(new_preds)]
preds

Unnamed: 0,title,embedding,tags
0,Scaling Applications with Constant Work Pattern,"[0.0002030273899435997, 0.09359394013881683, -0.05615721270442009, -0.10241050273180008, -0.1346...",[distributed-systems]
1,VQ-Diffusion: Microsoft’s New Text-To-Image AI Tool,"[-0.06371483206748962, -0.020012132823467255, 0.0339442677795887, -0.004839659668505192, -0.1673...",[machine-learning]
2,My Learnings on Snowflake,"[0.053007572889328, -0.12442155182361603, -0.021111629903316498, 0.13774339854717255, 0.05044498...",[data]
3,Google aims at Excel with new Big Data Features,"[0.047905296087265015, 0.16040661931037903, 0.10157479345798492, 0.005116358399391174, 0.0300154...",[machine-learning]
4,Resolving conflicts when merging release to develop,"[-0.0888320580124855, 0.058204032480716705, 0.2668755352497101, -0.1405196338891983, 0.135354667...",[python]
5,Euler’s method simply explained,"[-0.057137422263622284, 0.1729966551065445, 0.03504607453942299, -0.06592682003974915, 0.0192024...",[python]
6,Gold Plating,"[0.1429712325334549, 0.03575993701815605, 0.0715198740363121, 0.1429712325334549, 0.035691417753...",[kubernetes]
7,Don’t Choke Like a Monkey In Your Next Business Pitch,"[0.0604727528989315, -0.12482210993766785, 0.0013470649719238281, 0.049697037786245346, -0.03570...",[python]
8,You Need This Jacket In Your Winter Wardrobe,"[-0.15221984684467316, 0.15376326441764832, 0.015957683324813843, -0.26392364501953125, -0.13626...",[python]
9,Spring Security (Part 1),"[0.011850538663566113, 0.012177404016256332, -0.1848328411579132, -0.03200909122824669, 0.079235...",[spring-boot]
