In [1]:
!pip install graphdatascience scikit-multilearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import getpass
import os
import warnings

import numpy as np
import pandas as pd

from graphdatascience import GraphDataScience

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer

from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.problem_transform import LabelPowerset

# Silence all warnings for the rest of the session (no category or module filter is given).
warnings.filterwarnings('ignore')
# Let pandas show up to 100 characters per column value when rendering frames.
pd.set_option('display.max_colwidth', 100)


In [3]:
# Connection settings come from the environment so that no secret is stored in the notebook.
# Host and user fall back to the values this notebook was written against.
host = os.environ.get("NEO4J_URI", "bolt://3.84.61.96:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
# Prompt interactively when NEO4J_PASSWORD is not set.
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

gds = GraphDataScience(host, auth=(user, password))

In [4]:
# Project Article and List nodes, with IN_LIST relationships treated as undirected,
# into a graph named 'tags'.
node_labels = ['Article', 'List']
relationship_projection = {'IN_LIST': {'orientation': 'UNDIRECTED'}}
G, metadata = gds.graph.project('tags', node_labels, relationship_projection)

In [5]:
# Write a 256-dimensional FastRP embedding to the `fastrp` node property.
# A fixed randomSeed keeps the embeddings, and the models trained on them, stable across re-runs.
gds.fastRP.write(G, writeProperty='fastrp', embeddingDimension=256, randomSeed=42)

nodeCount                                                                                                              59304
nodePropertiesWritten                                                                                                  59304
preProcessingMillis                                                                                                        0
computeMillis                                                                                                            232
writeMillis                                                                                                             4297
configuration            {'writeConcurrency': 4, 'nodeSelfInfluence': 0, 'propertyRatio': 0.0, 'concurrency': 4, 'jobId':...
Name: 0, dtype: object

In [6]:
# Drop the 'tags' projection; the embeddings were already written to the `fastrp` property above.
G.drop()

graphName                                                                                                               tags
database                                                                                                               neo4j
memoryUsage                                                                                                                 
sizeInBytes                                                                                                               -1
nodeCount                                                                                                              59304
relationshipCount                                                                                                     174590
configuration            {'relationshipProjection': {'IN_LIST': {'orientation': 'UNDIRECTED', 'indexInverse': False, 'agg...
density                                                                                                              0.00005


In [7]:
# Add the Target label to every Tag node that has more than 100 incoming relationships
# (of any type, since the pattern does not restrict the relationship type).
mark_targets_query = """
MATCH (t:Tag)
WHERE count{(t)<--()} > 100
SET t:Target
"""
gds.run_cypher(mark_targets_query)

In [8]:
# Load the tagged articles: url, both embedding vectors, and the collected names of their
# Target tags. Articles whose `embedding` property is missing are excluded by the query.
articles_query = """
MATCH (a:Article)-[:HAS_TAG]->(tag:Target)
WHERE a.embedding IS NOT NULL
RETURN a.url AS article, a.fastrp AS fastrp, collect(tag.name) AS tags, a.embedding AS openai
"""
data = gds.run_cypher(articles_query)

In [9]:
# Fit a MultiLabelBinarizer on the tag lists and keep the resulting indicator matrix.
mlb = MultiLabelBinarizer()
tags_mlb = mlb.fit_transform(data['tags'])
# Store one indicator row per article in the `target` column.
data['target'] = [indicator_row for indicator_row in tags_mlb]
data.head()

Unnamed: 0,article,fastrp,tags,openai,target
0,https://medium.com/towards-data-science/the-data-scientist-of-the-future-according-to-google-4d8...,"[0.17759761214256287, -0.07900823652744293, -0.04278559237718582, -0.008180446922779083, -0.1974...","[machine-learning, data-science, technology, artificial-intelligence, data]","[-0.026673702523112297, -0.011067028157413006, 0.012379064224660397, -0.015823161229491234, -0.0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,https://medium.com/gitconnected/setup-jupyter-notebook-on-your-local-machine-and-use-it-from-any...,"[-0.23205816745758057, -0.05551952123641968, -0.01987866684794426, 0.12418968975543976, -0.10935...","[machine-learning, data-science, artificial-intelligence, mlops, productivity]","[-0.027442459017038345, 0.001837967080064118, 0.04122466966509819, -0.0338524729013443, 0.000622...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,https://medium.com/towards-data-science/a-step-by-step-guide-to-feature-engineering-for-multivar...,"[0.023249246180057526, -0.00963681936264038, -0.05827200412750244, -0.08264008164405823, -0.0830...","[machine-learning, data-science, time-series-forecasting, artificial-intelligence, time-series-a...","[-0.007429220248013735, -0.0026638966519385576, 0.028344519436359406, -0.029822446405887604, -0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,https://medium.com/towards-data-science/kubeflow-is-not-for-dummies-414d8977158a,"[0.05011122673749924, -0.0743846446275711, -0.09269026666879654, 0.02038072794675827, -0.0973558...","[machine-learning, data-science, software-development, aws, kubernetes]","[-0.00787402130663395, -0.006389803718775511, 0.02791445143520832, -0.03439484164118767, -0.0116...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,https://medium.com/@alxmamaev/generating-music-with-ai-or-transformers-go-brrrr-3a3ac5a04126,"[0.10603606700897217, 0.020510291680693626, -0.06950496137142181, -0.11833038926124573, 0.020202...","[machine-learning, transformers, deep-learning, nlp]","[-0.007013104856014252, -0.030339157208800316, -0.00031991058494895697, -0.019291024655103683, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [10]:
def get_macro_precision(classes, y_true, y_pred):
    """Return the unweighted mean of the per-class precision scores.

    Args:
        classes: Sequence of class labels. Only its length is used: column ``i``
            of ``y_true`` and ``y_pred`` is scored for each index ``i``.
        y_true: 2-D indicator array of ground-truth labels.
        y_pred: Predicted indicators, either a dense 2-D array or an object
            exposing ``toarray()`` (for example a SciPy sparse matrix).

    Returns:
        The mean of ``precision_score`` over the class columns, or ``0.0`` when
        ``classes`` is empty.
    """
    n_classes = len(classes)
    if n_classes == 0:
        return 0.0
    # Densify once up front instead of once per class inside the loop.
    y_pred_dense = y_pred.toarray() if hasattr(y_pred, "toarray") else np.asarray(y_pred)
    per_class = [
        precision_score(y_true[:, i], y_pred_dense[:, i]) for i in range(n_classes)
    ]
    return sum(per_class) / n_classes

def get_weighted_precision(classes, y_true, y_pred):
    """Return the support-weighted mean of the per-class precision scores.

    Each class's precision is weighted by its support, i.e. the number of rows
    of ``y_true`` in which that class is set to 1.

    Args:
        classes: Sequence of class labels. Only its length is used: column ``i``
            of ``y_true`` and ``y_pred`` is scored for each index ``i``.
        y_true: 2-D indicator array of ground-truth labels.
        y_pred: Predicted indicators, either a dense 2-D array or an object
            exposing ``toarray()`` (for example a SciPy sparse matrix).

    Returns:
        ``sum(precision_i * support_i) / sum(support_i)``, or ``0.0`` when the
        total support is zero (including when ``classes`` is empty).
    """
    # Densify once up front instead of once per class inside the loop.
    y_pred_dense = y_pred.toarray() if hasattr(y_pred, "toarray") else np.asarray(y_pred)
    total_precision = 0
    total_support = 0
    for i in range(len(classes)):
        p = precision_score(y_true[:, i], y_pred_dense[:, i])
        # Support is counted on the y_true argument so the weights always belong to
        # the same rows that are being scored, never to a module-level array.
        support = (y_true[:, i] == 1).sum()
        total_support += support
        total_precision += p * support
    if total_support == 0:
        return 0.0
    return total_precision / total_support

In [11]:
# Module-level train/test split (test_size=0.2) over both embedding columns.
# NOTE(review): train_and_evaluate() performs its own split of the same data, so the arrays
# created here are a separate partition, not the one the models are trained and scored on.
X = data[['fastrp','openai']].values
y = np.array(data['target'].to_list())
x_train_all, y_train, x_test_all, y_test = iterative_train_test_split(X, y, test_size = 0.2)

In [12]:
def train_and_evaluate(df, input_columns):
    """Train one LabelPowerset(LogisticRegression) model per input column and keep the best.

    A single train/test split is made and shared by all columns, so every model
    is scored on the same held-out rows. Accuracy, macro precision and weighted
    precision are printed for each model.

    Args:
        df: DataFrame holding one feature-vector column per entry of
            ``input_columns`` and a ``target`` column of multi-label indicator rows.
        input_columns: Names of the feature columns to compare.

    Returns:
        ``(best_classifier, best_input)``: the fitted classifier with the highest
        weighted precision and the name of the column it was trained on.
        ``(None, "")`` if ``input_columns`` is empty.
    """
    best_classifier = None
    best_input = ""
    if len(input_columns) == 0:
        return best_classifier, best_input

    # Start below any attainable score so the first model is always recorded,
    # even when its weighted precision is exactly 0.
    max_weighted_precision = float("-inf")

    # Single split, read from the frame that was passed in.
    features = df[input_columns].values
    targets = np.array(df['target'].to_list())
    x_train_all, y_train, x_test_all, y_test = iterative_train_test_split(features, targets, test_size=0.2)

    # Train a model for each input option
    for i, input_column in enumerate(input_columns):
        print(f"Training a model based on {input_column} column")
        # Each row of the split holds one vector per input column; take the i-th one.
        x_train = np.array([x[i] for x in x_train_all])
        x_test = np.array([x[i] for x in x_test_all])

        # train
        classifier = LabelPowerset(LogisticRegression())
        classifier.fit(x_train, y_train)
        # predict
        predictions = classifier.predict(x_test)
        print('Test accuracy is {}'.format(accuracy_score(y_test, predictions)))
        print("Macro Precision: {:.2f}".format(get_macro_precision(mlb.classes_, y_test, predictions)))
        weighted_precision = get_weighted_precision(mlb.classes_, y_test, predictions)
        print("Weighted Precision: {:.2f}".format(weighted_precision))
        if weighted_precision > max_weighted_precision:
            max_weighted_precision = weighted_precision
            best_classifier = classifier
            best_input = input_column

    return best_classifier, best_input

In [13]:
%%time
# Compare the `openai` and `fastrp` columns as model inputs; keep the better classifier
# together with the name of the column it was trained on.
classifier, best_input = train_and_evaluate(data, ['openai','fastrp'])

Training a model based on openai column
Test accuracy is 0.023135033050047216
Macro Precision: 0.09
Weighted Precision: 0.28
Training a model based on fastrp column
Test accuracy is 0.03588290840415486
Macro Precision: 0.32
Weighted Precision: 0.36
CPU times: user 19min 40s, sys: 1min 40s, total: 21min 20s
Wall time: 12min 50s


In [14]:
# Fetch up to 15 articles that have no outgoing HAS_TAG relationship, with title and both embeddings.
untagged_query = """
MATCH (a:Article)
WHERE NOT EXISTS {(a)-[:HAS_TAG]->()}
RETURN a.title AS title, a.fastrp AS fastrp, a.embedding AS openai
LIMIT 15
"""
example = gds.run_cypher(untagged_query)

In [16]:
# Predict tags for the untagged articles from the embedding column that scored best.
tags_predicted = classifier.predict(np.array(example[best_input].to_list()))
# Decode the whole indicator matrix in one call instead of once per row;
# inverse_transform yields one tuple of tag names per row.
example['tags'] = [list(tags) for tags in mlb.inverse_transform(tags_predicted)]
example[['title', 'tags']]

Unnamed: 0,title,tags
0,Scaling Applications with Constant Work Pattern,[distributed-systems]
1,VQ-Diffusion: Microsoft’s New Text-To-Image AI Tool,"[artificial-intelligence, data-science, machine-learning, programming, python]"
2,My Learnings on Snowflake,[data]
3,Google aims at Excel with new Big Data Features,[data-science]
4,Resolving conflicts when merging release to develop,[data-science]
5,Euler’s method simply explained,[machine-learning]
6,Gold Plating,[python]
7,Don’t Choke Like a Monkey In Your Next Business Pitch,[python]
8,You Need This Jacket In Your Winter Wardrobe,"[data-science, machine-learning]"
9,Spring Security (Part 1),[spring-boot]
