In [79]:
import pandas as pd
import matplotlib.pyplot as plt

In [80]:
from neo4j import GraphDatabase

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "bolt://localhost:7687"
# NOTE(review): credentials are hard-coded in the notebook; move them to the
# environment (or a prompt) before sharing or committing this file.
AUTH = ("neo4j", "1234qwer!")

# Keep the driver open for the lifetime of the notebook. Wrapping it in
# `with ... as driver:` closes it as soon as the block exits, while the
# session created just below and the helper functions in later cells still
# need a live driver.
driver = GraphDatabase.driver(URI, auth=AUTH)
driver.verify_connectivity()

# Long-lived session used by the ad-hoc query cells further down.
# Despite its name, `tx` is a Session object, not a transaction.
tx = driver.session()
tx

  # Remove the CWD from sys.path while we load stuff.


<neo4j._sync.work.session.Session at 0x7fdfb001ca90>

In [81]:
def create_external_source_resource(tx, node):
    """
        Create ExternalSource Node

        Runs one MERGE for a node labelled ``Resource:ExternalSource``. The
        node's ``rid`` and ``uri`` are both set from ``node["uri"]`` and all
        three counters start at 0.

        :param tx: object exposing ``run(query, **parameters)``
        :param node: mapping providing "uri", "created_at", "description" and "cid"
        :return: None
    """
    # logger.info(
    #     "Creating ExternalSource resource '%s'" % node["id"])
    tx.run(
        """MERGE (c:Resource:ExternalSource {rid: $rid, uri: $uri, 
        publish_time: $created_at, cid: $cid, description: $description, helpful_count: $helpful_count,
        not_helpful_count: $not_helpful_count, bookmarked_count: $bookmarked_count})""",
        rid=node["uri"],
        uri=node["uri"],
        # The keyword must match the $created_at placeholder in the query;
        # passing it under another name leaves the placeholder unbound.
        created_at=node["created_at"],
        description=node["description"],
        cid=node["cid"],
        helpful_count=0,
        not_helpful_count=0,
        # Property name aligned with `bookmarked_count`, which is what the
        # read query and the backfill cell in this notebook use.
        bookmarked_count=0
        )

def cro_get_ratings(resources_rid: list):
    """
        Getting List of Rating Containing Resources RID

        Looks up ``Rating_CRO`` nodes whose ``resource_rid`` property is one
        of the given resource rids.

        :param resources_rid: list of resource rid values
        :return: list of dicts with keys node_id, user_id, cid, value and
                 resource_rid (empty list when nothing matches)
    """
    # logger.info("Getting List of Rating Containing Resources RID")

    # Nothing to look up: skip the round trip to the database.
    if not resources_rid:
        return []

    with driver.session() as session:
        # Filter on the rating's resource_rid property. The argument holds
        # resource rids, so comparing it with the internal node id ID(r)
        # can never match.
        return session.run(
            """
            MATCH (r:Rating_CRO)
            WHERE r.resource_rid IN $resource_rids
            RETURN ID(r) as node_id, r.user_id as user_id, r.cid as cid, r.value as value, r.resource_rid as resource_rid
            """,
            resource_rids=list(resources_rid)
        ).data()

def cro_get_resources(concepts_cro: list):
    """
        Getting List of Resources Containing Concept_CRO

        Fetches every ``Resource`` linked by ``CONTAINS_CRO`` to one of the
        given ``Concept_CRO`` nodes and normalizes the numeric fields.

        :param concepts_cro: list of dicts; only the "node_id" entry of each
                             dict is read
        :return: list of dicts, one per matched row, keyed by the aliases of
                 the RETURN clause. ``helpful_count``, ``not_helpful_count``
                 and ``bookmarked_count`` are ints, ``similarity_score`` is a
                 float and, for resources labelled "Video", ``views`` is an
                 int. Missing (None) values of these fields become 0.
    """
    # logger.info("Getting List of Resources Containing Concept_CRO")

    node_ids = [node["node_id"] for node in concepts_cro]
    with driver.session() as session:
        result = session.run(
            """
            MATCH p=(a:Resource)-[r:CONTAINS_CRO]->(b:Concept_CRO)
            WHERE ID(b) IN $node_ids
            RETURN  LABELS(a) as labels, ID(a) as id, a.rid as rid, a.title as title, a.text as text,
                    a.thumbnail as thumbnail, a.abstract as abstract, a.post_date as post_date, 
                    a.author_image_url as author_image_url, a.author_name as author_name,
                    a.keyphrases as keyphrases, a.description as description, a.description_full as description_full,
                    a.views as views, a.publish_time as publish_time, a.uri as uri, a.duration as duration,
                    a.similarity_score as similarity_score, a.helpful_count as helpful_count, a.not_helpful_count as not_helpful_count,
                    a.bookmarked_count as bookmarked_count
            """,
            node_ids=node_ids
        ).data()

    # Coerce the numeric fields in place so callers get consistent types.
    # A property that is absent on the node comes back as None; treat it as
    # 0 instead of letting int(None) / float(None) raise.
    for resource in result:
        for counter in ("helpful_count", "not_helpful_count", "bookmarked_count"):
            resource[counter] = int(resource[counter] or 0)
        resource["similarity_score"] = float(resource["similarity_score"] or 0.0)

        # Only videos get their view count converted; for other labels the
        # value is passed through untouched.
        if "Video" in (resource["labels"] or []):
            resource["views"] = int(resource["views"] or 0)

    return result


In [82]:
# Seed input for the lookup; cro_get_resources only reads "node_id" from each entry.
concepts_cro = [
    {"node_id": 2, "cid": "195462609980330371"},
]
resources = cro_get_resources(concepts_cro=concepts_cro)



In [83]:
# Number of rows returned by cro_get_resources for the seed concept(s).
len(resources)
# resources

32

In [84]:
# Gather the rid of every fetched resource, then fetch the ratings for them.
resources_rid = [resource["rid"] for resource in resources]
ratings = cro_get_ratings(resources_rid)
len(ratings)



0

In [85]:
import math
import scipy.stats as st

from datetime import datetime
from dateutil.parser import parse as date_parse


def wilson_lower_bound(up, down, confidence=0.95):
    """
        Lower bound of the Wilson score interval for a up/down vote tally.

        :param up: No of positive ratings
        :param down: No of negative ratings
        :param confidence: Confidence level of the interval, 0.95 by default
        :return: Wilson lower bound score; 0.0 when there are no ratings at all
    """
    total = up + down
    if total == 0:
        return 0.0

    # Two-sided critical value of the standard normal distribution.
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    z_squared = z * z
    phat = 1.0 * up / total  # observed share of positive ratings

    centre = phat + z_squared / (2 * total)
    margin = z * math.sqrt((phat * (1 - phat) + z_squared / (4 * total)) / total)
    return (centre - margin) / (1 + z_squared / total)

def normalized_score_date(date_str: str, max: datetime):
    """
        Normalization score of a creation date.

        The score is the number of whole days between a fixed reference
        instant and the parsed date, divided by the number of whole days
        between that reference instant and ``max``.

        :param date_str: date string understood by ``date_parse``; any
                         timezone info is dropped after parsing
        :param max: upper end of the range
        :return: ratio of the two day counts
    """
    parsed = date_parse(date_str).replace(tzinfo=None)

    # First video posted on Youtube
    reference = datetime(2005, 4, 23, 8, 31, 52)

    elapsed_days = (parsed - reference).days
    range_days = (max - reference).days
    return elapsed_days / range_days

def normalize_factor(value: int, min_value: int, max_value: int):
    """
        Min-max scale ``value`` against the range [min_value, max_value].

        :return: (value - min_value) / (max_value - min_value), or 0 when
                 the range has zero width
    """
    span = max_value - min_value
    return (value - min_value) / span if span != 0 else 0

def normalize_factor_2(value: int, min_value: int, max_value: int):
    """
        Scale ``value`` by ``max_value`` only; ``min_value`` is accepted but
        not used.

        :return: value / max_value, or 0 when max_value is 0
    """
    if max_value != 0:
        return value / max_value
    return 0

def calculate_factors_weights(category: int, resources: list, weights: dict = None, light=False):
    """
        Score resources with a weighted sum of extra features and sort them,
        best score first.

        category 1 => Video (default weights used when ``weights`` is None)
            Popularity | Views                 -> "views"            0.4
            Rating: Likes / Dislikes (Wilson)  -> "rating"           0.2
            Creation Date (content freshness)  -> "creation_date"    0.2
            Similarities Scores                -> "similarity_score" 0.1
            Bookmark                           -> "bookmark"         0.1

        category 2 => Article (default weights used when ``weights`` is None)
            Rating: Likes / Dislikes (Wilson)  -> "rating"           0.4
            Similarities Scores                -> "similarity_score" 0.3
            Bookmark                           -> "bookmark"         0.3

        Views and bookmark counts are min-max normalized over ``resources``
        before weighting, so every factor is on a comparable scale. A weight
        key that is absent from ``weights`` counts as 0.

        Not implemented here: the "certified account" factor and prioritising
        resources that have a rating related to DNU_modified (cid).

        :param category: 1 for videos, 2 for articles
        :param resources: list of resource dicts; each gets a
                          "composite_score" entry (videos additionally get
                          "composite_score_product") and the list is sorted
                          in place
        :param weights: mapping of factor name to weight, or None for the
                        defaults listed above
        :param light: currently unused
        :return: the same ``resources`` list, sorted by "composite_score"
                 in descending order
    """
    # Nothing to score; min()/max() below would raise on an empty list.
    if not resources:
        return resources

    if weights is None:
        default_weights = {
            1: {"views": 0.4, "rating": 0.2, "creation_date": 0.2, "similarity_score": 0.1, "bookmark": 0.1},
            2: {"rating": 0.4, "similarity_score": 0.3, "bookmark": 0.3},
        }
        weights = default_weights.get(category, {})

    weight_views = weights.get("views", 0.0)
    weight_rating = weights.get("rating", 0.0)
    weight_creation_date = weights.get("creation_date", 0.0)
    weight_similarity_score = weights.get("similarity_score", 0.0)
    weight_bookmark = weights.get("bookmark", 0.0)

    bookmark_counts = [int(resource["bookmarked_count"]) for resource in resources]
    min_bookmarked, max_bookmarked = min(bookmark_counts), max(bookmark_counts)

    if category == 1:
        now = datetime.now()
        view_counts = [int(resource["views"]) for resource in resources]
        min_views, max_views = min(view_counts), max(view_counts)

        for resource in resources:
            rating_score = wilson_lower_bound(up=resource["helpful_count"], down=resource["not_helpful_count"])
            normalized_score = normalized_score_date(date_str=resource["publish_time"], max=now)
            views_score = normalize_factor(value=int(resource["views"]), min_value=min_views, max_value=max_views)
            bookmarked_score = normalize_factor(value=int(resource["bookmarked_count"]), min_value=min_bookmarked, max_value=max_bookmarked)

            weighted_factors = (
                views_score * weight_views,
                rating_score * weight_rating,
                normalized_score * weight_creation_date,
                resource["similarity_score"] * weight_similarity_score,
                bookmarked_score * weight_bookmark,
            )

            resource["composite_score"] = sum(weighted_factors)

            # Experimental multiplicative variant; it is 0 as soon as any
            # single factor is 0.
            product = 1.0
            for factor in weighted_factors:
                product *= factor
            resource["composite_score_product"] = product

    elif category == 2:
        for resource in resources:
            rating_score = wilson_lower_bound(up=resource["helpful_count"], down=resource["not_helpful_count"])
            # Use the normalized bookmark score, as the video branch does.
            # A raw count would dwarf the other two factors, which are
            # bounded by 1.
            bookmarked_score = normalize_factor(value=int(resource["bookmarked_count"]), min_value=min_bookmarked, max_value=max_bookmarked)
            resource["composite_score"] = (rating_score * weight_rating) \
                                        + (resource["similarity_score"] * weight_similarity_score) \
                                        + (bookmarked_score * weight_bookmark)

    # TODO: category 3 (rating + bookmark + creation date) is not implemented.

    # sort by composite score value
    resources.sort(key=lambda x: x["composite_score"], reverse=True)
    return resources

def cro_sort_result(resources: list, weights: dict = None, ratings: list = None):
    """
        Sorting Logic for Resources

        Splits ``resources`` into videos and articles by label and scores
        and sorts each group with ``calculate_factors_weights``.

        :param resources: list of resource dicts, each with a "labels" list
        :param weights: {"video": {...}, "article": {...}} factor weights;
                        when None, the defaults documented on
                        ``calculate_factors_weights`` are used
        :param ratings: currently unused (see TODO below)
        :return: {"articles": [...], "videos": [...]}
    """
    if weights is None:
        weights = {
            "video": {"views": 0.4, "rating": 0.2, "creation_date": 0.2, "similarity_score": 0.1, "bookmark": 0.1},
            "article": {"rating": 0.4, "similarity_score": 0.3, "bookmark": 0.3},
        }

    # video items
    resources_videos = [resource for resource in resources if "Video" in resource["labels"]]
    # Only score non-empty groups: there is nothing to rank otherwise, and
    # the scorer computes min/max over the group.
    if resources_videos:
        resources_videos = calculate_factors_weights(category=1, resources=resources_videos, weights=weights["video"])

    # articles items
    resources_articles = [resource for resource in resources if "Article" in resource["labels"]]
    if resources_articles:
        resources_articles = calculate_factors_weights(category=2, resources=resources_articles, weights=weights["article"])

    # TODO: prioritise resources having a Rating related to DNU_modified (cid);
    # `ratings` is accepted for that purpose but not used yet.

    return {
        "articles": resources_articles,
        "videos": resources_videos
    }

def cro_sort_result_light(params: dict, with_ratings=False):
    """
        Fetch the resources for the given concepts and order them by the
        requested criteria.

        params: {"similarity_score": True, "most_recent": False, "popularity": True, "concepts_cids": ["sdsds323", "23asdf23"]}
        similarity_score: sort by similarity score, highest first
        most_recent: creation date, newest first
        popularity: views, highest first
        weights (optional): forwarded to cro_sort_result when the composite
                            ranking is used

        When similarity_score is off and both most_recent and popularity are
        on, ranking is delegated to cro_sort_result (composite score).
        Otherwise the enabled sorts run in the order above; each sort is
        stable, so the last enabled criterion is the primary key.

        NOTE(review): "concepts_cids" is passed straight to cro_get_resources,
        which reads node["node_id"] from every entry — the plain cid strings
        in the example above would not work there; confirm the expected shape.

        :param params: flags and concept list as described above
        :param with_ratings: currently unused
        :return: dict of resource lists grouped by label
    """
    resources = cro_get_resources(concepts_cro=params["concepts_cids"])

    if not params["similarity_score"] and params["most_recent"] and params["popularity"]:
        # cro_sort_result has no `with_ratings` parameter; passing one
        # raises TypeError before any sorting happens.
        return cro_sort_result(resources=resources, weights=params.get("weights"))

    # Missing values (None) sort last instead of raising on comparison.
    if params["similarity_score"]:
        resources.sort(key=lambda x: x.get("similarity_score") or 0.0, reverse=True)

    if params["most_recent"]:
        resources.sort(key=lambda x: x.get("publish_time") or "", reverse=True)

    if params["popularity"]:
        resources.sort(key=lambda x: int(x.get("views") or 0), reverse=True)

    resources_videos = [resource for resource in resources if "Video" in resource["labels"]]
    resources_articles = [resource for resource in resources if "Article" in resource["labels"]]
    resources_external_sources = [resource for resource in resources if "ExternalSource" in resource["labels"]]
    return {
        "articles": resources_articles,
        "videos": resources_videos,
        "external_sources": resources_external_sources
    }


In [86]:
# params = {"similarity_score": True, "most_recent": False, "popularity": True, "concepts_cids": ["195462609980330371"]}
# resources = cro_sort_result_light(params=params, with_ratings=False)

In [87]:
def display(resources, typ: str):
    """
        Tabulate one group of sorted resources for inspection.

        :param resources: mapping of group name to a list of resource dicts
        :param typ: group to show; converted with ``str`` before the lookup
        :return: DataFrame restricted to the scoring-related columns (absent
                 columns are filled with NaN), first 30 rows only
    """
    scoring_columns = [
        "rid",
        "views",
        "similarity_score",
        "publish_time",
        "helpful_count",
        "not_helpful_count",
        "bookmarked_count",
        "composite_score",
        "composite_score_product",
    ]
    frame = pd.DataFrame(resources[str(typ)])
    return frame.reindex(columns=scoring_columns).head(30)

In [88]:
def cro_sort_result_ORIGIN(resources):
    """
        Baseline ordering: group resources by label and rank only the videos,
        by similarity score (highest first). Articles and external sources
        keep their incoming order.

        :param resources: list of resource dicts with "labels" and, for
                          videos, "similarity_score"
        :return: {"articles": [...], "videos": [...], "external_sources": [...]}
    """
    def with_label(label):
        return [item for item in resources if label in item["labels"]]

    ranked_videos = sorted(
        with_label("Video"),
        key=lambda item: item["similarity_score"],
        reverse=True,
    )
    return {
        "articles": with_label("Article"),
        "videos": ranked_videos,
        "external_sources": with_label("ExternalSource"),
    }

# Baseline ranking, computed from a shallow copy of `resources`.
resources_org = cro_sort_result_ORIGIN(list(resources))
display(resources_org, "videos")
# display(resources_org, "articles")

Unnamed: 0,rid,views,similarity_score,publish_time,helpful_count,not_helpful_count,bookmarked_count,composite_score,composite_score_product
0,KpGtax2RBVY,8631,0.918252,2021-06-22T14:04:39Z,0,0,0,,
1,kxx-SV5Rx_c,2290,0.917859,2020-02-04T12:17:19Z,0,0,0,,
2,qvOJDfQk3Nw,3631,0.914626,2021-04-19T14:31:41Z,15,0,0,,
3,_wdDGq1PrNw,3310,0.912909,2018-08-15T06:34:35Z,0,0,0,,
4,8ZSdpZaqyj8,38,0.909533,2024-03-31T12:35:04Z,0,0,0,,
5,XscUZ8dIa-8,18655,0.902418,2019-08-28T03:05:46Z,0,1,0,,
6,3QzPFFCKgfE,366,0.900829,2024-03-31T11:00:02Z,0,0,0,,
7,ypplDa2B-QA,2345,0.897124,2023-01-27T07:49:03Z,0,0,0,,
8,LfXDzpTnvqY,11886,0.896477,2013-06-18T13:43:02Z,15,2,0,,
9,0WF4kqHCUac,888,0.896301,2021-10-03T19:16:20Z,0,0,0,,


In [89]:
# Weight experiment 1: creation date is the heaviest video factor (0.4).
weights = {
    "video": {"views": 0.2, "rating": 0.1, "creation_date": 0.4, "similarity_score": 0.1, "bookmark": 0.1},
    "article": {"rating": 0.4, "similarity_score": 0.3, "bookmark": 0.3},
}
resources_cro = cro_sort_result(resources, weights)
display(resources_cro, "videos")
# display(resources_cro, "articles")

Unnamed: 0,rid,views,similarity_score,publish_time,helpful_count,not_helpful_count,bookmarked_count,composite_score,composite_score_product
0,qvOJDfQk3Nw,3631,0.914626,2021-04-19T14:31:41Z,15,0,0,0.533307,0.0
1,XscUZ8dIa-8,18655,0.902418,2019-08-28T03:05:46Z,0,1,0,0.524605,0.0
2,idHxNSTZhNM,28178,0.873485,2014-10-13T21:44:49Z,2,1,0,0.507596,0.0
3,KpGtax2RBVY,8631,0.918252,2021-06-22T14:04:39Z,0,0,0,0.493285,0.0
4,3QzPFFCKgfE,366,0.900829,2024-03-31T11:00:02Z,0,0,0,0.491203,0.0
5,8ZSdpZaqyj8,38,0.909533,2024-03-31T12:35:04Z,0,0,0,0.489743,0.0
6,DHZfo9oxVIs,381,0.856183,2024-03-27T19:36:51Z,0,0,0,0.486615,0.0
7,ypplDa2B-QA,2345,0.897124,2023-01-27T07:49:03Z,0,0,0,0.480107,0.0
8,0WF4kqHCUac,888,0.896301,2021-10-03T19:16:20Z,0,0,0,0.441996,0.0
9,IaT4WzmlIJs,969,0.877712,2021-07-08T13:58:03Z,0,0,0,0.435697,0.0


In [90]:
# Weight experiment 2: all five video factors weighted equally (0.2 each).
video_factors = ("views", "rating", "creation_date", "similarity_score", "bookmark")
weights = {
    "video": {factor: 0.2 for factor in video_factors},
    "article": {"rating": 0.4, "similarity_score": 0.3, "bookmark": 0.3},
}
resources_cro = cro_sort_result(resources, weights)
display(resources_cro, "videos")
# display(resources_cro, "articles")

Unnamed: 0,rid,views,similarity_score,publish_time,helpful_count,not_helpful_count,bookmarked_count,composite_score,composite_score_product
0,qvOJDfQk3Nw,3631,0.914626,2021-04-19T14:31:41Z,15,0,0,0.536033,0.0
1,idHxNSTZhNM,28178,0.873485,2014-10-13T21:44:49Z,2,1,0,0.515969,0.0
2,LfXDzpTnvqY,11886,0.896477,2013-06-18T13:43:02Z,15,2,0,0.480676,0.0
3,XscUZ8dIa-8,18655,0.902418,2019-08-28T03:05:46Z,0,1,0,0.463824,0.0
4,KpGtax2RBVY,8631,0.918252,2021-06-22T14:04:39Z,0,0,0,0.414917,0.0
5,ypplDa2B-QA,2345,0.897124,2023-01-27T07:49:03Z,0,0,0,0.382821,0.0
6,3QzPFFCKgfE,366,0.900829,2024-03-31T11:00:02Z,0,0,0,0.381892,0.0
7,8ZSdpZaqyj8,38,0.909533,2024-03-31T12:35:04Z,0,0,0,0.381301,0.0
8,DHZfo9oxVIs,381,0.856183,2024-03-27T19:36:51Z,0,0,0,0.372954,0.0
9,KqETXdq68vY,16373,0.853527,2013-02-17T18:42:49Z,0,0,0,0.369162,0.0


In [91]:
# NOTE(review): undefined name, so this cell raises NameError — presumably a
# deliberate stop that keeps "Run All" from reaching the cells below; confirm.
xxxx

NameError: name 'xxxx' is not defined

In [None]:
# result = tx.run("""
#     MATCH (a:Resource)
#     RETURN  LABELS(a) as labels, ID(a) as id, a.rid as rid, a.title as title, a.text as text,
#             a.thumbnail as thumbnail, a.abstract as abstract, a.post_date as post_date, 
#             a.author_image_url as author_image_url, a.author_name as author_name,
#             a.keyphrases as keyphrases, a.description as description, a.description_full as description_full,
#             a.views as views, a.publish_time as publish_time, a.uri as uri, a.duration as duration,
#             a.similarity_score as similarity_score, a.helpful_count as helpful_count, a.not_helpful_count as not_helpful_count
#     """
# ).data()

In [None]:
# Fetch one Resource by its rid through the long-lived session `tx`.
# `.single()` yields the one matching record. Unlike the query used in
# cro_get_resources, this one does not return bookmarked_count.
result = tx.run("""
    MATCH (a:Resource)
    WHERE a.rid = $rid
    RETURN  LABELS(a) as labels, ID(a) as id, a.rid as rid, a.title as title, a.text as text,
            a.thumbnail as thumbnail, a.abstract as abstract, a.post_date as post_date, 
            a.author_image_url as author_image_url, a.author_name as author_name,
            a.keyphrases as keyphrases, a.description as description, a.description_full as description_full,
            a.views as views, a.publish_time as publish_time, a.uri as uri, a.duration as duration,
            a.similarity_score as similarity_score, a.helpful_count as helpful_count, a.not_helpful_count as not_helpful_count
    """,
    rid="3QzPFFCKgfE"
).single()

In [None]:
# Inspect the publish_time value and the Python type it arrives as.
date = result["publish_time"]
print(date, type(date), sep="\n")
result

In [None]:
# NOTE(review): undefined name, raises NameError when run — presumably another
# deliberate stop before the data-modifying cells below; confirm.
xxxxxx

In [None]:
import pandas as pd

# Load the exported rows and collect their "rid" column as a plain list.
# NOTE(review): "export.csv" is resolved relative to the working directory;
# its origin is not shown in this notebook.
df = pd.read_csv("export.csv")
rids = df["rid"].tolist()
rids

In [None]:
# rids = ["1ohlAZfCe88"]

In [None]:
# Set bookmarked_count to 0 on every Resource whose rid is in `rids`,
# one query per rid, printing the returned record each time.
# NOTE(review): SET overwrites any existing bookmarked_count with 0 —
# confirm the listed resources are meant to be reset.
for rid in rids:
    node = tx.run(
        """
        MATCH (r:Resource)
        WHERE r.rid = $rid
        SET r.bookmarked_count = $value
        RETURN r
        """,
        rid=rid,
        value=0
    ).single()

    # `node` is None when no Resource matched the rid.
    print(node)