In [18]:
from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, XSD
import uuid
import json
from datetime import datetime



In [9]:
# Load the NYT article dump (a JSON array of article records).
# The encoding is given explicitly: without it, open() falls back to the
# platform locale, which can mis-decode non-ASCII text on some systems.
with open('./nytimes_articles.json', 'r', encoding='utf-8') as json_file:
    nyt_data = json.load(json_file)

In [24]:
# Tally how many articles each news desk contributed, in first-seen order.
news_desks = {}
for entry in nyt_data:
    desk_name = entry['news_desk']
    news_desks[desk_name] = news_desks.get(desk_name, 0) + 1
print(news_desks)

{'National': 6135, 'Editorial': 848, 'Science': 3677, 'Games': 2045, 'Summary': 1508, 'Foreign': 15370, 'Express': 4760, 'Washington': 9199, 'Culture': 9930, 'Business': 11966, 'Magazine': 1589, 'Well': 1522, 'Travel': 955, 'Metropolitan': 888, 'Arts&Leisure': 1576, 'Photo': 10, 'Politics': 5385, 'RealEstate': 2332, 'SundayBusiness': 1057, 'Sports': 8252, 'Weekend': 3875, 'Dining': 3488, 'OpEd': 10181, 'Letters': 1585, 'Styles': 4651, 'Parenting': 879, 'Metro': 5294, 'Obits': 3883, 'BookReview': 4027, 'NYTNow': 3961, 'Climate': 1228, 'Smarter Living': 195, 'Learning': 2723, 'Insider': 184, 'Podcasts': 1323, 'TStyle': 1242, 'Upshot': 543, 'Books': 530, 'NewsDesk': 244, 'Society': 425, 'Investigative': 271, 'Video': 46, 'Universal': 26, 'Gender': 153, 'SpecialSections': 697, 'NYTI': 8, 'Live': 40, 'Neediest': 59, 'AtHome': 520, 'Graphics': 8, 'Test': 3, 'Español': 26, 'DigitalNewsDesign': 1, 'InteractiveNews': 1, 'SundayReview': 1, 'Watching': 1, 'Headway': 13, 'Chinese': 1}


In [28]:
# News desks of interest, and the number of articles they account for.
news_desk_filter = {'National', 'Editorial', 'Washington','Politics', 'Climate' }
total_articles = sum(
    count
    for desk_name, count in news_desks.items()
    if desk_name in news_desk_filter
)
print(total_articles)

22795


In [32]:
def is_unicode_string(s):
    """Return True if ``s`` can be encoded as UTF-8, False otherwise.

    False is returned both when encoding fails (UnicodeEncodeError) and
    when ``s`` has no ``encode`` method at all (AttributeError), e.g. for
    ``None`` or ``bytes``.
    """
    try:
        s.encode('utf-8')
    except (UnicodeEncodeError, AttributeError):
        return False
    else:
        return True

In [33]:
# Bulk-load NYT articles into the PoliticalJournalism ontology.
#
# Parses the base ontology, adds one Article individual for every entry of
# `nyt_data` that passes the filters below (together with shared Date and
# Author individuals), and serializes the result to a new RDF/XML file.

# Only articles from these news desks are loaded.
news_desk_filter = {'National', 'Editorial', 'Washington','Politics', 'Climate' }
ontology_path = "./../PoliticalJournalism-individuals-real.rdf"
g = Graph()
g.parse(ontology_path, format="xml")

POLITICAL_JOURNALISM = Namespace("https://tw.rpi.edu/ontology-engineering/oe2024/political-journalism/PoliticalJournalism#")
COMMONS_DATES_TIMES = Namespace("https://www.omg.org/spec/Commons/DatesAndTimes/")

g.bind("PoliticalJournalism", POLITICAL_JOURNALISM)
g.bind("commonsDatesTimes", COMMONS_DATES_TIMES)

# Caches so that each distinct author name / calendar date maps to exactly
# one individual for the duration of this load.
author_cache = {}
date_cache = {}

# Add individuals to the graph
for article in nyt_data:
    abstract = article["abstract"]
    if abstract == "":
        continue
    if not is_unicode_string(abstract):
        # Show the rejected value via repr(): it escapes the characters that
        # failed to encode, whereas printing the raw value may itself raise
        # UnicodeEncodeError depending on the output stream's encoding.
        print(repr(abstract))
        continue
    if article['news_desk'] not in news_desk_filter:
        continue

    # NOTE(review): uuid4 is random, so article/author URIs differ on every
    # run of this cell.
    article_label = f"article-{uuid.uuid4()}"
    individual_uri = URIRef(f"{POLITICAL_JOURNALISM}{article_label}")
    g.add((individual_uri, RDF.type, POLITICAL_JOURNALISM.Article))
    g.add((individual_uri, RDFS.label, Literal(article_label)))
    g.add((individual_uri, RDFS.comment, Literal(abstract)))

    g.add((individual_uri, POLITICAL_JOURNALISM.hasPublisher, POLITICAL_JOURNALISM.NewYorkTimes))
    #g.add((individual_uri, POLITICAL_JOURNALISM.web_url, URIRef(article["web_url"])))

    # Reduce the publication timestamp to a calendar day; articles published
    # on the same day share a single Date individual.
    parsed_date = datetime.strptime(article["pub_date"], "%Y-%m-%dT%H:%M:%S%z")
    formatted_date = parsed_date.strftime("%Y-%m-%d")
    if formatted_date in date_cache:
        date_uri = date_cache[formatted_date]
    else:
        date_label = f"date-{formatted_date}"
        date_uri = URIRef(f"{POLITICAL_JOURNALISM}{date_label}")

        # Check if the date individual already exists in the graph. The
        # check covers the class this loop writes (PoliticalJournalism Date)
        # as well as the Commons ExplicitDate class.
        date_exists = any(
            (date_uri, RDF.type, date_class) in g
            for date_class in (
                POLITICAL_JOURNALISM.Date,
                COMMONS_DATES_TIMES.ExplicitDate,
            )
        )
        if not date_exists:
            g.add((date_uri, RDF.type, POLITICAL_JOURNALISM.Date))
            g.add((date_uri, RDFS.label, Literal(formatted_date)))
            g.add((date_uri, POLITICAL_JOURNALISM.year, Literal(parsed_date.year)))
            g.add((date_uri, POLITICAL_JOURNALISM.month, Literal(parsed_date.month)))
            g.add((date_uri, POLITICAL_JOURNALISM.day, Literal(parsed_date.day)))

        # Cache the date individual for reuse
        date_cache[formatted_date] = date_uri
    # Link the Date individual to the Article individual
    g.add((individual_uri, POLITICAL_JOURNALISM.hasPublishDate, date_uri))

    for author in article["authors"]:
        author_name = author.strip()
        if not author_name:
            # A blank name would create one meaningless Author individual
            # shared by unrelated articles.
            continue
        if author_name in author_cache:
            author_uri = author_cache[author_name]
        else:
            author_label = f"author-{uuid.uuid4()}"
            author_uri = URIRef(f"{POLITICAL_JOURNALISM}{author_label}")

            # Create Author individual. The stored name is the stripped
            # form, i.e. the same string the cache is keyed by.
            g.add((author_uri, RDF.type, POLITICAL_JOURNALISM.Author))
            g.add((author_uri, RDFS.label, Literal(author_label)))
            g.add((author_uri, RDFS.comment, Literal(author_name)))
            # Cache the author individual for reuse
            author_cache[author_name] = author_uri

        # Link the Author to the Article
        g.add((individual_uri, POLITICAL_JOURNALISM.hasAuthor, author_uri))

# Save the updated ontology
updated_ontology_path = "../updated_test_ontology1.rdf"
g.serialize(destination=updated_ontology_path, format="xml")


<Graph identifier=Ned55fa96550f448995bbcf0115b7020a (<class 'rdflib.graph.Graph'>)>