## Populate NBAO data

Create RDF dataset for NBAO

In [1]:
import pandas as pd
import os
from pathlib import Path
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import FOAF, XSD, OWL
import datetime
import re

In [2]:
# Dataset paths
path = str(Path(os.path.abspath(os.getcwd())).absolute())
firstDS = path + "/data/first/"
secondDS = path + "/data/second/"
thirdDS = path + "/data/third/"

teamPath = firstDS + "team.csv"
teamDetailsPath = firstDS + "team_details.csv"
commonPlayerInfoPath = firstDS + "common_player_info.csv"
gamePath = firstDS + "game.csv"
officialsPath = firstDS + "officials.csv"

basketballMasterPath = secondDS + "basketball_master.csv"
basketballAwardsPlayersPath = secondDS + "basketball_awards_players.csv"
basketballCoachesPath = secondDS + "basketball_coaches.csv"
basketballAwardsCoachesPath = secondDS + "basketball_awards_coaches.csv"
basketballPlayerAllstarPath = secondDS + "basketball_player_allstar.csv"

injuryPath = thirdDS + "injury.csv"

# Output path
savePath =  path + "/rdf/"

In [3]:
# Define NBAO
NBAO = Namespace("http://www.semanticweb.org/~gdb/ontology/spire/nbao#")

positionList = ["Center", "Forward", "Center-Forward", "Forward-Center", "Guard", "Forward-Guard", "Guard-Forward"]

leagueList = ["ABA", "NBA"]

## Team

In [4]:
%%time

# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)

team = pd.read_csv(teamPath, sep=',', index_col='id')
teamDetails = pd.read_csv(teamDetailsPath, sep=',', index_col='team_id')
# team.info()
teamDetails.info()

for index, row in teamDetails.iterrows():    
    Team = URIRef(NBAO["team_" + str(index)])
    
    g.add((Team, RDF.type, NBAO.Team))
    g.add((Team, NBAO['teamNickname'], Literal(row['nickname'], datatype=XSD.string)))
    g.add((Team, NBAO['teamAbbreviation'], Literal(row['abbreviation'], datatype=XSD.string)))
    g.add((Team, NBAO['teamCity'], Literal(row['city'], datatype=XSD.string)))
    g.add((Team, NBAO['yearFounded'], Literal(int(row['yearfounded']), datatype=XSD.gYear)))
    g.add((Team, NBAO['teamArena'], Literal(row['arena'], datatype=XSD.string)))

    g.add((Team, NBAO['teamName'], Literal(team.loc[index, 'full_name'], datatype=XSD.string)))
    g.add((Team, NBAO['teamState'], Literal(team.loc[index, 'state'], datatype=XSD.string)))


# Write all the data in the Turtle format into file
with open(savePath + 'team.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, 1610612737 to 1610612766
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   abbreviation        25 non-null     object 
 1   nickname            25 non-null     object 
 2   yearfounded         25 non-null     float64
 3   city                25 non-null     object 
 4   arena               25 non-null     object 
 5   arenacapacity       16 non-null     float64
 6   owner               25 non-null     object 
 7   generalmanager      25 non-null     object 
 8   headcoach           24 non-null     object 
 9   dleagueaffiliation  25 non-null     object 
 10  facebook            25 non-null     object 
 11  instagram           25 non-null     object 
 12  twitter             25 non-null     object 
dtypes: float64(2), object(11)
memory usage: 2.7+ KB
CPU times: total: 15.6 ms
Wall time: 60 ms


## Player - Draft

In [5]:
%%time

# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)

commonPlayerInfo = pd.read_csv(commonPlayerInfoPath, sep=',', index_col='person_id')
# commonPlayerInfo.info()

for index, row in commonPlayerInfo.iterrows():
    Player = URIRef(NBAO["player_" + str(index)])

    g.add((Player, RDF.type, NBAO.Player))
    g.add((Player, FOAF['name'], Literal(row['display_first_last'], datatype=XSD.string)))

    # Handle empty cell
    if pd.notnull(row['from_year']):
        g.add((Player, NBAO['careerStartYear'], Literal(int(row['from_year']), datatype=XSD.gYear)))
    if pd.notnull(row['to_year']):
        g.add((Player, NBAO['careerEndYear'], Literal(int(row['to_year']), datatype=XSD.gYear)))

    # Several jersey numbers
    if pd.notnull(row['jersey']):
        jerseyNums = re.split(' |-', row['jersey'])
        for jerseyNum in jerseyNums:
            if jerseyNum != '':
                g.add((Player, NBAO['jerseyNum'], Literal(int(jerseyNum), datatype=XSD.int)))

    # Parse date
    g.add((Player, NBAO['birthDate'], Literal(datetime.datetime.strptime(row['birthdate'], '%Y-%m-%d %H:%M:%S').date(), datatype=XSD.date)))

    # Positions
    if pd.notnull(row['position']):
        if row['position'] in positionList:
            Position = URIRef(NBAO[row['position']])
            g.add((Player, NBAO['hasPosition'], Position))

    # Main team
    if row['team_id'] in teamDetails.index:
        Team = URIRef(NBAO["team_" + str(row['team_id'])])
        g.add((Player, NBAO['hasMainTeam'], Team))

    # Player - Draft
    if row['draft_year'] != "Undrafted" and row['draft_year'] != "":
        # Draft
        Draft = URIRef(NBAO["draft_" + row['draft_year']])
        g.add((Draft, RDF.type, NBAO.Draft))
        
        g.add((Draft, NBAO["draftYear"], Literal(int(row['draft_year']), datatype=XSD.gYear)))

        if pd.notnull(row['draft_round']) and row['draft_round'] != "Undrafted" and pd.notnull(row['draft_number']) and row['draft_number'] != "Undrafted":
            # Draftranks - Draft
            DraftRanks = URIRef(NBAO["draftranks_" + "year" + row['draft_year'] + "_round" + row['draft_round'] + "_number" + row['draft_number']])
            g.add((DraftRanks, RDF.type, NBAO.DraftRanks))
            
            g.add((DraftRanks, NBAO['draftRoundNumber'], Literal(row['draft_round'], datatype=XSD.int)))
            g.add((DraftRanks, NBAO['draftOverallPick'], Literal(row['draft_number'], datatype=XSD.int)))
            g.add((DraftRanks, NBAO['ofDraft'], Draft))

            # Player - Draftranks
            g.add((Player, NBAO['isDraftedAs'], DraftRanks))


# Write all the data in the Turtle format into file
with open(savePath + 'player-draft.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

CPU times: total: 3.41 s
Wall time: 3.58 s


## InjuryStat - Team - Player

In [6]:
%%time

# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)

injury = pd.read_csv(injuryPath, sep=',', index_col=0)
# injury.info()

for index, row in injury.iterrows():
    # Check if team exists
    if (teamDetails["nickname"] == row["Team"]).any() == True :
        InjuryStat = URIRef(NBAO["injurystat_" + str(index)])

        hasMatchedPlayers = False
        
        # InjuryStat - Player
        acquiredPlayers = re.split('/', str(row['Acquired']))
        for acquiredPlayer in acquiredPlayers:
            acquiredPlayer = acquiredPlayer.strip()
            if (commonPlayerInfo["display_first_last"] == acquiredPlayer).any() == True :
                hasMatchedPlayers = True
                Player = URIRef(NBAO["player_" + str(commonPlayerInfo[commonPlayerInfo["display_first_last"] == acquiredPlayer].index[0])])
                g.add((InjuryStat, NBAO['withAcquiredPlayer'], Player))
        
        relinquishedPlayers = re.split('/', str(row['Relinquished']))
        for relinquishedPlayer in acquiredPlayers:
            relinquishedPlayer = relinquishedPlayer.strip()
            if (commonPlayerInfo["display_first_last"] == relinquishedPlayer).any() == True :
                hasMatchedPlayers = True
                Player = URIRef(NBAO["player_" + str(commonPlayerInfo[commonPlayerInfo["display_first_last"] == relinquishedPlayer].index[0])])
                g.add((InjuryStat, NBAO['withRelinquishedPlayer'], Player))

        if hasMatchedPlayers:
            g.add((InjuryStat, RDF.type, NBAO.InjuryStat))
            
            g.add((InjuryStat, NBAO['dateOfOccurence'], Literal(datetime.datetime.strptime(row['Date'], '%Y-%m-%d').date(), datatype=XSD.date)))
            g.add((InjuryStat, NBAO['injuryNote'], Literal(row['Notes'], datatype=XSD.string)))

            # InjuryStat - Team
            Team = URIRef(NBAO["team_" + str(teamDetails[teamDetails["nickname"] == row["Team"]].index[0])])
            g.add((InjuryStat, NBAO['occurredInTeam'], Team))
                
# Write all the data in the Turtle format into file
with open(savePath + 'injurystat-team-player.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

CPU times: total: 42.4 s
Wall time: 45.1 s


## Player - AllStarGame

In [7]:
%%time

# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)

basketballPlayerAllstar = pd.read_csv(basketballPlayerAllstarPath, sep=',', index_col=False)
# basketballPlayerAllstar.info()

for _, row in basketballPlayerAllstar.iterrows():
    AllStarGame = URIRef(NBAO["allstargame_" + "season" + str(row['season_id'])])
    g.add((AllStarGame, RDF.type, NBAO.AllStarGame))
    
    g.add((AllStarGame, NBAO['ofSeason'], Literal(int(row['season_id']), datatype=XSD.gYear)))
    
    if row['league_id'] in leagueList:
        League = URIRef(NBAO[row['league_id']])
        g.add((AllStarGame, NBAO['ofLeague'], League))

    participatedPlayerName = str(row['first_name']) + " " + str(row['last_name'])
    if (commonPlayerInfo["display_first_last"] == participatedPlayerName).any() == True :
        Player = URIRef(NBAO["player_" + str(commonPlayerInfo[commonPlayerInfo["display_first_last"] == participatedPlayerName].index[0])])
        g.add((Player, NBAO['participatedInAllStarGame'], AllStarGame))

# Write all the data in the Turtle format into file
with open(savePath + 'player-allstargame.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

CPU times: total: 1.33 s
Wall time: 1.35 s


## Coach - CoachStat - CoachAward

In [8]:
%%time

# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)

basketballMaster = pd.read_csv(basketballMasterPath, sep=',', index_col='bioID')
basketballCoaches = pd.read_csv(basketballCoachesPath, sep=',', index_col=False)
basketballAwardsCoaches = pd.read_csv(basketballAwardsCoachesPath, sep=',', index_col=False)

# basketballMaster.info()
# basketballCoaches.info()
# basketballAwardsCoaches.info()


for index, row in basketballCoaches.iterrows(): 
    # Consider only ABA and NBA and existed teams
    if (row['lgID'] in leagueList) and ((teamDetails["abbreviation"] == row["tmID"]).any() == True):
        coachID = row['coachID']

        # Validate the Coach
        if (coachID in basketballMaster.index) and basketballMaster.loc[coachID, 'birthDate'] != "0000-00-00" :
            coachName = basketballMaster.loc[coachID, 'firstName'] + " " + basketballMaster.loc[coachID, 'lastName']
    
            # URI for the Coach
            Coach = URIRef(NBAO["coach_" + str(coachID)])
    
            if (commonPlayerInfo["display_first_last"] == coachName).any() == True :
                # If Coach is also Player, use the URI of the available Player
                Coach = URIRef(NBAO["player_" + str(commonPlayerInfo[commonPlayerInfo["display_first_last"] == coachName].index[0])])
            else:
                # new URI for Coach
                g.add((Coach, FOAF['name'], Literal(coachName, datatype=XSD.string)))
                g.add((Coach, NBAO['birthDate'], Literal(datetime.datetime.strptime(basketballMaster.loc[coachID, 'birthDate'], '%Y-%m-%d').date(), datatype=XSD.date)))
    
            # Add Coach to graph
            g.add((Coach, RDF.type, NBAO.Coach))
    
            # CoachStat
            CoachStat = URIRef(NBAO["coachstat_" + str(index)])
            g.add((CoachStat, RDF.type, NBAO.CoachStat))
            g.add((CoachStat, NBAO['lostNum'], Literal(int(row['lost']), datatype=XSD.int)))
            g.add((CoachStat, NBAO['wonNum'], Literal(int(row['won']), datatype=XSD.int)))
            g.add((CoachStat, NBAO['coachYear'], Literal(int(row['year']), datatype=XSD.gYear)))
            
            Team = URIRef(NBAO["team_" + str(teamDetails[teamDetails["abbreviation"] == row["tmID"]].index[0])])
            g.add((CoachStat, NBAO['withCoachedTeam'], Team))
    
            g.add((Coach, NBAO['hasCoachStat'], CoachStat))


for index, row in basketballAwardsCoaches.iterrows():
    if (row['lgID'] in leagueList):
        coachID = row['coachID']

        # Validate the Coach
        if (coachID in basketballMaster.index) and basketballMaster.loc[coachID, 'birthDate'] != "0000-00-00" :
            coachName = basketballMaster.loc[coachID, 'firstName'] + " " + basketballMaster.loc[coachID, 'lastName']
    
            # URI for the Coach
            Coach = URIRef(NBAO["coach_" + str(coachID)])
    
            if (commonPlayerInfo["display_first_last"] == coachName).any() == True :
                # If Coach is also Player, use the URI of the available Player
                Coach = URIRef(NBAO["player_" + str(commonPlayerInfo[commonPlayerInfo["display_first_last"] == coachName].index[0])])
            else:
                # new URI for Coach
                g.add((Coach, FOAF['name'], Literal(coachName, datatype=XSD.string)))
                g.add((Coach, NBAO['birthDate'], Literal(datetime.datetime.strptime(basketballMaster.loc[coachID, 'birthDate'], '%Y-%m-%d').date(), datatype=XSD.date)))
    
            # Add Coach to graph
            g.add((Coach, RDF.type, NBAO.Coach))
        
            awardTitle = row['award']
            tokens = re.split(' ', awardTitle)
            awardId = "award_" + "year" + str(row['year']) + "_title"
            for token in tokens:
                awardId = awardId + token.strip()

            Award = URIRef(NBAO[awardId])
            g.add((Award, RDF.type, NBAO.Award))
            g.add((Award, NBAO['awardYear'], Literal(int(row['year']), datatype=XSD.gYear)))
            g.add((Award, NBAO['awardTitle'], Literal(awardTitle, datatype=XSD.string)))

            League = URIRef(NBAO[row['lgID']])
            g.add((Award, NBAO['ofLeague'], League))

            g.add((Award, NBAO['isAwardedTo'], Coach))            
            

# Write all the data in the Turtle format into file
with open(savePath + 'coach-coachstat-coachaward.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))



CPU times: total: 1 s
Wall time: 1.03 s


## Player - PlayerAward

In [9]:
%%time

# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)


basketballAwardsPlayers = pd.read_csv(basketballAwardsPlayersPath, sep=',', index_col=False)


for index, row in basketballAwardsPlayers.iterrows():
    if (row['lgID'] in leagueList):
        playerID = row['playerID']

        # Validate the Player
        if (playerID in basketballMaster.index) and basketballMaster.loc[playerID, 'birthDate'] != "0000-00-00" :
            playerName = basketballMaster.loc[playerID, 'firstName'] + " " + basketballMaster.loc[playerID, 'lastName']

            if (commonPlayerInfo["display_first_last"] == playerName).any() == True :
                # If the Player exists in the first dataset
                Player = URIRef(NBAO["player_" + str(commonPlayerInfo[commonPlayerInfo["display_first_last"] == playerName].index[0])])
        
                awardTitle = row['award']
                tokens = re.split(' |-', awardTitle)
                awardId = "award_" + "year" + str(row['year']) + "_title"
                for token in tokens:
                    awardId = awardId + token.strip()
        
                Award = URIRef(NBAO[awardId])
                g.add((Award, RDF.type, NBAO.Award))
                g.add((Award, NBAO['awardYear'], Literal(int(row['year']), datatype=XSD.gYear)))
                g.add((Award, NBAO['awardTitle'], Literal(awardTitle, datatype=XSD.string)))
        
                League = URIRef(NBAO[row['lgID']])
                g.add((Award, NBAO['ofLeague'], League))
        
                g.add((Award, NBAO['isAwardedTo'], Player))

# Write all the data in the Turtle format into file
with open(savePath + 'player-award.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))

CPU times: total: 1.31 s
Wall time: 1.32 s


## Game

In [10]:
# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)

games = pd.read_csv(gamePath, sep=',', index_col=False)

# games.info()

for index, row in games.iterrows():
        gameID = row['game_id']
        Game = URIRef(NBAO["game_"+str(gameID)])
        homeTeamIndex = row['team_id_home']
        awayTeamIndex = row['team_id_away']
        pointsHome = int(row['pts_home'])
        pointsAway = int(row['pts_away'])

        g.add((Game,RDF.type,NBAO.Game))
        g.add((Game,NBAO['hasHomeTeam'],NBAO["team_" + str(homeTeamIndex)]))
        g.add((Game,NBAO['hasAwayTeam'],NBAO["team_" + str(awayTeamIndex)]))

        g.add((Game,NBAO['pointsAway'],Literal(pointsAway, datatype=XSD.int)))               
        g.add((Game,NBAO['pointsHome'],Literal(pointsHome, datatype=XSD.int)))               
        
with open(savePath + 'games.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))


## Official

In [11]:
# Create the graph
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("owl", OWL)
g.bind("nbao", NBAO)

officials = pd.read_csv(officialsPath, sep=',', index_col=False)
# Done for checking if some official had more than 1 jersey num in his career
officials_ids = {}
for index, row in officials.iterrows():
        officialID = row['official_id']
        gameID = row['game_id']
        jerseyNum = row['jersey_num']
        Official = URIRef(NBAO["official_"+str(officialID)])
        if officialID not in officials_ids:
            officials_ids[officialID] = []
            official_first_name = row['first_name']
            official_last_name = row['last_name']
            g.add((Official,RDF.type,NBAO.Official))
            g.add((Official,FOAF['name'],Literal(official_first_name + " " + official_last_name, datatype=XSD.string)))
        if jerseyNum not in officials_ids[officialID]:
                officials_ids[officialID].append(jerseyNum)
                g.add((Official,NBAO['jerseyNum'],Literal(jerseyNum, datatype=XSD.int)))                      
        g.add((Official,NBAO['handledGame'],NBAO['game_'+str(gameID)]))
with open(savePath + 'officials.ttl', 'w') as file:
    file.write(g.serialize(format='turtle'))
