In [1]:

import sqlite3 as sql
import pandas as pd
import numpy as np
import networkx as nx

import itertools

import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
### Define the path to the file (a string in a variable 'db_file')
db_file = '../database/cortona_week.db'

In [3]:
### Create a connection with the database
# Attention: If the database does not exist it will be created
try:
    cn = sql.connect(db_file)
except Exception as e:
    print(e)   

In [4]:
### Create SQL queries

### FOR NODE FILE
### Get person id, name and gender
q_person = """
SELECT
    p.pk_person,
    p.name,
    p.gender,
    p.birth_date
FROM
    Person p
"""

### Get Organization id, name
q_organization = """
SELECT
    o.pk_organization,
    o.name
FROM
    Organization o
"""

### Get occupation id, description
q_occupation = """
SELECT
    o2.pk_occupation,
    o2.description
FROM
    Occupation o2
"""

### Get Subject id, name
q_subject = """
SELECT
    s.pk_subject,
    s.name
FROM
    Subject s
"""

### Get Publication id, name
q_publication = """
SELECT
    p2.pk_publication,
    p2.name
FROM
    Publication p2
"""

### Get event id, name
q_event = """
SELECT
    e.pk_event,
    e.title
FROM
    Event e
"""
### FOR EDGE FILE
### Get direct connections between two persons from the Connection
q_connection = """
SELECT 
	c.fk_person_source,
    c.fk_person_target
FROM
	Connection c  
"""

### Get information of which person participated in which event 
q_participation = """
SELECT 
	p3.fk_person,
    p3.fk_event
FROM
	Participation p3 

"""

### Get information which person published which publication
q_publishing = """
SELECT 
	p4.fk_person,
    p4.fk_publication
FROM
	Publishing p4  
"""


### Get Information which person worked at which organization, at what occupation
q_pursuit = """
SELECT 
	p5.fk_person,
    p5.fk_organization,
    p5.fk_occupation
FROM
	Pursuit p5
"""

### Get Information which person presented which workshop
q_presentation = """
SELECT 
	p6.fk_person,
    p6.fk_workshop
FROM
	Presentation p6   
"""

### Get information which person studied at which organization and what subject
q_study = """
SELECT 
	s.fk_person,
    s.fk_organization,
    s.fk_subject
FROM
	Study s  
"""

In [5]:
### Creating the query result container
cur = cn.cursor()
cur

<sqlite3.Cursor at 0x749e18a232c0>

In [6]:
### Execute the query and retrieve the data

### Node queries
cur.execute(q_person)
data_person = cur.fetchall()

cur.execute(q_organization)
data_organization= cur.fetchall()

cur.execute(q_occupation)
data_occupation = cur.fetchall()

cur.execute(q_subject)
data_subject = cur.fetchall()

cur.execute(q_publication)
data_publication = cur.fetchall()

cur.execute(q_event)
data_event = cur.fetchall()

### EDGE queries
cur.execute(q_connection)
data_connection=cur.fetchall()

cur.execute(q_participation)
data_participation = cur.fetchall()

cur.execute(q_publishing)
data_publishing = cur.fetchall()

cur.execute(q_pursuit)
data_pursuit= cur.fetchall()

cur.execute(q_presentation)
data_presentation = cur.fetchall()

cur.execute(q_study)
data_study = cur.fetchall()


In [7]:
### create node file
###because there is an overlap in IDs between the different attributes of nodes
###(person, organization, occupation, subject, publication), the IDs will be made unique by adding 
### a amount typical of each attribute:
### person: + 0 (as is)
### organization: +2000
### occupation: +3000
### subject: +4000
### publication: +5000
### event: +6000

#create panda data files
### each data file needs to have the following columns: id, name, gender, attribute, birth_date, at_cortona

###person
panda_data_person=pd.DataFrame(data_person, columns=['id',  'name', 'gender','birth_date'])
panda_data_person['attribute']='person'
panda_data_person['at_cortona']='no'
panda_data_person['cortona']='no'

## find out which person was at cortona
panda_data_participation=pd.DataFrame(data_participation, columns=['fk_person','fk_event'])
## events 1 to 30 are Cortona
cortona_events=range(1,30)
mask = panda_data_participation['fk_event'].isin(cortona_events) 
#take all the person_id that were at cortona at least once (unique)
persons_at_cortona=pd.unique(panda_data_participation[mask]['fk_person'])

mask_at_cortona=panda_data_person['id'].isin(persons_at_cortona)
panda_data_person.loc[mask_at_cortona, 'at_cortona']='yes'

###organization
panda_data_organization = pd.DataFrame(data_organization, columns=['id','name'])
#add 2000 to id
panda_data_organization['id']=panda_data_organization['id']+2000
panda_data_organization['attribute'] = 'organization'
panda_data_organization['gender'] = None
panda_data_organization['birth_date'] = None
panda_data_organization['gender'] = None
panda_data_organization['at_cortona'] = None
panda_data_organization['cortona'] = None

###occupation
panda_data_occupation = pd.DataFrame(data_occupation, columns=['id','name'])
#add 3000 to id
panda_data_occupation['id']=panda_data_occupation['id'] + 3000
panda_data_occupation['attribute'] = 'occupation'
panda_data_occupation['gender'] = None
panda_data_occupation['birth_date'] = None
panda_data_occupation['gender'] = None
panda_data_occupation['at_cortona'] = None
panda_data_occupation['cortona'] = None

###subject
panda_data_subject = pd.DataFrame(data_subject, columns=['id','name'])
#add 4000 to id
panda_data_subject['id']=panda_data_subject['id'] + 4000
panda_data_subject['attribute'] = 'subject'
panda_data_subject['gender'] = None
panda_data_subject['birth_date'] = None
panda_data_subject['gender'] = None
panda_data_subject['at_cortona'] = None
panda_data_subject['cortona'] = None

###publication
panda_data_publication = pd.DataFrame(data_publication, columns=['id','name'])
#add 5000 to id
panda_data_publication['id']=panda_data_publication['id'] + 5000
panda_data_publication['attribute'] = 'publication'
panda_data_publication['gender'] = None
panda_data_publication['birth_date'] = None
panda_data_publication['gender'] = None
panda_data_publication['at_cortona'] = None
panda_data_publication['cortona'] = None

###event
panda_data_event = pd.DataFrame(data_event, columns=['id','name'])
cortona_events=range(1,9)
mask = panda_data_event['id'].isin(cortona_events)

#add 6000 to id
panda_data_event['id']=panda_data_event['id'] + 6000
panda_data_event['attribute'] = 'event'
panda_data_event['gender'] = None
panda_data_event['birth_date'] = None
panda_data_event['gender'] = None
panda_data_event['at_cortona'] = None
panda_data_event['cortona'] = None
panda_data_event.loc[mask, 'cortona'] = 'yes'

print(panda_data_event)
                                

panda_full_nodes = pd.concat([panda_data_person,panda_data_organization,panda_data_occupation,
                             panda_data_subject,panda_data_publication,panda_data_event])
panda_full_nodes['[z]']=2
panda_full_nodes.to_csv('complete_node_file.csv',index=False)

      id                                               name attribute gender  \
0   6001                                          Cortona 1     event   None   
1   6002                                          Cortona 2     event   None   
2   6003                                         Perception     event   None   
3   6004                                   Utopia & Science     event   None   
4   6005                                      Metamorphosis     event   None   
5   6006                                   Borders & Limits     event   None   
6   6007                                   Mythos & Science     event   None   
7   6008                                      The Many Ways     event   None   
8   6009                                    Inside- Outside     event   None   
9   6010                    Becoming / Being / Passing Away     event   None   
10  6011                                     Future Visions     event   None   
11  6012                                

In [8]:
### create edge files
### every dataframe needs to have following columns: source, target, attribute


#extract the direct connections from "connection", these code for edges between persons
panda_data_connections=pd.DataFrame(data_connection, columns=['source','target'])
panda_data_connections['attribute']='personal'

#extract the connections from "publishing", these code for edges between a person (source) and a publication (target)
#for the publication id, 5000 has to added

panda_data_publishing=pd.DataFrame(data_publishing, columns = ['source', 'target'])
panda_data_publishing['target']=panda_data_publishing['target']+5000
panda_data_publishing['attribute']='publishing'

#extract the connections from "pursuit", these code for edges between persons (source) and organizations +2000 (target), 
#as well as between persons (source) and occupations + 3000 (target)
panda_data_pursuit = pd.DataFrame(data_pursuit, columns = ['person','organization','occupation'])

panda_data_pursuit_organization= pd.DataFrame(panda_data_pursuit, columns = ['person','organization'])
panda_data_pursuit_organization.rename(columns={'person': 'source', 'organization': 'target'},inplace=True)
panda_data_pursuit_organization['target'] = panda_data_pursuit['organization'] +2000
panda_data_pursuit_organization['attribute'] = 'pursuit_organization'

panda_data_pursuit_occupation= pd.DataFrame(panda_data_pursuit, columns = ['person','occupation'])
panda_data_pursuit_occupation.rename(columns={'person': 'source', 'occupation': 'target'},inplace=True)
panda_data_pursuit_occupation['target'] = panda_data_pursuit['occupation'] + 3000
panda_data_pursuit_occupation['attribute'] = 'pursuit_occupation'

#extract the connections from "study", these code for edges between persons (Source) and organizations +2000,
#and persons (source) and subject +4000 (target)

panda_data_study = pd.DataFrame(data_study, columns = ['person','organization','subject'])

panda_data_study_organization= pd.DataFrame(panda_data_study, columns = ['person','organization'])
panda_data_study_organization.rename(columns={'person': 'source', 'organization': 'target'},inplace=True)
panda_data_study_organization['target'] = panda_data_study['organization'] +2000
panda_data_study_organization['attribute'] = 'study_organization'

panda_data_study_subject= pd.DataFrame(panda_data_study, columns = ['person','subject'])
panda_data_study_subject.rename(columns={'person': 'source', 'subject': 'target'},inplace=True)
panda_data_study_subject['target'] = panda_data_study['subject'] + 4000
panda_data_study_subject['attribute'] = 'subject'

###extract the connections from "participation", these code for edges between persons (Source) and events +6000 (target)
panda_data_participation.rename(columns={'fk_person': 'source', 'fk_event': 'target'},inplace=True)
panda_data_participation['target'] = panda_data_participation['target'] +6000
panda_data_participation['attribute'] = 'event_participation'


panda_full_edges = pd.concat([panda_data_connections,panda_data_publishing, panda_data_pursuit_organization,
                             panda_data_pursuit_occupation,panda_data_study_organization, panda_data_study_subject, panda_data_participation])

panda_full_edges_no_nan = panda_full_edges.dropna()
panda_full_edges_no_nan['target']=panda_full_edges_no_nan['target'].astype('int')
panda_full_edges_no_nan['source']=panda_full_edges_no_nan['source'].astype('int')

#mask_nan_target=panda_full_edges['target'].notna()
#mask_nan_source=panda_full_edges['source'].notna()

#panda_full_edges_no_nan=pd.DataFrame(panda_full_edges.loc[mask_nan_source,'source'].astype('int'),columns=['source'])
#panda_full_edges_no_nan['target']=panda_full_edges.loc[mask_nan_target,'target'].astype('int')
#panda_full_edges_no_nan['attribute']=panda_full_edges.loc[mask_nan,'attribute']
panda_full_edges_no_nan.to_csv('complete_edge_file.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  panda_full_edges_no_nan['target']=panda_full_edges_no_nan['target'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  panda_full_edges_no_nan['source']=panda_full_edges_no_nan['source'].astype('int')


In [9]:

print(panda_full_edges_no_nan)


     source  target            attribute
0       198     201             personal
1       203      74             personal
2        87      81             personal
3        51      34             personal
4        16      90             personal
..      ...     ...                  ...
602      78    6051  event_participation
603     265    6051  event_participation
604      82    6052  event_participation
605      74    6052  event_participation
606      47    6052  event_participation

[1835 rows x 3 columns]


In [10]:
mask_nan=panda_full_edges['target'].notna()

panda_full_edges_no_nan=pd.DataFrame(panda_full_edges.loc[mask_nan,'source'].asattribute('int'),columns=['source'])
panda_full_edges_no_nan['target']=panda_full_edges.loc[mask_nan,'target'].asattribute('int')
panda_full_edges_no_nan['attribute']=panda_full_edges.loc[mask_nan,'attribute']

AttributeError: 'Series' object has no attribute 'asattribute'