In [117]:
import prince

import sqlite3 as sql
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import plotly.express as px
import prince
from sklearn.preprocessing import OneHotEncoder


In [118]:
### Define the path to the file (a string in a variable 'db_file')
db_file = '../database/cortona_week.db'

In [119]:
### Create a connection to the database
# If the database does not exist it will be created
try:
    cn = sql.connect(db_file)

except Exception as e:
    print(e)


  


In [120]:
### SQL Queries

#only information from persons are aggregated that have attended the cortona events in question
# (events 1-8 until year 1995)

#This query extracts information from the tabel Person if pk_person is found
#in EITHER the table Study OR the table Pursuit
q_persons = """
SELECT
    p.pk_person,
    p.name,
    p.gender,
    p.birth_date,
    p.nationality
FROM
    Person p
WHERE
    p.pk_person IN (
        SELECT s.fk_person
            FROM Study s
        UNION
        SELECT p2.fk_person
            FROM Pursuit p2
    )
AND
    p.pk_person IN (
        SELECT 
            p4.fk_person
        FROM
            Participation p4
        WHERE
            p4.fk_event < 9
    )
"""
q_all = """
SELECT 
	p2.fk_person,
    o.meta_category,
    NULL,
    o2.meta_category,
    p3.metacategoryCountry
FROM
	Pursuit p2
FULL JOIN
    Organization o
ON
    p2.fk_organization = o.pk_organization

FULL JOIN
    Occupation o2
ON
    p2.fk_occupation = o2.pk_occupation
    
FULL JOIN
    Place p3
ON
    o.fk_place = p3.pk_place

WHERE
    p2.fk_person IN (
        SELECT 
            p4.fk_person
        FROM
            Participation p4
        WHERE
            p4.fk_event < 9
    )

    
UNION
SELECT
    s.fk_person,
    o.meta_category,
    s2.meta_category,
    NULL,
    p3.metacategoryCountry
FROM
    Study s
JOIN
    Subject s2
ON
    s.fk_subject = s2.pk_subject
JOIN
    Organization o
ON
    s.fk_organization = o.pk_organization
JOIN
    Place p3
ON
    o.fk_place = p3.pk_place
WHERE
     s.fk_person IN (
        SELECT 
            p4.fk_person
        FROM
            Participation p4
        WHERE
            p4.fk_event < 12
    )


"""


q_pursuit = """
SELECT 
	p2.fk_person,
    o.meta_category,
    o2.meta_category,
    p3.country
FROM
	Pursuit p2
FULL JOIN
    Organization o
ON
    p2.fk_organization = o.pk_organization

FULL JOIN
    Occupation o2
ON
    p2.fk_occupation = o2.pk_occupation
FULL JOIN
    Place p3
ON
    o.fk_place = p3.pk_place
WHERE
     p2.fk_person IN (
        SELECT 
            p4.fk_person
        FROM
            Participation p4
        WHERE
            p4.fk_event < 9
    )
"""

q_study = """
SELECT
    s.fk_person,
    o.meta_category,
    s2.meta_category,
    p3.country
FROM
    Study s
JOIN
    Subject s2
ON
    s.fk_subject = s2.pk_subject
JOIN
    Organization o
ON
    s.fk_organization = o.pk_organization
JOIN
    Place p3
ON
    o.fk_place = p3.pk_place
WHERE
     s.fk_person IN (
        SELECT 
            p4.fk_person
        FROM
            Participation p4
        WHERE
            p4.fk_event < 12
    )
"""


In [121]:
### Creates container for results
cur = cn.cursor()
cur

<sqlite3.Cursor at 0x78d3eaeb26c0>

In [122]:
### Execute the SQL queries and save the results
cur.execute(q_persons)
data_persons = cur.fetchall()

cur.execute(q_pursuit)
data_pursuit= cur.fetchall()

cur.execute(q_study)
data_study = cur.fetchall()

cur.execute(q_all)
data_all= cur.fetchall()


In [123]:
pd_persons = pd.DataFrame(data_persons, columns = ['pkPerson', 'name','gender','birthDate','nationality'])

pd_pursuit = pd.DataFrame(data_pursuit, columns = ['pkPerson', 'metacategoryOrganization' , 'metacategoryOccupation', 'countryPursuit'])

pd_study = pd.DataFrame (data_study, columns = ['pkPerson','studyMetaCategory','organizationMetacategory','countryStudy'])

pd_pursuit_study = pd.DataFrame(data_all, columns = ['pkPerson', 'metacategoryOrganizationPursuit' ,
                                                     'studyMetaCategory', 'occupationMetacategory','countryPursuitStudy'])



pd_persons=pd_persons.sort_values(by=['pkPerson'])
pd_pursuit=pd_pursuit.sort_values(by=['pkPerson'])
pd_study=pd_study.sort_values(by=['pkPerson'])
pd.set_option('display.max_rows', 500)

display(pd_pursuit_study)
#display(pd_study)


all_person= pd.concat([pd_pursuit['pkPerson'],pd_study['pkPerson']])
all_person=all_person.sort_values()
pd.set_option('display.max_rows', 500)
#display(all_person)

#print(pd_pursuit['pkPerson'].nunique())
#print(pd_study['pkPerson'].nunique())

#print(pd_persons['pkPerson'])
all_person_unique=all_person.unique()

max_persons= pd_persons['pkPerson'].nunique()

not_in_mask= ~pd_persons['pkPerson'].isin(all_person)
#print(not_in_mask)
#print(pd_persons['pkPerson'][not_in_mask])
#print(all_person.nunique())
#print(pd_persons['pkPerson'].nunique())

pd.set_option("display.max_rows", 10)
#display(pd_persons)
#display(pd_pursuit)
#display(pd_study)


Unnamed: 0,pkPerson,metacategoryOrganizationPursuit,studyMetaCategory,occupationMetacategory,countryPursuitStudy
0,1,Universität,,Universitätsposten,Europa
1,1,Universität,Geisteswissenschaften,,Europa
2,3,,,Therapeut:in,
3,3,psychologisches Institut,Psychotherapie,,Europa
4,5,Universität,,Universitätsposten,Amerika
5,5,Universität,,Universitätsposten,Europa
6,5,Universität,Geisteswissenschaften,,Europa
7,7,,,Therapeut:in,
8,13,,,Künstler:in,
9,16,,,Künstler:in,


In [130]:
#one-hot encoding
pd_all_to_encode=pd_pursuit_study[['pkPerson','countryPursuitStudy','occupationMetacategory']]
pd_all_hot_encode=pd.get_dummies(pd_all_to_encode,dummy_na=True)
pd.set_option('display.max_rows', 500)

#display(pd_all_hot_encode)

In [131]:
grouped_pd_all_hot_encode=pd_all_hot_encode.groupby('pkPerson').sum().reset_index()


#

pd_person_to_encode = pd_persons[['pkPerson','gender']]

pd_person_hot_encode=pd.get_dummies(pd_person_to_encode,dummy_na=False)



output=grouped_pd_all_hot_encode.merge(pd_person_hot_encode, left_on='pkPerson', right_on='pkPerson', how = 'inner')
#display(output)
output = output.loc[:, output.columns != 'pkPerson']
output_full = output.replace({True: 1, False: 0})



mca_no_one_hot = prince.MCA(n_components=8,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='sklearn',
    random_state=42,
    one_hot=False)
mca_no_one_hot = mca_no_one_hot.fit(output_full)
mca_no_one_hot.eigenvalues_summary

mca_no_one_hot.plot(
    output_full,
    x_component=0,
    y_component=1,
    show_column_markers=True,
    show_row_markers=True,
    show_column_labels=False,
    show_row_labels=False
)
#mca_no_one_hot.eigenvalues_summary


  output_full = output.replace({True: 1, False: 0})


In [87]:
#one-hot encoding

#leave birth_date out for now
pd_person_to_encode = pd_persons[['pkPerson','gender', 'nationality']]

pd_person_hot_encode=pd.get_dummies(pd_person_to_encode,dummy_na=False)

pd_study_hot_encode=pd.get_dummies(pd_study,dummy_na=False)
#display(pd_study_hot_encode)

grouped_pd_study_hot_encode=pd_study_hot_encode.groupby('pkPerson').sum().reset_index()
#display(grouped_pd_study_hot_encode)

pd_pursuit_hot_encode=pd.get_dummies(pd_pursuit,dummy_na=False)
#display(pd_pursuit_hot_encode)

grouped_pd_pursuit_hot_encode=pd_pursuit_hot_encode.groupby('pkPerson').sum().reset_index()
#display(grouped_pd_pursuit_hot_encode)





output=pd_pursuit_hot_encode.merge(grouped_pd_study_hot_encode, left_on='pkPerson', right_on='pkPerson', how = 'inner')
pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 35)
#display(output)

#grouped_output=output.groupby('pkPerson').sum().reset_index()
#display(grouped_output)
#output_full=output.merge(pd_person_hot_encode, left_on='pkPerson', right_on='pkPerson', how='outer')
#display(output_full)
#output_full_encoded=pd.get_dummies(output_full,dummy_na=True)
#display(output_full_encoded)

output=output.fillna(0)
output = output.replace({True: 1, False: 0})
output=output.groupby('pkPerson').sum().reset_index()


output_full=output.merge(pd_person_hot_encode, left_on='pkPerson', right_on='pkPerson', how='inner')
output_full=output_full.fillna(0)
output_full = output_full.replace({True: 1, False: 0})
output_full=output_full.groupby('pkPerson').sum().reset_index()
output_full=output_full.astype('float')

pd.set_option("display.max_columns", 5)
pd.set_option("display.max_rows", 5)

is_infinite = np.isinf(output_full.values)

# Check if any infinity values exist
res = is_infinite.any()
print(res)
is_nan = np.isnan(output_full.values)

# Check if any nan values exist
res = is_nan.any()
print(res)


output_full[output_full > 1] = 1
pd.set_option("display.max_columns", 5)
pd.set_option("display.max_rows", 5)
#display(output_full)
#output_full = output_full.replace({1: True, 0: False})
output_full = output_full.loc[:, output_full.columns != 'pkPerson']
#display(output_full)
output_full.to_csv('output_for_MCA.csv',index=True) 


print(output_full.columns[output_full.sum(axis=0)==0])
print(np.sum(output_full.sum(axis=1)==0)) 

print(np.sum(output_full.sum(axis=0)==0)) 

output_full_dropped = output_full.drop(columns=output_full.columns[output_full.sum(axis=0)==0])


mca_no_one_hot = prince.MCA(one_hot=False)
mca_no_one_hot = mca_no_one_hot.fit(output_full_dropped)


mca_no_one_hot.plot(
    output_full_dropped,
    x_component=0,
    y_component=1,
    show_column_markers=True,
    show_row_markers=True,
    show_column_labels=False,
    show_row_labels=False
)
#test_study=pd.get_dummies(pd_study,dummy_na=True)
#display(test_study)


  output = output.replace({True: 1, False: 0})
  output_full = output_full.replace({True: 1, False: 0})


False
False
Index(['metacategoryOrganization_künstlerischer Ort',
       'metacategoryOccupation_Public official',
       'metacategoryOccupation_redaktioneller Beruf', 'countryPursuit_Indien',
       'countryPursuit_Japan', 'countryPursuit_Slowenien',
       'organizationMetacategory_künstlerischer Ort', 'nationality_Frankreich',
       'nationality_Indien', 'nationality_Irland', 'nationality_Kuba',
       'nationality_Luxemburg', 'nationality_Palästina',
       'nationality_Slowenien', 'nationality_Tibet'],
      dtype='object')
0
15
