In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import prince
import sqlite3 as sql


In [64]:
### Define the path to the file (a string in a variable 'db_file')
db_file = '../database/cortona_week.db'

In [65]:
### Créer une connexion vers la base de données
# Attention: si la base de données n'existe pas, elle est créée à l'endroit indiqué.
# Ce qui est une manière de la créer si souhaité
try:
    cn = sql.connect(db_file)
except Exception as e:
    print(e)    
# Le contenu de la variable: connexion
# cn

In [66]:
### La requête à effectuer
# Préparée généralement directement sur la BD SQLite
# Elle produit la liste des années de naissance,
# une année par individu
q_categories = """
SELECT 
	c.pk_categories,
    c.categories
FROM
	Categories c  
"""
q_topics = """
SELECT 
	t.pk_topics,
    t.fk_workshop,
    t.fk_categories,
    C.categories,
    C.meta_category,
    w.name
    
FROM
	Topics t
JOIN
	Categories C 
ON
	t.fk_categories = C.pk_categories 
JOIN 
	Workshop w 
on
	t.fk_workshop = w.pk_workshop  
"""


In [67]:
### Création du conteneur du résultat de la requête
cur = cn.cursor()
cur

<sqlite3.Cursor at 0x73680a063ac0>

In [68]:
### Exécuter la requête et récupérer le résultat
cur.execute(q_categories)
# La méthode 'fetchall' permet de récupérer toutes les lignes
data_categories = cur.fetchall()

cur.execute(q_topics)
# La méthode 'fetchall' permet de récupérer toutes les lignes
data_topics = cur.fetchall()


In [69]:
pd_categorie=pd.DataFrame(data_categories, columns=['pk_categories','categories'])
pd_topics=pd.DataFrame(data_topics, columns=['pk_topics','fk_workshop','fk_categories','categories','meta_category','workshops'])

pd_topics['values_for_pivot']=1
#pd_categorie=pd_categorie.astype('int')
#pd_topics=pd_topics.astype('int')
pd.set_option("display.max_rows", 10)

display(pd_topics)
number_of_workshops=pd_topics['fk_workshop'].nunique()
number_of_categories=pd_topics['fk_categories'].nunique()

###remove category 28 to see wether high overlap reduces explainbaility
#pd_topics=pd_topics[pd_topics['fk_categories']!=28]

contingency_table = pd.pivot_table(pd_topics, values='values_for_pivot', index=['fk_workshop'],
                       columns=['meta_category'], aggfunc="sum", fill_value=0)

display(pd_topics)

display(contingency_table)



ca = prince.CA(
    n_components=6,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='sklearn',
    random_state=42
)
ca = ca.fit(contingency_table)


Unnamed: 0,pk_topics,fk_workshop,fk_categories,categories,meta_category,workshops,values_for_pivot
0,1,20,1,Bewusstseinserweiterung,5,Halluzinogene und veränderte Zustände des Bewu...,1
1,2,20,2,das Unbewusste,5,Halluzinogene und veränderte Zustände des Bewu...,1
2,3,21,1,Bewusstseinserweiterung,5,Die Beziehung zwischen Körper und Seele,1
3,4,21,2,das Unbewusste,5,Die Beziehung zwischen Körper und Seele,1
4,5,27,3,Hochpotenzforschung,6,Homöopathische Medizin Heute,1
...,...,...,...,...,...,...,...
1008,1011,265,34,Musik,4,Morgensingen,1
1009,1012,265,114,Chaostheorie,2,Morgensingen,1
1010,1013,265,104,Träume,5,Morgensingen,1
1011,1014,265,60,Improvisation,7,Morgensingen,1


Unnamed: 0,pk_topics,fk_workshop,fk_categories,categories,meta_category,workshops,values_for_pivot
0,1,20,1,Bewusstseinserweiterung,5,Halluzinogene und veränderte Zustände des Bewu...,1
1,2,20,2,das Unbewusste,5,Halluzinogene und veränderte Zustände des Bewu...,1
2,3,21,1,Bewusstseinserweiterung,5,Die Beziehung zwischen Körper und Seele,1
3,4,21,2,das Unbewusste,5,Die Beziehung zwischen Körper und Seele,1
4,5,27,3,Hochpotenzforschung,6,Homöopathische Medizin Heute,1
...,...,...,...,...,...,...,...
1008,1011,265,34,Musik,4,Morgensingen,1
1009,1012,265,114,Chaostheorie,2,Morgensingen,1
1010,1013,265,104,Träume,5,Morgensingen,1
1011,1014,265,60,Improvisation,7,Morgensingen,1


meta_category,1,2,3,4,5,6,7
fk_workshop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11,0,0,3,0,1,0,0
12,0,0,3,0,0,0,0
13,0,0,1,0,0,0,0
14,2,0,1,0,0,0,0
15,0,0,3,0,0,0,0
...,...,...,...,...,...,...,...
261,1,0,0,1,1,0,2
262,0,3,0,0,1,0,0
263,0,0,1,0,1,2,0
264,0,0,1,0,2,2,2


In [71]:
ca.eigenvalues_summary
ca.plot(
    contingency_table,
    x_component=0,
    y_component=1,
    show_row_markers=True,
    show_column_markers=True,
    show_row_labels=False,
    show_column_labels=False
)

In [77]:
ca.plot(
    contingency_table,
    x_component=0,
    y_component=5,
    show_row_markers=False,
    show_column_markers=False,
    show_row_labels=False,
    show_column_labels=True
)

In [72]:
ca.eigenvalues_summary

Unnamed: 0_level_0,eigenvalue,% of variance,% of variance (cumulative)
component,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.529,26.05%,26.05%
1,0.429,21.11%,47.16%
2,0.371,18.27%,65.44%
3,0.328,16.13%,81.56%
4,0.212,10.42%,91.99%
5,0.163,8.01%,100.00%


In [52]:
ca.row_contributions_.head().style.format('{:.0%}')

Unnamed: 0,0,1,2,3,4,5
11,0%,1%,0%,0%,1%,0%
12,0%,1%,0%,1%,0%,1%
13,0%,0%,0%,0%,0%,0%
14,1%,0%,2%,0%,0%,0%
15,0%,1%,0%,1%,0%,1%


In [75]:
ca.column_contributions_.style.format('{:.0%}')

Unnamed: 0,0,1,2,3,4,5
1,19%,54%,11%,5%,0%,1%
2,37%,10%,35%,8%,0%,0%
3,6%,15%,20%,31%,1%,0%
4,4%,15%,32%,30%,3%,9%
5,13%,1%,0%,7%,43%,14%
6,12%,2%,0%,19%,53%,3%
7,7%,2%,1%,0%,1%,74%
