In [92]:
import numpy as np
import pandas as pd
import pickle
import networkx as nx
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import FeatureAgglomeration
from sklearn.metrics import silhouette_score

from itertools import combinations

In [2]:
from world_value_functions.wv_functions import apriori

In [3]:
# Load the pre-built survey DataFrame from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open('wv7_childhood_df.p', 'rb') as f:
    df = pickle.load(f)

In [4]:
# Remove the 'traits' column before analysis. Reassigning (rather than using
# inplace=True) together with errors='ignore' keeps this cell idempotent:
# re-running it no longer raises KeyError once the column is already gone.
df = df.drop(columns='traits', errors='ignore')

In [5]:
# Every column except the 'country' identifier is treated as a trait.
traits = [column for column in df.columns if column != 'country']

In [6]:
# All unordered pairs of traits, each stored as a two-element list.
combos = list(map(list, combinations(traits, 2)))

In [33]:
# Pair every other trait (A) with 'manners' (B) over all countries; the
# resulting table reports the "A and B" and "A given B" proportions.
# NOTE(review): apriori is a project helper; its exact semantics are not
# visible from this notebook -- confirm against world_value_functions.
apriori(df, country='ALL', selected_traits=['manners'])

Unnamed: 0,A,B,A and B Proportion,A given B Proportion
0,independence,manners,0.3429,0.4197
1,hard work,manners,0.4423,0.5414
2,responsibility,manners,0.5624,0.6885
3,imagination,manners,0.1495,0.183
4,tolerance,manners,0.5335,0.6531
5,thrift,manners,0.2311,0.2829
6,determination,manners,0.2359,0.2888
7,faith,manners,0.3066,0.3753
8,unselfishness,manners,0.2013,0.2464
9,obedience,manners,0.2623,0.321


In [36]:
def a_given_b(df, a=None, b=None):
    """Return the share of rows where trait ``a`` is True among rows where
    every trait in ``b`` is True (an empirical conditional proportion).

    Parameters
    ----------
    df : pandas.DataFrame
        One column per trait; a row counts as selecting a trait when the
        cell compares equal to ``True``.
    a : str
        Column name of the target trait. Required in practice: the ``None``
        default is kept only for signature compatibility and is not a valid
        column lookup.
    b : str or iterable of str, optional
        Conditioning trait(s). ``None`` or an empty iterable means "no
        condition", giving the overall proportion of ``a``. A single column
        name may be passed as a bare string.

    Returns
    -------
    float
        Proportion in ``[0, 1]``, or ``nan`` when no row satisfies all the
        conditions in ``b`` (the proportion is undefined in that case).
    """
    # Normalise ``b``: avoid a shared mutable default and do not iterate a
    # bare string character by character.
    if b is None:
        conditions = []
    elif isinstance(b, str):
        conditions = [b]
    else:
        conditions = list(b)

    subset = df
    for trait in conditions:
        subset = subset.loc[subset[trait].eq(True)]

    total = len(subset)
    if total == 0:
        # No row matches the conditions; report "undefined" instead of
        # raising ZeroDivisionError.
        return float('nan')

    matches = len(subset.loc[subset[a].eq(True)])
    return matches / total

In [85]:
def km_model(X, n, random_state=0):
    """Fit K-Means with ``n`` clusters on ``X`` and score the partition.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix passed unchanged to ``KMeans.fit`` and
        ``silhouette_score``.
    n : int
        Number of clusters.
    random_state : int or None, default 0
        Seed for the K-Means centroid initialisation. A fixed default makes
        the scores and labels reproducible across kernel restarts; pass
        ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    (float, ndarray)
        Silhouette score of the fitted labels, and the label array.
    """
    model = KMeans(n_clusters=n, random_state=random_state)
    model.fit(X)
    labels = model.labels_
    score = silhouette_score(X, labels)
    return score, labels

In [100]:
def ac_model(X, n):
    """Cluster ``X`` into ``n`` groups with agglomerative clustering.

    Returns a ``(silhouette_score, labels)`` tuple for the fitted partition.
    """
    clusterer = AgglomerativeClustering(n_clusters=n).fit(X)
    assignments = clusterer.labels_
    return silhouette_score(X, assignments), assignments

In [164]:
# Collapse the data to one row per country: the mean of each trait column
# within that country. This is the feature matrix used for clustering.
X = df.groupby('country').mean()

In [165]:
# Sweep the K-Means cluster count from 2 to 14 and print each silhouette score.
# A 'label' column (added to X further down) is excluded if present, so
# re-running this cell later does not feed old cluster ids back in as a feature.
sweep_features = X.drop(columns='label', errors='ignore')
for n in range(2, 15):
    score, _ = km_model(sweep_features, n)
    print(f'{n} Clusters: {round(score, 4)}')

2 Clusters: 0.2825
3 Clusters: 0.2162
4 Clusters: 0.2454
5 Clusters: 0.2471
6 Clusters: 0.2128
7 Clusters: 0.2514
8 Clusters: 0.2467
9 Clusters: 0.2642
10 Clusters: 0.2613
11 Clusters: 0.25
12 Clusters: 0.2198
13 Clusters: 0.2513
14 Clusters: 0.2119


In [166]:
# Fit the final 7-cluster K-Means model and attach each country's cluster id.
# - random_state pins the initialisation so the labels are reproducible.
# - Any existing 'label' column is excluded from the fit, so re-running this
#   cell does not cluster on the previous run's labels.
model = KMeans(n_clusters=7, random_state=0)
model.fit(X.drop(columns='label', errors='ignore'))
labels = model.labels_
X['label'] = labels

In [167]:
# Display the per-country trait means together with the assigned cluster label.
X

Unnamed: 0_level_0,manners,independence,hard work,responsibility,imagination,tolerance,thrift,determination,faith,unselfishness,obedience,label
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AND,0.844093,0.482339,0.437272,0.752741,0.302071,0.714982,0.274056,0.321559,0.082826,0.44458,0.343484,4
ARG,0.795683,0.368345,0.61295,0.752518,0.238849,0.709353,0.214388,0.346763,0.192806,0.47482,0.293525,4
AUS,0.836007,0.525847,0.442662,0.579323,0.358883,0.844326,0.211527,0.455734,0.145573,0.416518,0.183601,4
BGD,0.984615,0.325641,0.524786,0.680342,0.102564,0.711966,0.238462,0.111966,0.847009,0.326496,0.146154,6
BOL,0.871579,0.242807,0.49193,0.762105,0.16,0.811228,0.267368,0.209123,0.45193,0.237193,0.494737,2
BRA,0.780195,0.291823,0.592648,0.773443,0.177044,0.687172,0.204051,0.269317,0.411853,0.348087,0.464366,2
CHL,0.841818,0.365455,0.336364,0.730909,0.241818,0.714545,0.26,0.429091,0.129091,0.470909,0.48,4
CHN,0.854437,0.792354,0.729919,0.803227,0.225886,0.608558,0.403016,0.221326,0.013679,0.290775,0.056822,1
COL,0.92607,0.315175,0.258366,0.754086,0.206226,0.817899,0.262257,0.187549,0.505058,0.252918,0.514397,2
CYP,0.89879,0.290429,0.756876,0.738174,0.160616,0.767877,0.229923,0.334433,0.313531,0.271727,0.237624,6


In [175]:
# Deviation of each cluster's mean trait value from the mean over all rows of df.
# The string 'country' column is dropped explicitly (a mean over it raises on
# pandas >= 2.0), and subtracting a Series aligns on trait names instead of
# relying on the positional column order that np.array(...) required.
X.groupby('label').mean() - df.drop(columns='country').mean()

Unnamed: 0_level_0,manners,independence,hard work,responsibility,imagination,tolerance,thrift,determination,faith,unselfishness,obedience
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,-0.388093,-0.068648,-0.013323,-0.091086,0.155004,-0.128192,0.078615,0.167196,0.163182,0.033063,0.092281
1,-0.005953,0.233649,-0.019228,0.139662,0.092379,0.019689,0.106357,0.0846,-0.3024,-0.100063,-0.248692
2,0.059365,-0.133825,-0.131048,0.037841,-0.03609,0.059757,-0.028189,-0.111916,0.071997,0.015135,0.196974
3,-0.103256,-0.000202,0.226906,0.013406,-0.018623,-0.057938,0.079639,0.134838,-0.152274,-0.039256,-0.083239
4,-0.002312,0.010902,-0.077247,-0.019827,0.076861,0.051621,-0.051805,0.087738,-0.255803,0.161297,0.018576
5,0.064538,-0.172123,0.031656,-0.182579,-0.087619,-0.001577,-0.117107,-0.120378,0.323888,0.021396,0.239905
6,0.071287,-0.021571,0.029578,0.016642,-0.056207,-0.017601,-0.010575,-0.064141,0.189561,-0.051022,-0.085949


In [176]:
# Summary statistics of the per-country trait means. The 'label' column is a
# cluster id, so its mean/std/quantiles in this table are not meaningful.
X.describe()

Unnamed: 0,manners,independence,hard work,responsibility,imagination,tolerance,thrift,determination,faith,unselfishness,obedience,label
count,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0,49.0
mean,0.810451,0.42405,0.557207,0.694282,0.211651,0.664291,0.310885,0.340451,0.37952,0.271099,0.336112,3.326531
std,0.147247,0.148056,0.162764,0.117979,0.103247,0.106372,0.100376,0.133886,0.24357,0.108491,0.185065,1.908181
min,0.04878,0.115711,0.25058,0.395901,0.031847,0.414781,0.141146,0.053079,0.013679,0.040161,0.027842,0.0
25%,0.784493,0.317365,0.437272,0.651584,0.137874,0.604784,0.238462,0.227,0.129091,0.191138,0.163439,2.0
50%,0.846094,0.402439,0.548736,0.724238,0.177044,0.682609,0.285922,0.327793,0.411853,0.26412,0.343964,3.0
75%,0.89386,0.525847,0.701135,0.761021,0.264151,0.735013,0.377227,0.434783,0.548077,0.344546,0.498471,5.0
max,0.984615,0.792354,0.833914,0.888273,0.523695,0.868712,0.54878,0.635731,0.847009,0.47482,0.645354,6.0


In [170]:
# Number of countries assigned to each cluster.
X['label'].value_counts()

6    9
2    9
5    7
4    7
3    7
1    7
0    3
Name: label, dtype: int64

In [171]:
# Two-column frame (country, label): promote the 'country' index of the label
# Series back to an ordinary column.
data = X['label'].reset_index()

In [172]:
# Display the country -> cluster label table.
data

Unnamed: 0,country,label
0,AND,4
1,ARG,4
2,AUS,4
3,BGD,6
4,BOL,2
5,BRA,2
6,CHL,4
7,CHN,1
8,COL,2
9,CYP,6
