# World Values Survey Data Notebook - Child Traits

In [2]:
import numpy as np
import pandas as pd
import pyreadr
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Acquiring Data
The data was acquired from the World Values Survey at https://www.worldvaluessurvey.org/wvs.jsp. Wave 1 through 6 were in .rds format while Wave 7 was in .rdata format.

In [4]:
# Use pyreadr to read .rds files to DataFrame
wv1_6 = [ pyreadr.read_r(f'../rdata/wv{i}.rds')[None] for i in range(1, 7) ]

In [5]:
# RData converted from .rdata file to CSV with RStudio
wv7 = pd.read_csv('../rdata/wv7.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


### Traits that should be encouraged in children (Wave 7, Q7-17)
- List of Traits (choose 5): good manners, independence, hard work, feeling of responsibility, imagination, tolerance and respect for other people, thrift/saving money, determination/perseverance, religious faith, not being selfish, obedience

In [6]:
df = wv7[['B_COUNTRY_ALPHA', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17']].copy()

In [7]:
traits_dict = {
    'B_COUNTRY_ALPHA': 'country',
    'Q7': 'manners',
    'Q8': 'independence',
    'Q9': 'hard work',
    'Q10': 'responsibility',
    'Q11': 'imagination',
    'Q12': 'tolerance',
    'Q13': 'thrift',
    'Q14': 'determination',
    'Q15': 'faith',
    'Q16': 'unselfishness',
    'Q17': 'obedience',
}

def convert_bin(x):
    if x == 1:
        return 1
    else:
        return 0

In [8]:
# Clean column names and convert to binary
df.columns = df.columns.map(traits_dict)
df.iloc[:, 1:] = df.iloc[:, 1:].applymap(convert_bin)

In [9]:
# Set data columns
X = df.drop(columns='country')

In [10]:
# Function to create cluster labels
def kmeans_model(n, X):
    model = KMeans(n_clusters=n)
    model.fit(X)
    labels = model.labels_
    return labels

In [11]:
# scores = []
# c_range = range(2, 50)

# for i in c_range:
#     labels = kmeans_model(i, X)
#     score = silhouette_score(X, labels)
#     scores.append(score)

In [12]:
# clusters = list(zip(c_range, scores))
# max(clusters, key=lambda x:x[1]) # n-clusters = 25

In [13]:
labels = kmeans_model(25, X)

In [14]:
df['cluster'] = labels

In [15]:
df.groupby('cluster').mean()

Unnamed: 0_level_0,manners,independence,hard work,responsibility,imagination,tolerance,thrift,determination,faith,unselfishness,obedience
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.870328,0.363715,0.984186,0.901093,0.06728,0.0,0.12881,0.108396,0.876941,0.253594,0.221967
1,0.849766,0.216782,0.484531,0.0,0.1144,0.636546,0.212553,0.0,1.0,0.287113,0.977075
2,0.955602,0.887576,0.0,0.931212,0.134146,0.983422,0.297256,0.310785,0.228659,0.0,0.08346
3,0.144633,0.912241,0.151412,0.541243,0.79435,0.647458,0.327307,0.887759,0.177024,0.095292,0.087006
4,0.90776,0.133602,0.614934,0.681186,1.0,0.407394,0.176794,0.674963,0.074671,0.074305,0.071742
5,0.849193,0.345242,0.806252,0.0,0.196496,0.852284,0.965648,0.102027,0.340089,0.079011,0.162487
6,0.862455,1.0,1.0,0.710966,0.113247,0.927319,0.0,0.066765,0.125502,0.001268,0.055567
7,0.857332,0.077675,0.232497,0.340291,0.056011,0.752708,0.112285,1.0,0.696169,0.146103,0.660238
8,0.988395,0.882012,0.961315,0.982592,0.756286,0.983881,0.944552,0.883946,0.852998,0.940039,0.945841
9,0.955923,0.077824,0.759298,0.475207,0.152893,0.960744,0.194904,0.078512,0.108815,1.0,0.102273


In [316]:
df

Unnamed: 0,country,manners,independence,hard work,responsibility,imagination,tolerance,thrift,determination,faith,unselfishness,obedience,cluster
1,AND,1,1,0,1,1,0,0,0,0,0,1,21
2,AND,1,0,1,1,0,1,0,0,0,0,1,4
3,AND,0,1,0,1,1,1,0,1,0,0,0,0
4,AND,1,0,0,1,0,1,0,1,0,0,1,14
5,AND,1,0,1,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70863,ZWE,1,0,1,0,1,0,0,0,1,0,0,6
70864,ZWE,1,0,1,0,0,1,0,0,1,0,0,13
70865,ZWE,1,0,1,0,0,0,0,1,0,1,1,16
70866,ZWE,1,1,1,0,0,1,0,1,0,0,0,1


In [None]:
# Function to group responses by country
def qbc(df, question, urbrural=True):
    if urbrural == True:
        q = df.groupby(['B_COUNTRY_ALPHA', 'H_URBRURAL'])[question].mean()
    else:
        q = df.groupby(['B_COUNTRY_ALPHA'])[question].mean()
    return pd.DataFrame(q).unstack()