In [1]:
import pandas as pd
import numpy as np
from time import time

import matplotlib.pyplot as plt

from ipywidgets import interact, fixed
from IPython.display import display

import pickle

In [2]:
class color:
    """ANSI escape sequences used to colour and emphasise printed text.

    Concatenate one or more codes in front of a string and finish with
    ``color.END`` to reset the terminal/notebook formatting.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset every colour/attribute set so far
    END = '\033[0m'

In [3]:
# Order in which the ten value scores are stored in each row of the pickled
# score matrix (initialize() looks columns up with categories.index(...)).
categories = [
    'universalism', 'hedonism', 'achievement', 'power', 'self-direction',
    'benevolence', 'conformity', 'tradition', 'stimulation', 'security',
]

# Order in which the ten values are presented (DataFrame columns, result tables).
schwartz = [
    'universalism', 'benevolence', 'conformity', 'tradition', 'security',
    'power', 'achievement', 'hedonism', 'stimulation', 'self-direction',
]

# Higher-order group -> the basic values it contains.
# The key spellings are used verbatim as table labels, so they are left untouched.
schwartz_hier = {
    'self-transcendence': ['universalism', 'benevolence'],
    'conservation': ['conformity', 'tradition', 'security'],
    'self-enhancement': ['power', 'achievement'],
    'hhedonism': ['hedonism'],
    'opennes-to-change': ['stimulation', 'self-direction'],
}

# Basic value -> the higher-order group it belongs to (inverse of schwartz_hier).
schwartz_hier_pos = {
    value: group
    for group, values in schwartz_hier.items()
    for value in values
}

# Higher-order group -> the group paired against it; the hedonism group has none ('').
# NOTE(review): presumably the opposing pole in Schwartz's model - confirm.
_opposing_group = {
    'self-transcendence': 'self-enhancement',
    'conservation': 'opennes-to-change',
    'self-enhancement': 'self-transcendence',
    'hhedonism': '',
    'opennes-to-change': 'conservation',
}

# Basic value -> the group paired against the value's own group.
schwartz_hier_neg = {
    value: _opposing_group[group]
    for value, group in schwartz_hier_pos.items()
}

In [4]:
def read_data(filepath):
    """Load the document table from a JSON file.

    Rows whose 'text' field is the empty string are dropped, the remaining
    rows are ordered by the 'theme.id' column and the index is renumbered
    from 0.

    Parameters
    ----------
    filepath : path (or anything else ``pd.read_json`` accepts)

    Returns
    -------
    pandas.DataFrame
    """
    raw = pd.read_json(filepath)
    has_text = raw['text'] != ""
    return raw[has_text].sort_values('theme.id').reset_index(drop=True)

def initialize(brown_corpus = False):
    """Build the per-document table of Schwartz value scores.

    Reads the documents from 'pruned_schwartz.json' (via read_data) and the
    score matrix from a pickle, reorders the score columns from the
    ``categories`` order into the ``schwartz`` order and attaches the
    documents' 'document.id', 'title' and 'theme' columns.

    Parameters
    ----------
    brown_corpus : bool, default False
        When True the scores come from "loo_all_W_norm_brown.p", otherwise
        from "loo_all_W_norm.p".

    Returns
    -------
    pandas.DataFrame
        Columns 'document.id', 'title', 'theme' followed by one column per
        entry of ``schwartz``. Row i of the score matrix is paired with row i
        of the sorted document table.
        NOTE(review): assumes the pickle was produced in that same document
        order - confirm against the code that wrote it.
    """
    filepath = 'pruned_schwartz.json'
    data = read_data(filepath)

    weights_path = "loo_all_W_norm_brown.p" if brown_corpus else "loo_all_W_norm.p"
    # SECURITY: unpickling can execute arbitrary code - only load pickles from a trusted source.
    # The context manager closes the file handle, which the bare open() call leaked.
    with open(weights_path, "rb") as weights_file:
        all_W_norm = np.asarray(pickle.load(weights_file))

    # Resolve each value's position in the stored layout once instead of
    # repeating the linear categories.index() search for every document.
    column_order = [categories.index(sch) for sch in schwartz]
    schwartz_dist = np.asarray(
        [[doc_scores[col] for col in column_order] for doc_scores in all_W_norm]
    )

    df = pd.DataFrame(data=schwartz_dist, index=range(len(schwartz_dist)), columns=schwartz)
    # Assignment aligns on the index: both frames are numbered 0..n-1.
    df['document.id'] = data['document.id']
    df['title'] = data['title']
    df['theme'] = data['theme']

    # Move the three descriptive columns (appended last) to the front.
    cols = df.columns.tolist()
    cols = cols[-3:] + cols[:-3]
    df = df[cols]

    return df

In [5]:
def _new_count_table(row_keys, col_keys):
    """Return {row: {col: [count, percentage]}} with all cells zeroed and a '_doc_count' column added."""
    columns = list(col_keys) + ['_doc_count']
    return {row: {col: [0, 0] for col in columns} for row in row_keys}


def _fill_percentages(table):
    """Write count / row '_doc_count' (rounded to 2 decimals) into slot 1 of every cell, in place."""
    for cells in table.values():
        doc_count = cells['_doc_count'][0]
        for cell in cells.values():
            # A row without any document has nothing to divide by: report 0.0 instead of raising.
            cell[1] = np.round(cell[0] / doc_count, 2) if doc_count else 0.0


def make_analysis():
    """Aggregate the per-document max/min themes of ``df2`` into count tables.

    Reads these notebook globals: ``df2`` (with the columns
    'max_themes_th<max_th>', 'min_themes_th<min_th>' and
    'theme_threshold_th<theme_th>'), ``max_th``, ``min_th``, ``theme_th``,
    ``schwartz``, ``schwartz_hier`` and ``schwartz_hier_pos``.

    Returns
    -------
    tuple
        (groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier,
        hierGroupMaxHier, hierGroupMinHier, theme_threshold_count)

        The first six are nested dicts ``{row: {column: [count, percentage]}}``:

        * group*Theme     - rows: document theme, columns: found theme
        * group*Hier      - rows: document theme, columns: higher-order group of the found theme
        * hierGroup*Hier  - rows: higher-order group of the document theme,
                            columns: higher-order group of the found theme

        Every row also has a '_doc_count' column holding the number of
        documents in that row; percentage = count / '_doc_count', rounded to
        two decimals (0.0 for a row without documents).

        ``theme_threshold_count`` is a DataFrame indexed by theme with the
        columns '_doc_count', 'theme_count' (documents whose threshold flag is
        True) and 'theme_percentage'.
    """
    high_themes = list(schwartz_hier.keys())

    groupMaxTheme = _new_count_table(schwartz, schwartz)
    groupMinTheme = _new_count_table(schwartz, schwartz)
    groupMaxHier = _new_count_table(schwartz, high_themes)
    groupMinHier = _new_count_table(schwartz, high_themes)
    hierGroupMaxHier = _new_count_table(high_themes, high_themes)
    hierGroupMinHier = _new_count_table(high_themes, high_themes)

    threshold_col = 'theme_threshold_th' + str(theme_th)
    # (df2 column holding the found themes, and the three tables it feeds)
    extremes = (
        ('max_themes_th' + str(max_th), groupMaxTheme, groupMaxHier, hierGroupMaxHier),
        ('min_themes_th' + str(min_th), groupMinTheme, groupMinHier, hierGroupMinHier),
    )

    for _, row in df2.iterrows():
        theme = row['theme']
        high_theme = schwartz_hier_pos[theme]

        for column, by_theme, by_theme_high, by_high in extremes:
            by_theme[theme]['_doc_count'][0] += 1
            by_theme_high[theme]['_doc_count'][0] += 1
            by_high[high_theme]['_doc_count'][0] += 1

            # Each entry is a (theme name, score) pair; only the name is counted.
            for found in row[column]:
                found_theme = found[0]
                found_high = schwartz_hier_pos[found_theme]
                by_theme[theme][found_theme][0] += 1
                by_theme_high[theme][found_high][0] += 1
                by_high[high_theme][found_high][0] += 1

    # Pick the column before aggregating so the title strings and the list
    # columns are never summed, and take both series from df2 so the totals
    # describe the same frame as every other count in this function.
    per_theme = df2.groupby('theme')
    theme_threshold_count = pd.concat(
        [
            per_theme['title'].count().rename('_doc_count'),
            per_theme[threshold_col].sum().astype(int).rename('theme_count'),
        ],
        axis=1,
    )

    for table in (groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier,
                  hierGroupMaxHier, hierGroupMinHier):
        _fill_percentages(table)

    theme_threshold_count['theme_percentage'] = (
        theme_threshold_count['theme_count'] / theme_threshold_count['_doc_count']
    ).round(2)

    return groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier, hierGroupMaxHier, hierGroupMinHier, theme_threshold_count

def deprecated_analysis():
    """Count-only variant of the analysis (cells hold plain ints, no percentages).

    Reads the notebook globals ``df``, ``df2``, ``max_th``, ``min_th``,
    ``theme_th``, ``schwartz``, ``schwartz_hier`` and ``schwartz_hier_pos``.

    Returns
    -------
    tuple
        (groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier,
        hierGroupMaxHier, hierGroupMinHier, theme_threshold_count).
        The six dicts map row -> column -> count and carry an extra
        '_doc_count' column per row; ``theme_threshold_count`` is a DataFrame
        with the columns '_doc_count' and 'theme_count'.
    """
    high_themes = list(schwartz_hier.keys())

    def zeroed(rows, cols):
        # One independent counter dict per row, plus the '_doc_count' slot.
        return {r: {c: 0 for c in cols + ['_doc_count']} for r in rows}

    groupMaxTheme = zeroed(schwartz, schwartz)
    groupMinTheme = zeroed(schwartz, schwartz)
    groupMaxHier = zeroed(schwartz, high_themes)
    groupMinHier = zeroed(schwartz, high_themes)
    hierGroupMaxHier = zeroed(high_themes, high_themes)
    hierGroupMinHier = zeroed(high_themes, high_themes)

    for _, doc in df2.iterrows():
        own = doc['theme']
        own_high = schwartz_hier_pos[own]

        # Every document counts once towards its row in each of the six tables.
        for table in (groupMaxTheme, groupMaxHier, groupMinTheme, groupMinHier):
            table[own]['_doc_count'] += 1
        for table in (hierGroupMaxHier, hierGroupMinHier):
            table[own_high]['_doc_count'] += 1

        # Entries are (theme name, score) pairs; only the name is used.
        for hit in doc['max_themes_th'+str(max_th)]:
            hit_high = schwartz_hier_pos[hit[0]]
            groupMaxTheme[own][hit[0]] += 1
            groupMaxHier[own][hit_high] += 1
            hierGroupMaxHier[own_high][hit_high] += 1

        for hit in doc['min_themes_th'+str(min_th)]:
            hit_high = schwartz_hier_pos[hit[0]]
            groupMinTheme[own][hit[0]] += 1
            groupMinHier[own][hit_high] += 1
            hierGroupMinHier[own_high][hit_high] += 1

    flagged = df2.groupby('theme').sum()['theme_threshold_th'+str(theme_th)].astype(int).rename('theme_count')
    totals = df.groupby('theme').count()['title'].rename('_doc_count')
    theme_threshold_count = pd.concat([totals, flagged], axis=1)

    return groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier, hierGroupMaxHier, hierGroupMinHier, theme_threshold_count

In [25]:
def show_results(res, groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier, hierGroupMaxHier, hierGroupMinHier, theme_threshold_count):
    """Print a legend, then describe and display the result table chosen by ``res``.

    Parameters
    ----------
    res : int
        0..5 select, in order, groupMaxTheme, groupMinTheme, groupMaxHier,
        groupMinHier, hierGroupMaxHier and hierGroupMinHier (each shown
        transposed); any other value shows ``theme_threshold_count``.
    groupMaxTheme ... hierGroupMinHier : dict
        Nested count tables accepted by ``pd.DataFrame``.
    theme_threshold_count : DataFrame-like

    The threshold printed with each view is read from the notebook globals
    ``max_th``, ``min_th`` or ``theme_th``; ``schwartz_hier`` supplies the legend.
    """
    def explain(text):
        print(color.BOLD + color.PURPLE + text + color.END)

    # --- legend, identical for every view ---
    for line in ("Themes are the Schwartz basic human values.",
                 "HighThemes are higher order Schwartz basic human value groups."):
        print(color.BLUE + line + color.END)
    print()
    print(color.BLUE + "Lists in the tables' cells represents: " + color.BOLD + "[Counts, Percentage]" + color.END)

    print()
    for high_theme, members in schwartz_hier.items():
        print(color.BOLD + high_theme + color.END + ": " + str(members))
    print()
    print(60*"*")
    print()

    # --- sentences shared between views ---
    near_max_scores = "For each document, finds themes that have higher scores than document's max theme score minus threshold."
    near_max_score = "For each document, finds themes that have higher score than document's max theme score minus threshold."
    near_min = "For each document, finds themes that have lower scores than document's min theme score plus threshold."
    found_by_high = "Then, group and count found themes according to their HighThemes."
    docs_by_theme = "Then, group and count documents according to their Themes."
    docs_by_high = "Then, group and count documents according to their HighThemes."

    # (res value, heading, explanation lines, uses max_th?, table)
    views = (
        (0, "Group Max Theme", (near_max_scores, docs_by_theme), True, groupMaxTheme),
        (1, "Group Min Theme", (near_min, docs_by_theme), False, groupMinTheme),
        (2, "Group Max HighTheme", (near_max_score, found_by_high, docs_by_theme), True, groupMaxHier),
        (3, "Group Min HighTheme", (near_min, found_by_high, docs_by_theme), False, groupMinHier),
        (4, "HighTheme Group Max HighTheme", (near_max_score, found_by_high, docs_by_high), True, hierGroupMaxHier),
        (5, "HighTheme Group Min HighTheme", (near_min, found_by_high, docs_by_high), False, hierGroupMinHier),
    )
    chosen = next((view for view in views if res == view[0]), None)

    if chosen is None:
        heading = "Theme Threshold Count"
        notes = ("For each document, finds documents that have higher its own theme score than the threshold.",
                 docs_by_theme)
    else:
        _, heading, notes, uses_max, table = chosen

    print(color.BOLD + heading + color.END)
    print()
    for text in notes:
        explain(text)

    print()
    if chosen is None:
        print(color.BOLD + color.GREEN + "Theme Threshold (theme_th) = " + str(theme_th) + color.END)
        display(pd.DataFrame(theme_threshold_count))
    else:
        if uses_max:
            print(color.BOLD + color.GREEN + "Threshold (max_th) = " + str(max_th) + color.END)
        else:
            print(color.BOLD + color.GREEN + "Threshold (min_th) = " + str(min_th) + color.END)
        # The dicts are keyed row -> column, so transpose to get rows down the side.
        display(pd.DataFrame(table).transpose())
        

In [7]:
def rowMaxThemes(row, th):
    """List the themes whose score lies within ``th`` of the row's highest score.

    Parameters
    ----------
    row : pandas.Series
        The first three entries are skipped; every remaining entry is
        converted to float64 and treated as a theme score.
    th : number
        Width of the window below the maximum (bounds inclusive).

    Returns
    -------
    list of (label, score) tuples, in the row's original order.
    """
    scores = row.iloc[3:].astype(np.float64)
    top = scores.max()
    near_top = scores[scores.between(top - th, top)]

    return list(zip(near_top.index, near_top.to_numpy()))

def rowMinThemes(row, th):
    """List the themes whose score lies within ``th`` of the row's lowest score.

    Parameters
    ----------
    row : pandas.Series
        The first three entries are skipped; every remaining entry is
        converted to float64 and treated as a theme score.
    th : number
        Width of the window above the minimum (bounds inclusive).

    Returns
    -------
    list of (label, score) tuples, in the row's original order.
    """
    scores = row.iloc[3:].astype(np.float64)
    bottom = scores.min()
    near_bottom = scores[scores.between(bottom, bottom + th)]

    return list(zip(near_bottom.index, near_bottom.to_numpy()))

def themeThreshold(row, th):
    """Tell whether a document's score for its own theme reaches ``th``.

    Parameters
    ----------
    row : pandas.Series
        The third entry (position 2) names the document's theme, and the row
        holds a score under that same name.
    th : number
        Minimum score (inclusive).

    Returns
    -------
    bool
        True when ``row[<theme>] >= th``, otherwise False.
    """
    # Use .iloc for the positional lookup: a bare row[2] on a label-indexed
    # Series depends on pandas' deprecated integer-to-position fallback.
    theme = str(row.iloc[2])
    return bool(row[theme] >= th)

## Initialize

In [19]:
# Build the document/score table; brown_corpus=False selects the "loo_all_W_norm.p" scores.
df = initialize(brown_corpus = False)
df.head()

Unnamed: 0,document.id,title,theme,universalism,benevolence,conformity,tradition,security,power,achievement,hedonism,stimulation,self-direction
0,1,Critical thinking,universalism,49.52267,47.604883,50.989633,39.579652,28.752928,30.702778,61.13089,7.447772,0.248711,51.70662
1,221,Social work,universalism,73.956331,24.425526,12.386535,3.048106,66.451233,27.560275,59.86546,10.737655,20.006899,19.260099
2,222,Labor rights,universalism,76.822639,62.819241,6.305191,0.00012,42.287528,45.281959,46.115006,0.196758,86.278109,26.479325
3,223,Left-wing politics,universalism,66.274238,13.560333,0.512608,22.805877,16.102093,81.30109,39.825376,6.82877,29.786765,50.355647
4,224,Climate justice,universalism,96.270829,0.121169,0.066082,1.235794,48.480765,0.5928,57.491868,0.793439,28.115263,20.107254


## Analysis

Adjust the threshold values below. Each threshold is explained in the interactive output at the end of the notebook.

In [20]:
# Thresholds. make_analysis() and show_results() read these globals by name,
# and each value is also baked into the name of the column derived from it.
max_th = 5      # window below a document's highest score
min_th = 5      # window above a document's lowest score
theme_th = 80   # minimum score a document needs for its own theme

# Work on a copy so df keeps only the raw scores; the row functions are
# applied to df, so the derived columns never feed back into each other.
df2 = df.copy()
df2['max_themes_th'+str(max_th)] = df.apply(rowMaxThemes, axis=1, args=(max_th,))
df2['min_themes_th'+str(min_th)] = df.apply(rowMinThemes, axis=1, args=(min_th,))
df2['theme_threshold_th'+str(theme_th)] = df.apply(themeThreshold, axis=1, args=(theme_th,))

In [21]:
# Show the scores together with the three derived threshold columns.
df2

Unnamed: 0,document.id,title,theme,universalism,benevolence,conformity,tradition,security,power,achievement,hedonism,stimulation,self-direction,max_themes_th5,min_themes_th5,theme_threshold_th80
0,1,Critical thinking,universalism,49.522670,47.604883,50.989633,39.579652,28.752928,30.702778,61.130890,7.447772,0.248711,51.706620,"[(achievement, 61.130889817466915)]","[(stimulation, 0.24871079848182515)]",False
1,221,Social work,universalism,73.956331,24.425526,12.386535,3.048106,66.451233,27.560275,59.865460,10.737655,20.006899,19.260099,"[(universalism, 73.95633053412107)]","[(tradition, 3.0481061974002466)]",False
2,222,Labor rights,universalism,76.822639,62.819241,6.305191,0.000120,42.287528,45.281959,46.115006,0.196758,86.278109,26.479325,"[(stimulation, 86.27810920333991)]","[(tradition, 0.00011991054647983188), (hedonis...",False
3,223,Left-wing politics,universalism,66.274238,13.560333,0.512608,22.805877,16.102093,81.301090,39.825376,6.828770,29.786765,50.355647,"[(power, 81.30108985872236)]","[(conformity, 0.5126084542927725)]",False
4,224,Climate justice,universalism,96.270829,0.121169,0.066082,1.235794,48.480765,0.592800,57.491868,0.793439,28.115263,20.107254,"[(universalism, 96.27082914850236)]","[(benevolence, 0.12116926539076914), (conformi...",True
5,225,Labour law,universalism,68.391937,12.125260,22.560340,12.962435,55.590560,43.972425,71.510394,4.198938,23.552645,23.236121,"[(universalism, 68.39193651011864), (achieveme...","[(hedonism, 4.198938307948906)]",False
6,226,Right to housing,universalism,80.427772,7.550699,73.616564,0.003292,82.750115,31.816324,51.850497,0.000159,87.574089,0.146260,"[(security, 82.75011516567835), (stimulation, ...","[(tradition, 0.0032923163340555475), (hedonism...",True
7,227,Social law,universalism,75.720724,26.056795,17.685650,0.229041,54.938594,5.781897,74.721615,9.807221,8.630734,47.604617,"[(universalism, 75.72072438930273), (achieveme...","[(tradition, 0.22904120901640293)]",False
8,228,Solidarity,universalism,32.287511,66.735757,62.146231,30.092037,74.499337,66.413071,60.312410,17.557381,61.779806,5.848783,"[(security, 74.49933674122067)]","[(self-direction, 5.848782511882321)]",False
9,229,Essentially contested concept,universalism,35.662750,85.184936,41.548259,50.947351,30.489266,40.418570,42.291413,39.418567,1.529226,15.101290,"[(benevolence, 85.1849357723987)]","[(stimulation, 1.5292260146643708)]",False


In [22]:
# make_analysis() takes no arguments: it reads df2 and the thresholds (max_th, min_th, theme_th)
# from the notebook namespace, so the threshold cell must have been run first.
groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier, hierGroupMaxHier, hierGroupMinHier, theme_threshold_count = make_analysis()

In [26]:
# `res` is given as (min, max, step) = (0, 6, 1) and selects the table show_results renders:
# 0-5 pick the six count tables, 6 falls through to the theme-threshold table.
# The result tables are wrapped in fixed() so they are passed through as-is.
interact(show_results,
         res = (0, 6, 1),
         groupMaxTheme=fixed(groupMaxTheme),
         groupMinTheme=fixed(groupMinTheme),
         groupMaxHier=fixed(groupMaxHier),
         groupMinHier=fixed(groupMinHier),
         hierGroupMaxHier=fixed(hierGroupMaxHier),
         hierGroupMinHier=fixed(hierGroupMinHier),
         theme_threshold_count=fixed(theme_threshold_count))

<function __main__.show_results>