In [1]:
import pandas as pd
import numpy as np
from time import time

import matplotlib.pyplot as plt

from ipywidgets import interact, fixed
from IPython.display import display

import pickle

In [2]:
class color:
    """ANSI escape sequences used to colour and emphasise text sent to ``print``."""

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Appended after a styled string to switch the styling off again
    END = '\033[0m'

    # Foreground colours
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'

In [3]:
# Value names in the order used to index each loaded score row (see initialize).
categories = [
    'universalism', 'hedonism', 'achievement', 'power', 'self-direction',
    'benevolence', 'conformity', 'tradition', 'stimulation', 'security',
]

# The same ten value names in the order used for the DataFrame columns and tables.
schwartz = [
    'universalism', 'benevolence',
    'conformity', 'tradition', 'security',
    'power', 'achievement',
    'hedonism',
    'stimulation', 'self-direction',
]

# Higher-order group -> the values it contains.
schwartz_hier = {
    'self-transcendence': ['universalism', 'benevolence'],
    'conservation': ['conformity', 'tradition', 'security'],
    'self-enhancement': ['power', 'achievement'],
    'hhedonism': ['hedonism'],
    'opennes-to-change': ['stimulation', 'self-direction'],
}

# Value -> the higher-order group it belongs to (the inverse of schwartz_hier).
schwartz_hier_pos = {
    value: group
    for group, members in schwartz_hier.items()
    for value in members
}

# Value -> the higher-order group paired against its own group.
# 'hedonism' has no counterpart and maps to the empty string.
schwartz_hier_neg = {
    # self-transcendence values
    'universalism': 'self-enhancement',
    'benevolence': 'self-enhancement',
    # conservation values
    'conformity': 'opennes-to-change',
    'tradition': 'opennes-to-change',
    'security': 'opennes-to-change',
    # self-enhancement values
    'power': 'self-transcendence',
    'achievement': 'self-transcendence',
    # hedonism stands alone
    'hedonism': '',
    # openness-to-change values
    'stimulation': 'conservation',
    'self-direction': 'conservation',
}

In [24]:
def read_data(filepath):
    """Load the document table stored as JSON at ``filepath``.

    Rows whose ``text`` field is an empty string are dropped, the remaining
    rows are ordered by ``theme.id`` and the index is renumbered from zero.

    Returns:
        pandas.DataFrame: the filtered, sorted and re-indexed table.
    """
    documents = pd.read_json(filepath)
    has_text = documents['text'] != ""
    ordered = documents[has_text].sort_values('theme.id')
    return ordered.reset_index(drop=True)

def initialize(brown_corpus = False, lemmatized = True):
    """Build the per-document score table used by the rest of the notebook.

    Reads the documents from 'pruned_schwartz.json' via ``read_data`` and one
    pickled score matrix from the working directory, reorders every score row
    from the ``categories`` order into the ``schwartz`` order, and attaches the
    document metadata.

    Args:
        brown_corpus: when true, load "loo_all_W_norm_brown.p"; this takes
            precedence over ``lemmatized``.
        lemmatized: when ``brown_corpus`` is false, choose between
            "loo_all_W_norm_lem.p" (true) and "loo_all_W_norm.p" (false).

    Returns:
        pandas.DataFrame: columns 'document.id', 'title', 'theme' followed by
        one score column per entry of ``schwartz``; one row per score row.
    """
    filepath = 'pruned_schwartz.json'
    data = read_data(filepath)

    if brown_corpus:
        weights_path = "loo_all_W_norm_brown.p"
    elif lemmatized:
        weights_path = "loo_all_W_norm_lem.p"
    else:
        weights_path = "loo_all_W_norm.p"

    # NOTE(security): unpickling can execute arbitrary code -- only load files you trust.
    # The context manager closes the handle, which the bare open() call never did.
    with open(weights_path, "rb") as weights_file:
        all_W_norm = np.asarray(pickle.load(weights_file))

    # Position of each `schwartz` value inside a stored row; computed once
    # instead of once per document.
    source_positions = [categories.index(sch) for sch in schwartz]
    schwartz_dist = np.asarray(
        [[doc_scores[pos] for pos in source_positions] for doc_scores in all_W_norm]
    )

    df = pd.DataFrame(data=schwartz_dist, index=range(len(schwartz_dist)), columns=schwartz)

    # Metadata is matched to the score rows by index label (0..n-1 on both sides).
    meta_columns = ['document.id', 'title', 'theme']
    for name in meta_columns:
        df[name] = data[name]

    # Metadata first, scores after.
    return df[meta_columns + schwartz]

In [25]:
def _count_table(row_keys, col_keys):
    """Return ``{row: {col: [0, 0]}}`` for every row/col pair plus a '_doc_count' column."""
    columns = list(col_keys) + ['_doc_count']
    return {r: {c: [0, 0] for c in columns} for r in row_keys}


def _fill_ratios(table):
    """Write count / row '_doc_count' (rounded to 2 places) into slot 1 of every cell."""
    for cells in table.values():
        total = cells['_doc_count'][0]
        for cell in cells.values():
            # A row without any document would otherwise raise ZeroDivisionError.
            cell[1] = np.round(cell[0] / total, 2) if total else 0.0


def make_analysis(df2, max_th, min_th, theme_th):
    """Aggregate the per-document theme selections into count/ratio tables.

    Args:
        df2: DataFrame with the columns 'theme' and 'title' plus the derived
            columns 'max_themes_th<max_th>', 'min_themes_th<min_th>' (iterables
            whose items carry a theme name at index 0) and
            'theme_threshold_th<theme_th>' (summed per theme).
        max_th, min_th, theme_th: thresholds; only used to build the names of
            the derived columns read from ``df2``.

    Returns:
        A 7-tuple ``(groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier,
        hierGroupMaxHier, hierGroupMinHier, theme_threshold_count)``.
        The first six are nested dicts ``{row: {column: [count, ratio]}}``
        where ``ratio`` is ``count`` divided by the row's '_doc_count' count:
        rows are themes (first four) or higher-order groups (last two), columns
        are themes (first two) or higher-order groups (rest).
        ``theme_threshold_count`` is a DataFrame indexed by theme with the
        columns '_doc_count', 'theme_count' and 'theme_percentage'.
    """
    hier_names = list(schwartz_hier.keys())

    groupMaxTheme = _count_table(schwartz, schwartz)
    groupMinTheme = _count_table(schwartz, schwartz)
    groupMaxHier = _count_table(schwartz, hier_names)
    groupMinHier = _count_table(schwartz, hier_names)
    hierGroupMaxHier = _count_table(hier_names, hier_names)
    hierGroupMinHier = _count_table(hier_names, hier_names)

    max_col = 'max_themes_th' + str(max_th)
    min_col = 'min_themes_th' + str(min_th)
    threshold_col = 'theme_threshold_th' + str(theme_th)

    for _, row in df2.iterrows():
        theme = row['theme']
        high_theme = schwartz_hier_pos[theme]

        # Every document counts once towards its own theme / higher-order group.
        for table in (groupMaxTheme, groupMaxHier, groupMinTheme, groupMinHier):
            table[theme]['_doc_count'][0] += 1
        hierGroupMaxHier[high_theme]['_doc_count'][0] += 1
        hierGroupMinHier[high_theme]['_doc_count'][0] += 1

        for mx_theme in row[max_col]:
            found = mx_theme[0]
            found_high = schwartz_hier_pos[found]
            groupMaxTheme[theme][found][0] += 1
            groupMaxHier[theme][found_high][0] += 1
            hierGroupMaxHier[high_theme][found_high][0] += 1

        for mn_theme in row[min_col]:
            found = mn_theme[0]
            found_high = schwartz_hier_pos[found]
            groupMinTheme[theme][found][0] += 1
            groupMinHier[theme][found_high][0] += 1
            hierGroupMinHier[high_theme][found_high][0] += 1

    # Select the column before aggregating so list/string columns are never summed,
    # and read from the df2 argument rather than the notebook-level `df`.
    by_theme = df2.groupby('theme')
    theme_threshold_count = pd.concat(
        [
            by_theme['title'].count().rename('_doc_count'),
            by_theme[threshold_col].sum().astype(int).rename('theme_count'),
        ],
        axis=1,
    )

    for table in (groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier,
                  hierGroupMaxHier, hierGroupMinHier):
        _fill_ratios(table)

    theme_threshold_count['theme_percentage'] = (
        theme_threshold_count['theme_count'] / theme_threshold_count['_doc_count']
    ).round(2)

    return groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier, hierGroupMaxHier, hierGroupMinHier, theme_threshold_count

def deprecated_analysis():
    """Count-only variant of the analysis: cells hold a plain integer, no ratio.

    Builds the same six nested count dicts and the per-theme threshold table
    as ``make_analysis`` but does not compute any ratio/percentage.

    NOTE(review): the function takes no arguments yet reads ``df2``,
    ``max_th``, ``min_th``, ``theme_th`` and ``df`` as free variables. Only
    ``df`` is assigned at notebook level in the visible cells, so calling this
    presumably raises NameError on a fresh kernel -- confirm before reuse.

    Returns:
        tuple: (groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier,
        hierGroupMaxHier, hierGroupMinHier, theme_threshold_count).
    """
    # theme -> {theme or '_doc_count' -> count}
    groupMaxTheme = dict(((s, dict(((s, 0) for s in schwartz+['_doc_count']))) for s in schwartz))
    groupMinTheme = dict(((s, dict(((s, 0) for s in schwartz+['_doc_count']))) for s in schwartz))

    # theme -> {higher-order group or '_doc_count' -> count}
    groupMaxHier = dict(((s, dict(((s, 0) for s in list(schwartz_hier.keys())+['_doc_count']))) for s in schwartz))
    groupMinHier = dict(((s, dict(((s, 0) for s in list(schwartz_hier.keys())+['_doc_count']))) for s in schwartz))

    # higher-order group -> {higher-order group or '_doc_count' -> count}
    hierGroupMaxHier = dict(((s, dict(((s, 0) for s in list(schwartz_hier.keys())+['_doc_count']))) for s in list(schwartz_hier.keys())))
    hierGroupMinHier = dict(((s, dict(((s, 0) for s in list(schwartz_hier.keys())+['_doc_count']))) for s in list(schwartz_hier.keys())))

    for idx, row in df2.iterrows():
        theme = row['theme']

        # Each document counts once towards its own theme / higher-order group.
        groupMaxTheme[theme]['_doc_count'] += 1
        groupMaxHier[theme]['_doc_count'] += 1
        hierGroupMaxHier[schwartz_hier_pos[theme]]['_doc_count'] += 1

        groupMinTheme[theme]['_doc_count'] += 1
        groupMinHier[theme]['_doc_count'] += 1
        hierGroupMinHier[schwartz_hier_pos[theme]]['_doc_count'] += 1

        # Item [0] of every entry is used as the theme name.
        for mx_theme in row['max_themes_th'+str(max_th)]:
            groupMaxTheme[theme][mx_theme[0]] += 1
            groupMaxHier[theme][schwartz_hier_pos[mx_theme[0]]] += 1
            hierGroupMaxHier[schwartz_hier_pos[theme]][schwartz_hier_pos[mx_theme[0]]] += 1
        for mn_theme in row['min_themes_th'+str(min_th)]:
            groupMinTheme[theme][mn_theme[0]] += 1
            groupMinHier[theme][schwartz_hier_pos[mn_theme[0]]] += 1
            hierGroupMinHier[schwartz_hier_pos[theme]][schwartz_hier_pos[mn_theme[0]]] += 1

    # Per-theme sum of the threshold column, joined with the per-theme title count
    # (the title count is taken from `df`, not `df2`).
    theme_threshold_count = df2.groupby('theme').sum()['theme_threshold_th'+str(theme_th)].astype(int).rename('theme_count')
    theme_threshold_count = pd.concat([df.groupby('theme').count()['title'].rename('_doc_count'), theme_threshold_count], axis=1)
    
    return groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier, hierGroupMaxHier, hierGroupMinHier, theme_threshold_count

In [26]:
def show_results(analysis, max_th, min_th, theme_th, groupMaxTheme, groupMinTheme, groupMaxHier, groupMinHier, hierGroupMaxHier, hierGroupMinHier, theme_threshold_count):
    """Print the legend and render the table that belongs to ``analysis``.

    ``analysis`` values 1-6 select, in order, groupMaxTheme, groupMinTheme,
    groupMaxHier, groupMinHier, hierGroupMaxHier and hierGroupMinHier (each
    shown transposed); any other value shows ``theme_threshold_count``
    untransposed. The matching threshold (max_th, min_th or theme_th) is
    printed above the table.
    """
    # --- legend -----------------------------------------------------------
    print(color.BLUE + "Themes are the Schwartz basic human values." + color.END)
    print(color.BLUE + "HighThemes are higher order Schwartz basic human value groups." + color.END)
    print()
    print(color.BLUE + "Lists in the tables' cells represents: " + color.BOLD + "[Counts, Percentage]" + color.END)

    print()
    for group, members in schwartz_hier.items():
        print(color.BOLD + group + color.END + ": " + str(members))
    print()
    print(60*"*")
    print()

    # --- sentences shared by several reports -------------------------------
    max_rule = "For each document, finds themes that have higher scores than document's max theme score minus threshold."
    max_rule_high = "For each document, finds themes that have higher score than document's max theme score minus threshold."
    min_rule = "For each document, finds themes that have lower scores than document's min theme score plus threshold."
    own_rule = "For each document, finds documents that have higher its own theme score than the threshold."
    count_found = "Then, group and count found themes according to their HighThemes."
    docs_by_theme = "Then, group and count documents according to their Themes."
    docs_by_high = "Then, group and count documents according to their HighThemes."

    max_label = "Threshold (max_th) = "
    min_label = "Threshold (min_th) = "

    # (selector, heading, explanation lines, threshold label, threshold, table)
    reports = [
        (1, "Group Max Theme", (max_rule, docs_by_theme), max_label, max_th, groupMaxTheme),
        (2, "Group Min Theme", (min_rule, docs_by_theme), min_label, min_th, groupMinTheme),
        (3, "Group Max HighTheme", (max_rule_high, count_found, docs_by_theme), max_label, max_th, groupMaxHier),
        (4, "Group Min HighTheme", (min_rule, count_found, docs_by_theme), min_label, min_th, groupMinHier),
        (5, "HighTheme Group Max HighTheme", (max_rule_high, count_found, docs_by_high), max_label, max_th, hierGroupMaxHier),
        (6, "HighTheme Group Min HighTheme", (min_rule, count_found, docs_by_high), min_label, min_th, hierGroupMinHier),
    ]
    # Used for every selector that is not 1-6.
    fallback = (None, "Theme Threshold Count", (own_rule, docs_by_theme),
                "Theme Threshold (theme_th) = ", theme_th, theme_threshold_count)

    chosen = next((spec for spec in reports if analysis == spec[0]), fallback)
    _, heading, explanation, label, threshold, table = chosen

    # --- selected report ---------------------------------------------------
    print(color.BOLD + heading + color.END)
    print()
    for sentence in explanation:
        print(color.BOLD + color.PURPLE + sentence + color.END)

    print()
    print(color.BOLD + color.GREEN + label + str(threshold) + color.END)

    frame = pd.DataFrame(table)
    if chosen is not fallback:
        # The nested dicts are keyed outer-first, so flip them to get one row per outer key.
        frame = frame.transpose()
    display(frame)
        
def show_results2(analysis, max_th, min_th, theme_th):
    """Recompute the analysis for the given thresholds and display it.

    Works on a copy of the notebook-level ``df``: adds the per-row outputs of
    ``rowMaxThemes``, ``rowMinThemes`` and ``themeThreshold`` as columns named
    after their threshold, feeds the result to ``make_analysis`` and hands
    everything to ``show_results``.
    """
    annotated = df.copy()

    # (column prefix, threshold, row-wise function)
    derived_columns = (
        ('max_themes_th', max_th, rowMaxThemes),
        ('min_themes_th', min_th, rowMinThemes),
        ('theme_threshold_th', theme_th, themeThreshold),
    )
    for prefix, threshold, row_function in derived_columns:
        annotated[prefix + str(threshold)] = df.apply(row_function, axis=1, args=(threshold,))

    tables = make_analysis(annotated, max_th, min_th, theme_th)

    show_results(analysis, max_th, min_th, theme_th, *tables)


In [27]:
def rowMaxThemes(row, th):
    """Return the ``(name, score)`` pairs whose score lies within ``th`` of the row maximum.

    The first three entries of ``row`` are skipped; everything after them is
    cast to float64 and treated as a score.
    """
    scores = row[3:].astype(np.float64)
    top = scores.max()
    near_top = scores[scores.between(top - th, top)]

    return list(zip(near_top.index, near_top.values))

def rowMinThemes(row, th):
    """Return the ``(name, score)`` pairs whose score lies within ``th`` of the row minimum.

    The first three entries of ``row`` are skipped; everything after them is
    cast to float64 and treated as a score.
    """
    scores = row[3:].astype(np.float64)
    bottom = scores.min()
    near_bottom = scores[scores.between(bottom, bottom + th)]

    return list(zip(near_bottom.index, near_bottom.values))

def themeThreshold(row, th):
    """Tell whether a document's score for its own theme reaches ``th``.

    Args:
        row: pandas Series whose third entry (position 2) holds the theme name
            and which has an entry labelled with that name holding the score.
        th: minimum score, inclusive.

    Returns:
        bool: True when the score stored under the theme's label is >= ``th``.
    """
    # Explicit positional access: a bare integer key on a label-indexed Series
    # relies on a positional fallback that recent pandas versions deprecate.
    theme = str(row.iloc[2])
    return bool(row[theme] >= th)

## Initialize

In [28]:
# Pass brown_corpus=True to use the Brown Corpus as background instead
df = initialize(brown_corpus=False, lemmatized=True)
df.head()

Unnamed: 0,document.id,title,theme,universalism,benevolence,conformity,tradition,security,power,achievement,hedonism,stimulation,self-direction
0,1,Critical thinking,universalism,29.568933,51.700693,57.113528,3.982561,31.23444,35.920324,73.711357,21.903735,0.307757,78.50854
1,221,Social work,universalism,72.120812,31.232133,29.676408,3.973152,58.427256,7.075574,67.619355,9.14371,13.876487,21.888438
2,222,Labor rights,universalism,85.435,6.189056,12.135752,5.027289,57.568072,49.989692,77.555737,0.002016,83.57484,0.714124
3,223,Left-wing politics,universalism,79.757248,31.325614,24.519276,14.785948,5.931156,64.710629,48.898544,7.453365,44.142172,25.689666
4,224,Climate justice,universalism,87.414846,5.136804,4.011423,0.439151,74.691391,10.218979,71.407762,0.193193,23.510946,14.599857


## Analysis

* **analysis**: Different analysis types.
  1. Group Max Theme
  2. Group Min Theme
  3. Group Max HighTheme
  4. Group Min HighTheme
  5. HighTheme Group Max HighTheme
  6. HighTheme Group Min HighTheme
  7. Theme Threshold Count
 
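* **max_th**: Accepted distance from a document's highest score; themes scoring within `max_th` of the maximum are counted.
* **min_th**: Accepted distance from a document's lowest score; themes scoring within `min_th` of the minimum are counted.
* **theme_th**: Minimum score a document must reach on its own theme to be counted.  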

In [29]:
# Each tuple is an ipywidgets slider abbreviation: (min, max, step).
interact(
    show_results2,
    analysis=(1, 7, 1),
    max_th=(1, 20, 1),
    min_th=(1, 20, 1),
    theme_th=(60, 90, 1),
)

<function __main__.show_results2>