In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [49]:
rawdat = pd.read_csv('../data/input/Auschwitz_segments_story_end_beginning_distinguished.csv')
rawdat.shape

(94394, 13)

In [50]:
biodata = pd.read_csv('../data/input/biodata_birkenau.csv')

In [51]:
# filter only used interviews from biodata_birkenau.csv
relevant_rawdat = rawdat[rawdat.IntCode.isin(biodata['IntCode'])]
relevant_rawdat.shape

(46820, 13)

In [52]:
# create a keyword ID - Label map
kwID_kwLabel_map = relevant_rawdat[['KeywordID', 'KeywordLabel']]
kwID_kwLabel_map.drop_duplicates(inplace=True)
kwID_kwLabel_map.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
kwID_kwLabel_map

Unnamed: 0,KeywordID,KeywordLabel
0,88888,story_beginning
1,7601,Auschwitz II-Birkenau (Poland : Death Camp)
2,12044,camp selections
3,14280,loved ones' separations
4,13310,"Oświęcim (Kraków, Poland)"
...,...,...
693,31294,abandonment (emotion)
694,17194,attitudes toward Arab States and/or Arabs
695,12111,acculturation
696,13307,working life


In [54]:
# find kwIds, that are in the first and last segments of relevant interviews
first_kwIDs = []
last_kwIDs = []

for intcode in pd.unique(relevant_rawdat.IntCode):
    interview = relevant_rawdat[relevant_rawdat.IntCode==intcode]
    
    # identify first and alst segments
    # extract KwID
    first_kwID = interview[interview.SegmentNumber == interview.SegmentNumber.min()].KeywordID
    last_kwID = interview[interview.SegmentNumber == interview.SegmentNumber.max()].KeywordID
    print (first_kwID)
    first_kwIDs.append(first_kwID.to_numpy())
    last_kwIDs.append(last_kwID.to_numpy())
    
# concatenate
first_kwIDs = np.concatenate(first_kwIDs)
last_kwIDs = np.concatenate(last_kwIDs)

0    88888
Name: KeywordID, dtype: int64
39    88888
Name: KeywordID, dtype: int64
138    88888
139    12044
140    16503
141    25452
Name: KeywordID, dtype: int64
160    88888
161    10852
162    14049
Name: KeywordID, dtype: int64
391    88888
Name: KeywordID, dtype: int64
459    88888
Name: KeywordID, dtype: int64
553    88888
Name: KeywordID, dtype: int64
648    88888
Name: KeywordID, dtype: int64
874    88888
Name: KeywordID, dtype: int64
1017    88888
1018    12044
1019    14280
Name: KeywordID, dtype: int64
1145    88888
Name: KeywordID, dtype: int64
1257    88888
Name: KeywordID, dtype: int64
1341    88888
Name: KeywordID, dtype: int64
1395    88888
Name: KeywordID, dtype: int64
1478    88888
1479    12044
Name: KeywordID, dtype: int64
1597    88888
Name: KeywordID, dtype: int64
1719    88888
1720    34358
Name: KeywordID, dtype: int64
1758    88888
1759    14280
Name: KeywordID, dtype: int64
1820    88888
1821    16192
Name: KeywordID, dtype: int64
1835    88888
Name: Keyword

23152    88888
23153    15088
Name: KeywordID, dtype: int64
23200    88888
Name: KeywordID, dtype: int64
23274    88888
23275    10853
Name: KeywordID, dtype: int64
23415    88888
23416    14307
Name: KeywordID, dtype: int64
23604    88888
23605    10853
Name: KeywordID, dtype: int64
23721    88888
Name: KeywordID, dtype: int64
23748    88888
23749    12044
Name: KeywordID, dtype: int64
23820    88888
23821    10926
23822    13942
Name: KeywordID, dtype: int64
23958    88888
Name: KeywordID, dtype: int64
24139    88888
Name: KeywordID, dtype: int64
24204    88888
24205    14049
24206    14917
Name: KeywordID, dtype: int64
24321    88888
Name: KeywordID, dtype: int64
24491    88888
24492    13214
24493    15233
Name: KeywordID, dtype: int64
24561    88888
24562    12044
24563     4047
Name: KeywordID, dtype: int64
25220    88888
25221    10853
Name: KeywordID, dtype: int64
25305    88888
Name: KeywordID, dtype: int64
25875    88888
Name: KeywordID, dtype: int64
26218    88888
Name: Keyw

46550    88888
Name: KeywordID, dtype: int64
46596    88888
46597     3732
Name: KeywordID, dtype: int64
46647    88888
Name: KeywordID, dtype: int64
46827    88888
Name: KeywordID, dtype: int64
47000    88888
Name: KeywordID, dtype: int64
47038    88888
Name: KeywordID, dtype: int64
47114    88888
Name: KeywordID, dtype: int64
47194    88888
47195    16451
47196    14293
Name: KeywordID, dtype: int64
47302    88888
47303    12044
47304    14280
Name: KeywordID, dtype: int64
47953    88888
Name: KeywordID, dtype: int64
48071    88888
Name: KeywordID, dtype: int64
48147    88888
Name: KeywordID, dtype: int64
48325    88888
Name: KeywordID, dtype: int64
48550    88888
48551     7515
Name: KeywordID, dtype: int64
48731    88888
Name: KeywordID, dtype: int64
48764    88888
48765    14280
Name: KeywordID, dtype: int64
48978    88888
Name: KeywordID, dtype: int64
48998    88888
Name: KeywordID, dtype: int64
49024    88888
Name: KeywordID, dtype: int64
49101    88888
Name: KeywordID, dtype: i

70510    88888
Name: KeywordID, dtype: int64
70579    88888
Name: KeywordID, dtype: int64
70724    88888
70725    14280
Name: KeywordID, dtype: int64
70794    88888
Name: KeywordID, dtype: int64
70894    88888
Name: KeywordID, dtype: int64
70931    88888
70932    12044
Name: KeywordID, dtype: int64
70968    88888
Name: KeywordID, dtype: int64
70985    88888
70986    10853
Name: KeywordID, dtype: int64
71031    88888
Name: KeywordID, dtype: int64
71562    88888
Name: KeywordID, dtype: int64
71586    88888
71587    12044
71588    14280
Name: KeywordID, dtype: int64
72061    88888
72062    14280
Name: KeywordID, dtype: int64
72088    88888
Name: KeywordID, dtype: int64
72146    88888
Name: KeywordID, dtype: int64
72415    88888
Name: KeywordID, dtype: int64
72487    88888
72488    12044
Name: KeywordID, dtype: int64
72795    88888
72796    15223
72797    15774
Name: KeywordID, dtype: int64
72830    88888
Name: KeywordID, dtype: int64
72847    88888
Name: KeywordID, dtype: int64
72911    8

In [55]:
# bincount (histogram topics in both cases)
bc_first = np.bincount(first_kwIDs, minlength=kwID_kwLabel_map.KeywordID.max())
bc_last = np.bincount(last_kwIDs, minlength=kwID_kwLabel_map.KeywordID.max())

In [56]:
# argsort (backwards) to get highest populated states
idx_first = bc_first.argsort()[::-1]
idx_last = bc_last.argsort()[::-1]

# printed number of topics (only printout)
print_top_n = 15

In [57]:
# left is prevalence of a topic, i.e. fraction of first frames that have this topic
# second is topic, followed by keywordID in brackets.
print('first topics')
for idx in idx_first[:print_top_n]:
    print(f'{np.round(bc_first[idx]/bc_first.sum(), 2)}: '+\
          f'{kwID_kwLabel_map[kwID_kwLabel_map.KeywordID == idx].KeywordLabel.values[0]}' +\
          f'[KeywordID: {idx}]'
         )

first topics
0.62: story_beginning[KeywordID: 88888]
0.09: camp selections[KeywordID: 12044]
0.04: camp intake procedures[KeywordID: 10853]
0.03: loved ones' separations[KeywordID: 14280]
0.01: loved ones' final contacts[KeywordID: 11672]
0.01: Mengele, Josef[KeywordID: 4047]
0.01: Czechoslovakia 1944[KeywordID: 13214]
0.01: freight trains[KeywordID: 15774]
0.01: extended family members[KeywordID: 13819]
0.01: mass murder awareness[KeywordID: 10698]
0.01: Poland 1945 (January 1 - May 7)[KeywordID: 16192]
0.01: deportations, means of transport[KeywordID: 15223]
0.01: Hungary 1944[KeywordID: 14049]
0.01: deportation conditions[KeywordID: 10852]


In [58]:
print('last topics')
for idx in idx_last[:print_top_n]:
    print(f'{np.round(bc_last[idx]/bc_last.sum(), 2)}: '+\
          f'{kwID_kwLabel_map[kwID_kwLabel_map.KeywordID == idx].KeywordLabel.values[0]}' +\
          f'[KeywordID: {idx}]'
         )

last topics
0.65: story_ending[KeywordID: 9999]
0.03: transfers, means of transport[KeywordID: 15232]
0.02: freight trains[KeywordID: 15774]
0.01: camp adaptation methods[KeywordID: 14380]
0.01: Auschwitz I (Poland : Concentration Camp)[KeywordID: 13018]
0.01: camp selections[KeywordID: 12044]
0.01: Germany 1941 (June 22) - 1945 (May 7)[KeywordID: 13926]
0.01: transfer conditions[KeywordID: 12161]
0.01: mass murder awareness[KeywordID: 10698]
0.01: prisoner tattoos[KeywordID: 15180]
0.01: extended family members[KeywordID: 13819]
0.01: transfer procedures[KeywordID: 22647]
0.0: camp-related aid giving[KeywordID: 16503]
0.0: loved ones' fates[KeywordID: 12126]
0.0: forced marches[KeywordID: 10632]
