In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
rawdat = pd.read_csv('../data/input/Auschwitz_segments_story_end_beginning_distinguished.csv')
rawdat.shape

(94674, 13)

In [9]:
biodata = pd.read_csv('../data/input/biodata_birkenau.csv')

In [10]:
# filter only used interviews from biodata_birkenau.csv
relevant_rawdat = rawdat[rawdat.IntCode.isin(biodata['IntCode'])]
relevant_rawdat.shape

(46909, 13)

In [11]:
# create a keyword ID - Label map
kwID_kwLabel_map = relevant_rawdat[['KeywordID', 'KeywordLabel']]
kwID_kwLabel_map.drop_duplicates(inplace=True)
kwID_kwLabel_map.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
kwID_kwLabel_map

Unnamed: 0,KeywordID,KeywordLabel
0,88888,story_beginning
1,7601,Auschwitz II-Birkenau (Poland : Death Camp)
2,12044,camp selections
3,14280,loved ones' separations
4,13310,"Oświęcim (Kraków, Poland)"
...,...,...
724,31294,abandonment (emotion)
725,17194,attitudes toward Arab States and/or Arabs
726,12111,acculturation
727,13307,working life


In [13]:
# find kwIds, that are in the first and last segments of relevant interviews
first_kwIDs = []
last_kwIDs = []

for intcode in pd.unique(relevant_rawdat.IntCode):
    interview = relevant_rawdat[relevant_rawdat.IntCode==intcode]
    
    # identify first and alst segments
    # extract KwID
    first_kwID = interview[interview.SegmentNumber == interview.SegmentNumber.min()].KeywordID
    last_kwID = interview[interview.SegmentNumber == interview.SegmentNumber.max()].KeywordID
    print (first_kwID)
    first_kwIDs.append(first_kwID.to_numpy())
    last_kwIDs.append(last_kwID.to_numpy())
    
# concatenate
first_kwIDs = np.concatenate(first_kwIDs)
last_kwIDs = np.concatenate(last_kwIDs)

0    88888
Name: KeywordID, dtype: int64
39    88888
Name: KeywordID, dtype: int64
138    88888
139    12044
140    16503
141    25452
Name: KeywordID, dtype: int64
160    88888
161    10852
162    14049
Name: KeywordID, dtype: int64
392    88888
Name: KeywordID, dtype: int64
461    88888
Name: KeywordID, dtype: int64
555    88888
Name: KeywordID, dtype: int64
651    88888
Name: KeywordID, dtype: int64
877    88888
Name: KeywordID, dtype: int64
1021    88888
1022    12044
1023    14280
Name: KeywordID, dtype: int64
1151    88888
Name: KeywordID, dtype: int64
1263    88888
Name: KeywordID, dtype: int64
1347    88888
Name: KeywordID, dtype: int64
1402    88888
Name: KeywordID, dtype: int64
1485    88888
1486    12044
Name: KeywordID, dtype: int64
1605    88888
Name: KeywordID, dtype: int64
1727    88888
1728    34358
Name: KeywordID, dtype: int64
1766    88888
1767    14280
Name: KeywordID, dtype: int64
1828    88888
1829    16192
Name: KeywordID, dtype: int64
1843    88888
Name: Keyword

23828    88888
23829    12044
Name: KeywordID, dtype: int64
23900    88888
23901    10926
23902    13942
Name: KeywordID, dtype: int64
24038    88888
Name: KeywordID, dtype: int64
24219    88888
Name: KeywordID, dtype: int64
24284    88888
24285    14049
24286    14917
Name: KeywordID, dtype: int64
24401    88888
Name: KeywordID, dtype: int64
24572    88888
24573    13214
24574    16109
24575    15233
Name: KeywordID, dtype: int64
24643    88888
24644    12044
24645     4047
Name: KeywordID, dtype: int64
25302    88888
25303    10853
Name: KeywordID, dtype: int64
25387    88888
Name: KeywordID, dtype: int64
25957    88888
Name: KeywordID, dtype: int64
26301    88888
Name: KeywordID, dtype: int64
26463    88888
Name: KeywordID, dtype: int64
26596    88888
Name: KeywordID, dtype: int64
26645    88888
Name: KeywordID, dtype: int64
26715    88888
Name: KeywordID, dtype: int64
26826    88888
Name: KeywordID, dtype: int64
26905    88888
26906    10853
Name: KeywordID, dtype: int64
26934    8

49143    88888
Name: KeywordID, dtype: int64
49169    88888
Name: KeywordID, dtype: int64
49247    88888
Name: KeywordID, dtype: int64
49312    88888
Name: KeywordID, dtype: int64
49375    88888
Name: KeywordID, dtype: int64
49413    88888
Name: KeywordID, dtype: int64
49532    88888
Name: KeywordID, dtype: int64
49690    88888
49691    10853
Name: KeywordID, dtype: int64
50027    88888
Name: KeywordID, dtype: int64
50097    88888
50098    12044
50099    16110
50100    15223
50101    15774
Name: KeywordID, dtype: int64
50156    88888
Name: KeywordID, dtype: int64
50597    88888
Name: KeywordID, dtype: int64
50845    88888
Name: KeywordID, dtype: int64
51317    88888
Name: KeywordID, dtype: int64
51357    88888
Name: KeywordID, dtype: int64
51670    88888
Name: KeywordID, dtype: int64
51856    88888
51857    11672
Name: KeywordID, dtype: int64
51917    88888
51918    12044
51919    13214
Name: KeywordID, dtype: int64
51951    88888
Name: KeywordID, dtype: int64
52238    88888
52239    1

73359    88888
Name: KeywordID, dtype: int64
73640    88888
73641    14330
73642     6861
Name: KeywordID, dtype: int64
73692    88888
Name: KeywordID, dtype: int64
73976    88888
73977    15583
Name: KeywordID, dtype: int64
74087    88888
74088    12044
74089    14280
Name: KeywordID, dtype: int64
74136    88888
Name: KeywordID, dtype: int64
74168    88888
Name: KeywordID, dtype: int64
74435    88888
Name: KeywordID, dtype: int64
74468    88888
Name: KeywordID, dtype: int64
74502    88888
Name: KeywordID, dtype: int64
74719    88888
Name: KeywordID, dtype: int64
74979    88888
Name: KeywordID, dtype: int64
75099    88888
Name: KeywordID, dtype: int64
75169    88888
Name: KeywordID, dtype: int64
75304    88888
Name: KeywordID, dtype: int64
75334    88888
Name: KeywordID, dtype: int64
75556    88888
Name: KeywordID, dtype: int64
75665    88888
Name: KeywordID, dtype: int64
75691    88888
Name: KeywordID, dtype: int64
75947    88888
Name: KeywordID, dtype: int64
76017    88888
Name: Keyw

In [14]:
# bincount (histogram topics in both cases)
bc_first = np.bincount(first_kwIDs, minlength=kwID_kwLabel_map.KeywordID.max())
bc_last = np.bincount(last_kwIDs, minlength=kwID_kwLabel_map.KeywordID.max())

In [15]:
# argsort (backwards) to get highest populated states
idx_first = bc_first.argsort()[::-1]
idx_last = bc_last.argsort()[::-1]

# printed number of topics (only printout)
print_top_n = 15

In [16]:
# left is prevalence of a topic, i.e. fraction of first frames that have this topic
# second is topic, followed by keywordID in brackets.
print('first topics')
for idx in idx_first[:print_top_n]:
    print(f'{np.round(bc_first[idx]/bc_first.sum(), 2)}: '+\
          f'{kwID_kwLabel_map[kwID_kwLabel_map.KeywordID == idx].KeywordLabel.values[0]}' +\
          f'[KeywordID: {idx}]'
         )

first topics
0.62: story_beginning[KeywordID: 88888]
0.08: camp selections[KeywordID: 12044]
0.04: camp intake procedures[KeywordID: 10853]
0.03: loved ones' separations[KeywordID: 14280]
0.01: Mengele, Josef[KeywordID: 4047]
0.01: loved ones' final contacts[KeywordID: 11672]
0.01: Czechoslovakia 1944[KeywordID: 13214]
0.01: freight trains[KeywordID: 15774]
0.01: extended family members[KeywordID: 13819]
0.01: mass murder awareness[KeywordID: 10698]
0.01: Poland 1945 (January 1 - May 7)[KeywordID: 16192]
0.01: deportations, means of transport[KeywordID: 15223]
0.01: Hungary 1944[KeywordID: 14049]
0.01: deportation conditions[KeywordID: 10852]


In [17]:
print('last topics')
for idx in idx_last[:print_top_n]:
    print(f'{np.round(bc_last[idx]/bc_last.sum(), 2)}: '+\
          f'{kwID_kwLabel_map[kwID_kwLabel_map.KeywordID == idx].KeywordLabel.values[0]}' +\
          f'[KeywordID: {idx}]'
         )

last topics
0.6: story_ending[KeywordID: 9999]
0.02: transfers, means of transport[KeywordID: 15232]
0.02: freight trains[KeywordID: 15774]
0.01: camp adaptation methods[KeywordID: 14380]
0.01: camp selections[KeywordID: 12044]
0.01: Auschwitz I (Poland : Concentration Camp)[KeywordID: 13018]
0.01: Germany 1941 (June 22) - 1945 (May 7)[KeywordID: 13926]
0.01: transfer conditions[KeywordID: 12161]
0.01: mass murder awareness[KeywordID: 10698]
0.01: prisoner tattoos[KeywordID: 15180]
0.01: extended family members[KeywordID: 13819]
0.01: transfer procedures[KeywordID: 22647]
0.0: forced marches[KeywordID: 10632]
0.0: loved ones' fates[KeywordID: 12126]
0.0: Czechoslovakia 1945 (May 10) - 1948 (February 19)[KeywordID: 7476]
