In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [71]:
rawdat = pd.concat([pd.read_csv('../data/input/Auschwitz_segments_03112020_1.csv'),
                   pd.read_csv('../data/input/Auschwitz_segments_03112020_2.csv')])
rawdat.shape

(1067267, 12)

In [72]:
biodata = pd.read_csv('../data/input/biodata_birkenau.csv')

In [73]:
# filter only used interviews from biodata_birkenau.csv
relevant_rawdat = rawdat[rawdat.IntCode.isin(biodata['IntCode'])]
relevant_rawdat.shape

(584447, 12)

In [47]:
# create a keyword ID - Label map
kwID_kwLabel_map = relevant_rawdat[['KeywordID', 'KeywordLabel']]
kwID_kwLabel_map.drop_duplicates(inplace=True)
kwID_kwLabel_map.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [48]:
kwID_kwLabel_map

Unnamed: 0,KeywordID,KeywordLabel
0,7601,Auschwitz II-Birkenau (Poland : Death Camp)
1,13310,"Oświęcim (Kraków, Poland)"
2,14226,Poland 1944 (July 22) - 1945 (January 16)
3,12044,camp selections
4,14280,loved ones' separations
...,...,...
2334,83352,Christians
2335,59509,European history
2336,15692,attitudes toward Canada and/or Canadians
2337,62391,attitudes toward socialism and/or socialists


In [21]:
# find kwIds, that are in the first and last segments of relevant interviews
first_kwIDs = []
last_kwIDs = []

for intcode in pd.unique(relevant_rawdat.IntCode):
    interview = relevant_rawdat[relevant_rawdat.IntCode==intcode]
    
    # identify first and alst segments
    # extract KwID
    first_kwID = interview[interview.SegmentNumber == interview.SegmentNumber.min()].KeywordID
    last_kwID = interview[interview.SegmentNumber == interview.SegmentNumber.max()].KeywordID

    first_kwIDs.append(first_kwID.to_numpy())
    last_kwIDs.append(last_kwID.to_numpy())
    
# concatenate
first_kwIDs = np.concatenate(first_kwIDs)
last_kwIDs = np.concatenate(last_kwIDs)

In [49]:
# bincount (histogram topics in both cases)
bc_first = np.bincount(first_kwIDs, minlength=kwID_kwLabel_map.KeywordID.max())
bc_last = np.bincount(last_kwIDs, minlength=kwID_kwLabel_map.KeywordID.max())

In [74]:
# argsort (backwards) to get highest populated states
idx_first = bc_first.argsort()[::-1]
idx_last = bc_last.argsort()[::-1]

# printed number of topics (only printout)
print_top_n = 15

In [76]:
# left is prevalence of a topic, i.e. fraction of first frames that have this topic
# second is topic, followed by keywordID in brackets.
print('first topics')
for idx in idx_first[:print_top_n]:
    print(f'{np.round(bc_first[idx]/bc_first.sum(), 2)}: '+\
          f'{kwID_kwLabel_map[kwID_kwLabel_map.KeywordID == idx].KeywordLabel.values[0]}' +\
          f'[KeywordID: {idx}]'
         )

first topics
0.23: Oświęcim (Kraków, Poland)[KeywordID: 13310]
0.21: Poland 1944[KeywordID: 14233]
0.18: Auschwitz II-Birkenau (Poland : Death Camp)[KeywordID: 7601]
0.05: Auschwitz (Poland : Concentration Camp)(generic)[KeywordID: 7528]
0.04: camp selections[KeywordID: 12044]
0.04: camp first impressions[KeywordID: 10983]
0.02: camp intake procedures[KeywordID: 10853]
0.02: deportation to Auschwitz II-Birkenau (Poland : Death Camp)[KeywordID: 16328]
0.02: loved ones' separations[KeywordID: 14280]
0.02: Poland 1943[KeywordID: 14232]
0.01: Poland 1944 (July 22) - 1945 (January 16)[KeywordID: 14226]
0.01: freight trains[KeywordID: 15774]
0.01: loved ones' final contacts[KeywordID: 11672]
0.01: Mengele, Josef[KeywordID: 4047]
0.0: deportation to Auschwitz (Poland : Concentration Camp)(generic)[KeywordID: 16123]


In [77]:
print('last topics')
for idx in idx_last[:print_top_n]:
    print(f'{np.round(bc_last[idx]/bc_last.sum(), 2)}: '+\
          f'{kwID_kwLabel_map[kwID_kwLabel_map.KeywordID == idx].KeywordLabel.values[0]}' +\
          f'[KeywordID: {idx}]'
         )

last topics
0.25: Oświęcim (Kraków, Poland)[KeywordID: 13310]
0.19: Auschwitz II-Birkenau (Poland : Death Camp)[KeywordID: 7601]
0.19: Poland 1944[KeywordID: 14233]
0.05: Auschwitz (Poland : Concentration Camp)(generic)[KeywordID: 7528]
0.03: transfer from Auschwitz II-Birkenau (Poland : Death Camp)[KeywordID: 16297]
0.02: Poland 1945 (January 1 - May 7)[KeywordID: 16192]
0.01: transfers, means of transport[KeywordID: 15232]
0.01: Germany 1944[KeywordID: 13929]
0.01: Poland 1944 (July 22) - 1945 (January 16)[KeywordID: 14226]
0.01: freight trains[KeywordID: 15774]
0.01: Poland 1943[KeywordID: 14232]
0.01: Germany 1945 (January 1 - May 7)[KeywordID: 13930]
0.01: camp selections[KeywordID: 12044]
0.01: transfer from Auschwitz (Poland : Concentration Camp)(generic)[KeywordID: 16162]
0.01: transfer conditions[KeywordID: 12161]
