In [None]:
import spacy, string, pandas
# Load the spaCy pipeline 'de_core_news_sm' once, at module level; the
# resulting `nlp` object is what transform_first_column() calls on each
# first-column text to split it into tokens.
nlp = spacy.load('de_core_news_sm')

In [None]:
# LOADS DATA
def load(path):
    """Read ``path + '.csv'`` and return it as a list of rows of string cells.

    The file is decoded as UTF-8. Every double-quote character and every
    byte-order mark (U+FEFF) is removed from the text, which is then split
    on newlines into rows and on ';' into cells. No CSV quoting rules are
    applied, so a ';' inside a quoted field still splits the cell.

    A file that ends with a newline yields a final ``['']`` row. That row is
    kept on purpose: the other functions in this notebook skip the final row
    of the table.

    Parameters
    ----------
    path : str
        File path without the '.csv' extension.

    Returns
    -------
    list[list[str]]
        One inner list of cells per line of the file.
    """
    # The context manager closes the file handle deterministically instead
    # of leaving an open handle for the garbage collector to clean up.
    with open(path + '.csv', encoding='utf-8') as source_file:
        input_raw = source_file.read()

    input_raw = input_raw.replace('"', '').replace('\ufeff', '')
    input_table = [row.split(';') for row in input_raw.split('\n')]

    print('input table for ' + path + ': done')
    return input_table

In [None]:
# CREATE AGGREGATION LEVELS
def aggregate(input_table):
    """Collect the distinct values of columns 1 and 2 in first-seen order.

    The final row of ``input_table`` is not inspected.
    NOTE(review): presumably this is because load() produces a trailing
    ``['']`` row for newline-terminated files; a table without such a row
    loses its last record here -- confirm against the input files.

    Parameters
    ----------
    input_table : list[list]
        Rows whose index 1 holds the id and index 2 the age label.

    Returns
    -------
    tuple[list, list]
        ``(ids, ages)``, each without duplicates, ordered by first occurrence.
    """
    def first_seen(column):
        # Distinct values of one column, keeping the order of appearance.
        distinct = []
        for record in input_table[:-1]:
            if record[column] not in distinct:
                distinct.append(record[column])
        return distinct

    ids = first_seen(1)
    ages = first_seen(2)

    print('aggregation levels created')
    return ids, ages

In [None]:
# EXTRACT AND TRANSFORM TRANSCRIPTIONS
def transform_first_column(input_table):
    """Replace the text in column 0 of each row with its list of token strings.

    Every row's first cell is passed to the module-level ``nlp`` callable and
    the ``.text`` of each resulting token is collected. The token lists are
    then written back into ``input_table`` in place for every row except the
    final one, which keeps its original first cell.
    NOTE(review): the skipped final row is presumably the trailing ``['']``
    row that load() yields for newline-terminated files -- confirm.

    Parameters
    ----------
    input_table : list[list]
        Rows whose index 0 holds the text to tokenise. Mutated in place.

    Returns
    -------
    list[list]
        The same ``input_table`` object that was passed in.
    """
    token_texts = []
    for record in input_table:
        parsed = nlp(record[0])
        token_texts.append([token.text for token in parsed])

    # zip() stops at the shorter sequence, so the last row is left untouched.
    for record, tokens in zip(input_table[:-1], token_texts):
        record[0] = tokens
    print('1st column string conversion: done')

    return input_table

In [None]:
# CREATE VOCABULARY
def create_vocabulary(ages, ids, work_table):
    """Build the set of distinct normalised words per age, per id and overall.

    Words are normalised with ``strip().lower()``. A row contributes its
    words to every age entry equal to its column 2, to every id entry equal
    to its column 1, and to the overall entry.

    The final row of ``work_table`` is not processed.
    NOTE(review): presumably because load() yields a trailing ``['']`` row
    for newline-terminated files -- confirm against the input files.

    Parameters
    ----------
    ages : list[str]
        Age labels; they become the first ``len(ages)`` entries.
    ids : list[str]
        Id labels; they become the next ``len(ids)`` entries.
    work_table : list[list]
        Rows of ``[tokens, id, age, ...]`` where ``tokens`` is an iterable
        of strings.

    Returns
    -------
    list[list]
        Entries of the form ``[label, size, words]`` where ``words`` is a set
        and ``size == len(words)``. Order: ages, then ids, then ``'all'``.
    """
    age_entries = [[age, 0, set()] for age in ages]
    id_entries = [[person, 0, set()] for person in ids]
    all_entry = ['all', 0, set()]

    for row in work_table[:-1]:
        # Normalise once per row instead of once per word per level.
        row_words = {word.strip().lower() for word in row[0]}

        for entry in age_entries:
            if row[2] == entry[0]:
                entry[2].update(row_words)

        for entry in id_entries:
            if row[1] == entry[0]:
                entry[2].update(row_words)

        all_entry[2].update(row_words)

    vocabulary = age_entries + id_entries + [all_entry]

    # The size is derived from the finished sets, so it cannot drift from them.
    for entry in vocabulary:
        entry[1] = len(entry[2])

    for row in vocabulary:
        print(row[0] + ' size: ' + str(row[1]) + ' len: ' + str(len(row[2])))

    return vocabulary

In [None]:
# CALCULATE WORD FREQUENCY
def calculate_word_frequency(vocabulary, work_table, ages=None, ids=None):
    """Count, per level, in how many rows each vocabulary word occurs.

    A word is counted at most once per row: the row's tokens are normalised
    with ``strip().lower()`` first and de-duplicated afterwards, so two
    spellings that normalise to the same word (e.g. 'Der' and 'der') do not
    count the row twice.

    ``vocabulary`` is expected in the layout used by the indexing below:
    the first ``len(ages)`` entries belong to ``ages``, the next ``len(ids)``
    entries to ``ids``, and the last entry is the overall level.

    The final row of ``work_table`` is not processed.
    NOTE(review): presumably because load() yields a trailing ``['']`` row
    for newline-terminated files -- confirm against the input files.

    Parameters
    ----------
    vocabulary : list[list]
        Entries of the form ``[label, size, words]``.
    work_table : list[list]
        Rows of ``[tokens, id, age, ...]``.
    ages, ids : list[str], optional
        Labels matching the vocabulary layout. When omitted, the
        module-level variables ``ages`` / ``ids`` are used, so existing
        two-argument calls keep working.

    Returns
    -------
    tuple[list, dict]
        ``word_freq``: one ``[label, {word: row_count}]`` entry per
        vocabulary entry, in the same order.
        ``word_freq_person``: for every word of the last vocabulary entry,
        the number of ids whose count for that word is above zero.

    Raises
    ------
    NameError
        If ``ages`` or ``ids`` is neither passed nor defined at module level.
    KeyError
        If a row contains a word that is missing from a matching
        vocabulary entry.
    """
    if ages is None or ids is None:
        # Backward-compatible fallback to the notebook-level variables.
        module_scope = globals()
        try:
            if ages is None:
                ages = module_scope['ages']
            if ids is None:
                ids = module_scope['ids']
        except KeyError as missing:
            raise NameError("name '" + str(missing.args[0]) + "' is not defined; "
                            "pass ages= and ids= explicitly") from None

    # One zero-initialised counter per vocabulary entry.
    word_freq = [[entry[0], dict.fromkeys(entry[2], 0)] for entry in vocabulary]
    person_offset = len(ages)

    for row in work_table[:-1]:
        # Normalise BEFORE de-duplicating so each word counts once per row.
        words_in_row = {word.strip().lower() for word in row[0]}

        # Counters this row contributes to: matching ages, matching ids, all.
        targets = [word_freq[j][1]
                   for j, age in enumerate(ages) if row[2] == age]
        targets += [word_freq[person_offset + p][1]
                    for p, person in enumerate(ids) if row[1] == person]
        targets.append(word_freq[-1][1])

        for counts in targets:
            for word in words_in_row:
                counts[word] += 1

    word_freq_person = dict.fromkeys(vocabulary[-1][2], 0)
    print('len' + str(len(word_freq_person)))
    for p in range(len(ids)):
        for word, freq in word_freq[person_offset + p][1].items():
            if freq > 0:
                word_freq_person[word] += 1

    print('word freq calulation: done')
    return word_freq, word_freq_person

In [None]:
# PRINT INTO FILE
def print_dictionary(word_freq):
    """Write one '<label>_dict.csv' file per level into './DICT/'.

    Each file holds one ``word: count`` line per dictionary item. Files are
    written as UTF-8, the same encoding load() reads, so non-ASCII words do
    not depend on the platform's default encoding. The './DICT' directory is
    created when it does not exist yet.

    Parameters
    ----------
    word_freq : list[list]
        Entries of the form ``[label, {word: count}]``; ``label`` becomes
        part of the file name.

    Returns
    -------
    None
    """
    import os  # local import keeps this cell runnable on its own

    os.makedirs('./DICT', exist_ok=True)

    for entry in word_freq:
        label, counts = entry[0], entry[1]
        with open('./DICT/' + label + '_dict.csv', 'w', encoding='utf-8') as doc_out:
            for word, freq in counts.items():
                doc_out.write(word + ': ' + str(freq) + '\n')

    print('print to file: done')
    return None

In [None]:
# PRINT PER-PERSON WORD COUNTS INTO FILE
def print_dictionary_per_person(word_freq_person):
    """Write the per-person word counts to './DICT/per_person_dict.csv'.

    The file holds one ``word: count`` line per dictionary item. It is
    written as UTF-8, the same encoding load() reads, so non-ASCII words do
    not depend on the platform's default encoding. The './DICT' directory is
    created when it does not exist yet.

    Parameters
    ----------
    word_freq_person : dict
        Mapping of word to count.

    Returns
    -------
    None
    """
    import os  # local import keeps this cell runnable on its own

    print('per person')
    os.makedirs('./DICT', exist_ok=True)
    with open('./DICT/per_person_dict.csv', 'w', encoding='utf-8') as doc_out:
        for word, freq in word_freq_person.items():
            doc_out.write(word + ': ' + str(freq) + '\n')

    print('print to per person file: done')
    return None

In [None]:
# TUNERS:
# File-name suffixes of the available input variants; load() appends '.csv'.
paths = ['_in_v2', '_1norm', '_1Snorm', '_2norm', '_2Snorm']

# EXECUTE:
# Index 4 selects the '_2Snorm' variant, i.e. './IO_YO/all_2Snorm.csv'.
input_table = load('./IO_YO/all' + paths[4])

# NOTE: `ids` and `ages` must keep these exact module-level names because
# calculate_word_frequency() looks them up at module level.
ids, ages = aggregate(input_table)
# transform_first_column() rewrites column 0 in place and returns the same
# list object, so `work_table` and `input_table` refer to one table. This
# cell is therefore not safe to re-run without re-running load() first.
work_table = transform_first_column(input_table)             

# Vocabulary entries are ordered ages, ids, 'all'; the frequency step relies
# on that ordering when it indexes into the vocabulary.
vocabulary = create_vocabulary(ages, ids, work_table)
word_freq, word_freq_person = calculate_word_frequency(vocabulary, work_table)
# Both writers target the './DICT/' directory relative to the working directory.
print_dictionary(word_freq)
print_dictionary_per_person(word_freq_person)