In [1]:
import spacy, string, pandas
# Load the spaCy pipeline 'de_core_news_sm' once at notebook start;
# transform_first_column() calls this `nlp` object to tokenise every first-column cell.
nlp = spacy.load('de_core_news_sm')

In [9]:
# LOADS DATA
def load(path):
    """Read ``path + '.csv'`` and return its content as a list of rows.

    The file is decoded as latin-1, every double quote is removed, the text is
    split into lines on ``'\\n'`` and each line is split into cells on ``';'``.
    No CSV quoting rules are applied: a ``';'`` inside a quoted field still
    splits the field.

    Parameters
    ----------
    path : str
        File path *without* the ``.csv`` extension.

    Returns
    -------
    list[list[str]]
        One list of string cells per line. If the file ends with a newline the
        last row is ``['']``; the other functions in this notebook rely on that
        and skip the final row.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the bare open(...).read() left it to the garbage collector).
    with open(path + '.csv', encoding='latin-1') as source_file:
        input_raw = source_file.read()

    # NOTE(review): latin-1 decoding can only produce code points up to U+00FF,
    # so the '\ufeff' replacement never matches; it is kept to leave the
    # function's output unchanged.
    input_raw = input_raw.replace('"', '').replace('\ufeff', '')

    input_table = [line.split(';') for line in input_raw.split('\n')]

    print('input table for ' + path + ': done')
    return input_table

In [3]:
# CREATE AGGREGATION LEVELS
def aggregate(input_table):
    """Collect the distinct values of columns 1 and 2, in order of first appearance.

    Parameters
    ----------
    input_table : list[list]
        Rows as produced by ``load``; column 1 is used for ``ids`` and
        column 2 for ``ages``.

    Returns
    -------
    tuple[list, list]
        ``(ids, ages)`` - the unique column-1 values and the unique column-2
        values, each in first-seen order.
    """
    # The final row is deliberately left out of the scan (same convention as
    # the other cells, which all iterate up to len - 1).
    data_rows = input_table[:-1]

    # dict.fromkeys de-duplicates while keeping insertion order.
    ids = list(dict.fromkeys(row[1] for row in data_rows))
    ages = list(dict.fromkeys(row[2] for row in data_rows))

    print('aggregation levels created')
    return ids, ages

In [4]:
# EXTRACT AND TRANSFORM TRANSCRIPTIONS
def transform_first_column(input_table):
    """Replace the first cell of each row with the list of its token texts.

    Every row's first cell is run through the module-level ``nlp`` pipeline and
    turned into ``[token.text, ...]``. The result is written back into
    ``input_table`` itself for every row except the last one, which keeps its
    original first cell.

    Parameters
    ----------
    input_table : list[list]
        Rows whose first cell is the text to tokenise. Mutated in place.

    Returns
    -------
    list[list]
        The same ``input_table`` object that was passed in.
    """
    first_cells = [row[0] for row in input_table]
    token_texts = [[token.text for token in nlp(cell)] for cell in first_cells]

    # zip stops at the shorter sequence, so the last row is left untouched.
    for row, tokens in zip(input_table[:-1], token_texts):
        row[0] = tokens
    print('1st column string conversion: done')

    return input_table

In [5]:
# CREATE VOCABULARY
def create_vocabulary(ages, ids, work_table):
    """Build the set of distinct normalised words per age, per id, and overall.

    Words are normalised with ``word.strip().lower()``. The last row of
    ``work_table`` is not processed.

    Parameters
    ----------
    ages : list
        Age labels; a row belongs to an age entry when ``row[2]`` equals it.
    ids : list
        Id labels; a row belongs to an id entry when ``row[1]`` equals it.
    work_table : list[list]
        Rows whose first cell is an iterable of word strings.

    Returns
    -------
    list[list]
        Entries ``[label, size, words]`` where ``words`` is a set and ``size``
        is ``len(words)``. Order: one entry per age, then one per id, then a
        final ``'all'`` entry.
    """
    age_entries = [[age, 0, set()] for age in ages]
    id_entries = [[person_id, 0, set()] for person_id in ids]
    all_entry = ['all', 0, set()]
    vocabulary = age_entries + id_entries + [all_entry]

    for row in work_table[:-1]:
        # Normalise each word once per row instead of once per level.
        row_words = {word.strip().lower() for word in row[0]}

        # set.update already ignores duplicates, so no membership pre-check.
        for entry in age_entries:
            if row[2] == entry[0]:
                entry[2].update(row_words)

        for entry in id_entries:
            if row[1] == entry[0]:
                entry[2].update(row_words)

        all_entry[2].update(row_words)

    for entry in vocabulary:
        # Sizes are derived once at the end; they always equal len(words).
        entry[1] = len(entry[2])
        print(f"{entry[0]} size: {entry[1]} len: {len(entry[2])}")

    return vocabulary

In [6]:
# CALCULATE WORD FREQUENCY
def calculate_word_frequency(vocabulary, work_table, ages=None):
    """Count how often each vocabulary word occurs per age level and overall.

    Words are normalised with ``word.strip().lower()`` before counting. The
    last row of ``work_table`` is not processed.

    Parameters
    ----------
    vocabulary : list[list]
        Entries ``[label, size, words]``. The first ``len(ages)`` entries are
        used as the age-level entries (same order as ``ages``) and the last
        entry as the overall entry.
    work_table : list[list]
        Rows whose first cell is an iterable of word strings and whose third
        cell is the age label.
    ages : list, optional
        Age labels. When omitted, the notebook-level global ``ages`` is used,
        which is what this function always did before the parameter existed.
        Passing it explicitly removes the dependency on hidden kernel state.

    Returns
    -------
    list[list]
        Entries ``[label, {word: count}]`` in the order of ``vocabulary``.
        NOTE(review): entries between the age levels and the final one (the
        person-level entries) are created with zero counts and never
        incremented here - confirm whether that is intended.

    Raises
    ------
    NameError
        If ``ages`` is neither passed nor defined as a global.
    KeyError
        If a row contains a word missing from the matching vocabulary entry.
    """
    if ages is None:
        # Backward-compatible fallback to the global that was read implicitly.
        try:
            ages = globals()['ages']
        except KeyError:
            raise NameError("name 'ages' is not defined") from None

    # One zero-initialised counter per vocabulary entry.
    word_freq = [[entry[0], dict.fromkeys(entry[2], 0)] for entry in vocabulary]

    for row in work_table[:-1]:
        row_words = [word.strip().lower() for word in row[0]]

        # Age level: position j in `ages` addresses position j in word_freq.
        for j, age in enumerate(ages):
            if row[2] == age:
                age_counts = word_freq[j][1]
                for word in row_words:
                    age_counts[word] += 1

        # Overall level: always the last entry.
        for word in row_words:
            word_freq[-1][1][word] += 1

    print('word freq calulation: done')
    return word_freq

In [7]:
# PRINT INTO FILE
def print_dictionary(word_freq, out_dir='./DICT/', encoding=None):
    """Write one ``<label>_dict.csv`` file per word-frequency entry.

    Each file holds one ``word: count`` line per dictionary item.

    Parameters
    ----------
    word_freq : list[list]
        Entries ``[label, {word: count}]``; ``label`` becomes part of the
        file name.
    out_dir : str, optional
        Target directory, created if it does not exist yet. Defaults to
        ``'./DICT/'``, the location that used to be hard-coded.
    encoding : str or None, optional
        Passed to ``open``. ``None`` keeps the platform default, as before;
        pass e.g. ``'utf-8'`` for output that does not depend on the platform.

    Returns
    -------
    list[list]
        ``word_freq`` unchanged.
    """
    import os  # local import: the notebook's import cell does not provide os

    # Previously a missing output directory raised FileNotFoundError.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for entry in word_freq:
        label, counts = entry[0], entry[1]
        target = os.path.join(out_dir, label + '_dict.csv')
        with open(target, 'w', encoding=encoding) as doc_out:
            for word, freq in counts.items():
                doc_out.write(word + ': ' + str(freq) + '\n')

    print('print to file: done')
    return word_freq

In [10]:
# TUNERS:
# File-name suffixes of the available input variants; one is picked by index below.
paths = ['_in_v2', '_1norm', '_1Snorm', '_2norm', '_2Snorm']

# EXECUTE:
# paths[4] is '_2Snorm'; load() appends '.csv', so this reads ./IO_YO/all_2Snorm.csv.
input_table = load('./IO_YO/all' + paths[4])
# `ages` must remain a notebook-level name: calculate_word_frequency() reads it as a global.
ids, ages = aggregate(input_table)
# transform_first_column() mutates its argument and returns it, so `work_table`
# and `input_table` are the same list object from here on.
work_table = transform_first_column(input_table)             

vocabulary = create_vocabulary(ages, ids, work_table)
word_freq = calculate_word_frequency(vocabulary, work_table)
# File export is disabled; enabling it writes one file per entry into ./DICT/.
#print_dictionary(word_freq)

input table for ./IO_YO/all_2Snorm: done
aggregation levels created
1st column string conversion: done
y size: 4630 len: 4630
o size: 4079 len: 4079
p1 size: 69 len: 69
p2 size: 291 len: 291
p3 size: 141 len: 141
p4 size: 270 len: 270
p5 size: 189 len: 189
p6 size: 186 len: 186
p7 size: 120 len: 120
p8 size: 232 len: 232
p9 size: 319 len: 319
p11 size: 357 len: 357
p12 size: 234 len: 234
p13 size: 110 len: 110
p14 size: 96 len: 96
p15 size: 342 len: 342
p16 size: 242 len: 242
p17 size: 154 len: 154
p18 size: 250 len: 250
p19 size: 189 len: 189
p20 size: 113 len: 113
p21 size: 210 len: 210
p22 size: 273 len: 273
p24 size: 103 len: 103
p25 size: 57 len: 57
p27 size: 322 len: 322
p28 size: 294 len: 294
p29 size: 414 len: 414
p30 size: 201 len: 201
p32 size: 152 len: 152
p33 size: 122 len: 122
p34 size: 473 len: 473
p38 size: 129 len: 129
p40 size: 243 len: 243
p56 size: 176 len: 176
p57 size: 128 len: 128
p58 size: 15 len: 15
p59 size: 181 len: 181
p60 size: 161 len: 161
p61 size: 182 len

[['y',
  {'': 366,
   'mutter': 12,
   'systole': 2,
   'mahnung': 1,
   'dings-sack-bums': 1,
   'autistisch': 1,
   'wurst': 8,
   'bällchen': 3,
   'verrückt': 1,
   'material': 1,
   'grenzerlös': 3,
   'köpfen': 2,
   'mireille': 1,
   'phantasievoll': 1,
   'sagst': 1,
   'verbauen': 1,
   'sissi': 1,
   'gefarmed': 1,
   'finanzieren': 2,
   'vorstellung': 3,
   'sparen': 6,
   'optisch': 3,
   'neu': 42,
   'badezimmer': 3,
   'sprüchlein': 1,
   'fair': 2,
   'verstecken': 1,
   'verpeilen': 1,
   'muss': 2,
   'raufziehen': 1,
   'schrauben': 1,
   'montieren': 1,
   'waffengebrauch': 1,
   'gehängte': 1,
   'frontalunterricht': 1,
   'autobahn': 3,
   'fastnacht': 1,
   'telefonieren': 4,
   'menge': 1,
   'streng': 3,
   'authentisch': 2,
   'solarpannel': 1,
   'spaghetti': 3,
   'reinschnipseln': 1,
   'sponsor': 1,
   'doppelgänger': 3,
   'möglich': 4,
   'naherholungsqualität': 1,
   'glätteisen': 1,
   'verhaften': 1,
   'erfolgreich': 2,
   'sitzen': 12,
   'engelber