In [1]:
# IMPORT PACKAGES
import spacy, string
nlp = spacy.load('de_core_news_sm')
from spacy.lang.de import German

In [2]:
# LOAD DATA S.T. 1 LINE IN XLSX = 1 DOCUMENT
def load (path):
    data_raw = open(path + '.csv', encoding = 'utf-8').read().replace('\"', '').replace('\ufeff', '')
    data_1row_1string = data_raw.split('\n')
    data_1row_stringlist = [row.split(';') for row in data_1row_1string] #ebbe kell a végén visszatölteni, a.k.a. row[0]-t felülírni
    
    print('data from ' + path + ' is loaded')
    return data_1row_stringlist

In [3]:
# EXTRACT TRANSCRIPTIONS
def extract (data_1row_stringlist):
    column1 = [row[0] for row in data_1row_stringlist]
    column1_1row_tokenlist = [nlp(row) for row in column1]
    print('column extraction is done')
    
    return column1_1row_tokenlist

In [4]:
# normalization, spacy lemmatization
def normalize (column1_1row_tokenlist):
    column1_normalized = []
    for row in column1_1row_tokenlist:
        row_lemmatized = [token.lemma_.lower().replace('ß', 'ss').replace('\'s', '').replace('’s', '') for token in row if (not token.text.isdigit() and token.is_punct == False and len(token) > 1)]
        column1_normalized.append(row_lemmatized)
    print('spacy normalization is done')
    
    return column1_normalized

In [5]:
# optimization with gerTwol
def gertwol_optimize (column1_normalized):

    gertwol_raw = nlp(open('GERTWOL_LIST.csv', encoding = 'utf-8').read().replace('\ufeff', ''))
    gertwol_list = [row.text.split(';') for row in gertwol_raw if row.text != '\n'] 

    lemma_dict = {}
    for i in range(0, len(gertwol_list)-1):    
        row = gertwol_list[i]
        lemma_dict[row[0]] = [row[1]]

    column1_gertwoled = []
    j = 0
    with open('Test.csv', 'w') as test:
        for row in column1_normalized:
            for i in range (0, len(row)-1):
                if (row[i] in lemma_dict.keys() and row[i] != lemma_dict[row[i]][0]):
                    test.write(row[i] + ': ' + lemma_dict[row[i]][0] + '\n')
                    row[i] = lemma_dict[row[i]][0] #returns value
                    j = j + 1
            column1_gertwoled.append(row)       

    print('gertwol optimization is done, nr of changed items: ' + str(j))
    return column1_gertwoled

In [6]:
# REMOVE STOPWORDS
def remove_stopwords (column1):
    column1_nostops = []
    stopwords = open('STOPWORDS.csv', encoding = 'utf-8').read().replace('\ufeff', '').split('\n')

    for row in column1:      
        row_nostops = [word for word in row if word not in stopwords]
        column1_nostops.append(row_nostops) 
       
    print('stopword removal is done')
    return column1_nostops

In [7]:
#insert transcription back to table and print
def reinsert_and_save(data_1row_stringlist, column1_normalized, path):
    i = 0
    for row in column1_normalized:
        data_to_string = ''
        for word in row:
            data_to_string = data_to_string + word + ' ' 
        data_1row_stringlist[i][0] = data_to_string
        i = i + 1
   
    with open(path + 'norm.csv', 'w', encoding = 'utf-8') as doc_out:
        for row in data_1row_stringlist:
            for cell in row:
                doc_out.write(cell + ';')
            doc_out.write('\n')
    print(path + ' output is saved')

    return doc_out

In [8]:
# EXECUTE
paths = ['./IO_YO/all', './IO_YO/Test']

doc_in = load(paths[0] + '_in_v2')
doc_norm = normalize(extract(doc_in))
doc_gert = gertwol_optimize (doc_norm) 

doc_out = reinsert_and_save(doc_in, doc_norm, paths[0] + '_1')
doc_out = reinsert_and_save(doc_in, doc_gert, paths[0] + '_2')    
doc_out = reinsert_and_save(doc_in, remove_stopwords (doc_norm), paths[0] + '_1S')    
doc_out = reinsert_and_save(doc_in, remove_stopwords (doc_gert), paths[0] + '_2S') 

data from ./IO_YO/all_in_v2 is loaded
column extraction is done
spacy normalization is done
gertwol optimization is done, nr of changed items: 8091
./IO_YO/all_1 output is saved
./IO_YO/all_2 output is saved
stopword removal is done
./IO_YO/all_1S output is saved
stopword removal is done
./IO_YO/all_2S output is saved
