In [1]:
# IMPORT PACKAGES
# Load German tokenizer, tagger, parser, NER and word vectors, source: https://spacy.io/models/de
import spacy, string
nlp = spacy.load('de_core_news_sm')
from spacy.lang.de import German
from spacy.lang.de.stop_words import STOP_WORDS

# regular expressions, source: https://docs.python.org/3.5/library/re.html
import re, os

# EACH FUNCTION IS RESPONSIBLE FOR A SINGLE STEP IN THE NORMALIZATION PROCESS.
# EACH FUNCTION HAS A SAVE OPTION THAT ALWAYS OVERWRITES THE SAME FILE AND SERVES PURELY AS A CHECK-POINT.

In [2]:
# LOADING DATASET
def load_data (path): 
    """Read the raw dataset ``<path>_in.csv`` and strip every double quote.

    Parameters
    ----------
    path : str
        Path prefix of the dataset; the suffix ``_in.csv`` is appended here.

    Returns
    -------
    str
        The file content decoded as latin-1 with all ``"`` characters removed.

    Raises
    ------
    FileNotFoundError
        If ``<path>_in.csv`` does not exist. The message is still printed first.
    """
    try:
        # the context manager closes the handle even if reading fails
        with open(path + '_in.csv', encoding = 'latin-1') as doc_in:
            data_raw = doc_in.read().replace('\"', '')
    except FileNotFoundError:
        print('file ' + path + '_in.csv not found')
        # re-raise: falling through would reach `return data_raw` with the name unbound
        raise

    print('data load is done')
    return data_raw

In [3]:
# TOKENIZATION AND POS-TAGGING
def tokenize (data, save):
    """Run the loaded spaCy pipeline ``nlp`` over the raw text.

    data -- text handed to ``nlp``
    save -- when truthy, the result is also passed to ``save_checkpoint``
            under the step name 'tok'

    Returns whatever ``nlp(data)`` produced.
    """
    parsed = nlp(data)

    # optional check-point of this step
    if save:
        save_checkpoint(parsed, 'tok')

    print('tokenization and pos-tagging are done')
    return parsed

In [4]:
# REMOVE PUNCTUATION
def remove_punctuation (data, save):
    """Filter punctuation tokens out of a token sequence.

    data -- iterable of tokens exposing ``is_punct`` and ``text``
    save -- when truthy, the filtered list is passed to ``save_checkpoint``
            under the step name 'pun'

    Returns a list holding the surviving token objects in their original order.
    """
    kept = []
    for tok in data:
        # skip punctuation as well as the literal token 'so.' (any casing)
        if tok.is_punct or tok.text.lower() == 'so.':
            continue
        kept.append(tok)

    if save:
        save_checkpoint(kept, 'pun')

    print('punctuation removal is done')
    return kept

In [5]:
# CASE FOLDING - MAPPING ALL WORDS TO LOWER CASE
def lowerize (data, save):
    """Case folding: turn every token into its lower-cased text.

    data -- iterable of tokens exposing ``text``
    save -- when truthy, the result is passed to ``save_checkpoint``
            under the step name 'low'

    Returns a list of lower-case strings, one per input token.
    """
    lowered = []
    for tok in data:
        lowered.append(tok.text.lower())

    if save:
        save_checkpoint(lowered, 'low')

    print('case folding is done')
    return lowered

In [6]:
# REMOVE OPERATOR LABELS, NUMBERS AND OTHER NOISE FROM TEXT
def reduce_noise (data, save):
    """Strip operator labels, digits and single characters from a word list.

    The steps run in this order:
      1. drop words that are nothing but an operator label ('xxx', 'yyy', '*', ...)
         and cut label fragments out of the remaining words,
      2. rewrite 'ß' as 'ss' and remove the genitive endings "'s" / "’s",
      3. drop words consisting only of digits,
      4. drop words shorter than two characters, unless the word is white space
         containing a line break (those are kept).

    Parameters
    ----------
    data : iterable of str
        Lower-cased words.
    save : bool
        When truthy, the result is passed to ``save_checkpoint`` (step 'noi').

    Returns
    -------
    list of str
        The cleaned words in their original order.
    """
    # raw string: in a plain literal '\s' is an invalid escape sequence
    complex_white_space = re.compile(r'^\s*\n+\s*$')

    # removes labels added by operators
    noise = {'xxxx', 'xxx', 'xx', 'yyyy', 'yyy', 'zzz', '*'}
    data_noxy = [word.replace('xxx-', '').replace('xxx', '').replace('.yyy', '').replace('yyy', '').replace('*', '') for word in data if word.strip() not in noise]

    # replaces 'ß' by 'ss' and removes the endings 's / ’s
    data_ss = [word.replace('ß', 'ss').replace('\'s', '').replace('’s', '') for word in data_noxy]

    # deletes numbers written as numbers
    data_num = [word for word in data_ss if not word.isdigit()]

    # ignore words of a single character (and words emptied by the steps above),
    # but keep line-break white space
    data_clean = [word for word in data_num if (len(word) >= 2 or complex_white_space.match(word))]

    if (save):
        save_checkpoint(data_clean, 'noi')

    print('noise reduction is done')
    return data_clean

In [7]:
# LEMMATIZATION
def lemmatize (data, save): 
    """Replace every word by its lemma.

    data -- iterable of strings; converted back to tokens via ``string_to_token``
    save -- when truthy, the result is passed to ``save_checkpoint``
            under the step name 'lem'

    Returns a list with one lemma string per token, with 'ß' written as 'ss'.
    """
    tokens = string_to_token(data)

    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.replace('ß', 'ss'))

    if save:
        save_checkpoint(lemmas, 'lem')

    print('lemmatization is done')
    return lemmas

In [8]:
# EXTEND STOPWORD LIST, REMOVE STOPWORDS
def remove_stopwords (data, save):
    """Extend the spaCy stop-word set with the project list and filter ``data``.

    The file ``MY_STOP_WORDS.txt`` (relative to the working directory) is
    tokenized with ``nlp``; the text of every token is added to the shared
    ``STOP_WORDS`` set, which is therefore modified in place.

    Parameters
    ----------
    data : iterable of str
        Words to filter.
    save : bool
        When truthy, the result is passed to ``save_checkpoint`` (step 'sto').

    Returns
    -------
    list of str
        The words of ``data`` that are not in ``STOP_WORDS``, in original order.

    Raises
    ------
    FileNotFoundError
        If ``MY_STOP_WORDS.txt`` is missing.
    """
    # close the word-list file deterministically instead of leaking the handle
    with open('MY_STOP_WORDS.txt') as stop_file:
        MY_STOP_WORDS = nlp(stop_file.read())

    # add the token *texts*; the former guard tested the token object itself,
    # which cannot match the string entries of the set, so it never skipped anything
    STOP_WORDS.update(token.text for token in MY_STOP_WORDS)

    data_nostops = [word for word in data if word not in STOP_WORDS]

    if (save):
        save_checkpoint(data_nostops, 'sto')

    print('stop word removal is done')
    return data_nostops

In [9]:
# NORMALIZE SPELLING VARIANTS OF INTERJECTIONS AND FILLER WORDS
def remove_stopwords2 (data, save):
    """Normalize spelling variants of interjections and filler words.

    Despite its name, this step removes nothing. Every word first goes through
    a fixed series of substring rewrites (e.g. 'okay' -> 'ok'); if the result is
    a known spelling variant (e.g. 'jaja', 'hahaha', 'hmm') it is replaced by
    its canonical form. All other words pass through unchanged.

    Earlier versions discarded the results of ``str.replace`` and used ``==``
    where an assignment was meant, so the input came back unmodified. They also
    parsed ``MY_STOP_WORDS.txt`` without using it; that load has been dropped.

    Parameters
    ----------
    data : iterable of str
        Words to normalize; the input itself is not modified.
    save : bool
        When truthy, the result is passed to ``save_checkpoint`` (step 'sto').

    Returns
    -------
    list of str
        A new list with one entry per input word, in original order.
    """
    # substring rewrites, applied to every word in exactly this order
    rewrites = (
        ('naja', 'na ja'),
        ('ahja', 'ah ja'),
        ('ahso', 'ah so'),
        ('okay', 'ok'),
        ('ahaa', 'aha'),
        ('whoa', 'wow'),
        ('nöö', 'nö'),
        ('jöö', 'jö'),
        ('hey', 'hei'),
        ('oups', 'ups'),
        ('eieiei', 'eiei'),
    )

    # canonical form -> spellings collapsed onto it (whole-word match only).
    # 'naja' is kept from the original list, but it is rewritten to 'na ja'
    # above before this lookup happens.
    variants = {
        'ja': ['jajajajaja','jajajaja','jajaja','jaja','jaaa','jaa','jja','naja','tja'],
        'nein': ['neinnein','neein','neine','nein','nei'],
        'haha': ['hahaha','haha','hihihi','hihi','hehehe','hehe','hohoho','hoho'],
        'yey': ['yay', 'yeah', 'ey'],
        'blabla': ['blablablablabla','blablablabla','blablabla'],
        'hm': ['hm', 'hmm', 'mmh', 'mmmm', 'mmm', 'mm'],
        'mhm': ['mhh','mh'],
        'ah': ['aaah','aah','ahh','ach','ooh','ohh','oh','uh'],
        'öhm': ['eh', 'ähm', 'äh', 'ööh', 'öh'],
        'hä': ['ha', 'he', 'hää', 'häh', 'hö'],
    }
    canonical = {}
    for target, spellings in variants.items():
        for spelling in spellings:
            # first group wins, mirroring the order of the former if/elif chain
            canonical.setdefault(spelling, target)

    data_nostops = []
    for word in data:
        for old, new in rewrites:
            word = word.replace(old, new)
        data_nostops.append(canonical.get(word, word))

    if (save):
        save_checkpoint(data_nostops, 'sto')

    print('stop word removal is done')
    return data_nostops

In [10]:
# RECONVERT STRINGS TO TOKENS - HELPER METHOD
def string_to_token (data):
    """Glue a list of words back into one text and run it through ``nlp``.

    data -- iterable of strings

    Each word is followed by a single blank (the last one included) before
    the text is handed to ``nlp``; the value ``nlp`` returns is passed on.
    """
    joined = ''.join(word + ' ' for word in data)
    return nlp(joined)

In [11]:
# CREATE CHECKPOINT - HELPER METHOD
def save_checkpoint(data, step):
    """Write an intermediate result to ``<path>_<step>.csv`` (latin-1).

    NOTE: ``path`` is not a parameter; it is read from the enclosing global
    scope and has to be set before any pipeline step is run with ``save`` on.

    Parameters
    ----------
    data : iterable
        Either plain strings or objects exposing ``text``. An empty
        ``data`` produces an empty file.
    step : str
        Short step name used in the file name (e.g. 'tok', 'low').

    Returns
    -------
    The ``data`` argument, unchanged.
    """
    with open(path + '_' + step + '.csv', 'w', encoding = 'latin-1') as doc:
        for element in data:
            # decide per element instead of peeking at data[0], which failed
            # with IndexError on empty input after the file was already truncated
            text = element if isinstance(element, str) else element.text
            doc.write(text + ' ')

    return data

In [12]:
# SAVE OUTPUT INTO FILE
def save_output (data, path, print_type):
    """Write the normalized words to ``<path>_norm.csv`` (latin-1).

    Parameters
    ----------
    data : sequence of str
        Normalized words; may contain white-space items that mark line breaks.
    path : str
        Path prefix of the dataset; the suffix ``_norm.csv`` is appended here.
    print_type : str
        'flow'   - running text: every non-white-space word followed by a blank.
        'list'   - one stripped word per line; white-space items are skipped.
        'record' - words separated by blanks; a white-space item containing a
                   line break starts a new line. Runs of such items collapse
                   into one line break, and such items at the very start or
                   end of ``data`` are dropped.
        Any other value prints an error message; the (empty) file is still created.

    Returns
    -------
    The file object used for writing; it is already closed on return.
    """
    # raw strings: in a plain literal '\s' is an invalid escape sequence
    white_space = re.compile(r'^\s+$')
    complex_white_space = re.compile(r'^\s*\n+\s*$')

    with open(path + '_norm.csv', 'w', encoding = 'latin-1') as doc_out:

        # save doc as running text
        if print_type == 'flow':
            for word in data:
                if not white_space.match(word):
                    doc_out.write(word + ' ')
            print('output is saved')

        # save doc as a list of words
        elif print_type == 'list':
            for word in data:
                if not white_space.match(word):
                    doc_out.write(word.strip() + '\n')
            print('output is saved')

        # save doc with one record per line
        elif print_type == 'record':
            last = len(data) - 1
            # a single pass over all items: the former split into first item /
            # range(1, len(data)-2) / last item never wrote the second-to-last
            # item, failed on empty data and wrote a single item twice
            for i, word in enumerate(data):
                if complex_white_space.match(word):
                    # only an inner separator that is the last of its run breaks the line
                    if 0 < i < last and not complex_white_space.match(data[i + 1]):
                        doc_out.write('\n')
                elif i == last:
                    # no trailing blank after the final word
                    doc_out.write(word.strip())
                else:
                    doc_out.write(word.strip() + ' ')
            print('output is saved')

        else:
            print('error: invalid print_type in save_output()')

    return doc_out

In [13]:
# NOTE: this cell is disabled. The whole batch driver below is wrapped in a
# triple-quoted string, so running the cell only evaluates to that string and
# executes none of the pipeline.
# If the driver is re-enabled: `path is not './IO_P/\n'` tests object identity,
# not string equality, and should be written with `!=`.
"""

# TUNERS
save = True
print_type = 'record'

# tuners for group level analysis
paths = ['./IO_YO/young', './IO_YO/old']

# tuners for person level analysis
#participant_ids = nlp(open('PARTICIPANT_ID.txt').read())
#paths = ['./IO_P/' + token.text for token in participant_ids]

# EXECUTE
for path in paths:
    if path is not './IO_P/\n':
        print(path)
        save_output(remove_stopwords
                    (lemmatize
                     (reduce_noise
                      (lowerize
                       (remove_punctuation
                        (tokenize
                         (load_data(
                            path),save), 
                         save), 
                        save), 
                       save), 
                      save), 
                     save), path, print_type)
"""

"\n\n# TUNERS\nsave = True\nprint_type = 'record'\n\n# tuners for group level analysis\npaths = ['./IO_YO/young', './IO_YO/old']\n\n# tuners for person level analysis\n#participant_ids = nlp(open('PARTICIPANT_ID.txt').read())\n#paths = ['./IO_P/' + token.text for token in participant_ids]\n\n# EXECUTE\nfor path in paths:\n    if path is not './IO_P/\n':\n        print(path)\n        save_output(remove_stopwords\n                    (lemmatize\n                     (reduce_noise\n                      (lowerize\n                       (remove_punctuation\n                        (tokenize\n                         (load_data(\n                            path),save), \n                         save), \n                        save), \n                       save), \n                      save), \n                     save), path, print_type)\n"

In [14]:

# EXECUTE - VERSION FOR TESTS
# Runs the normalization pipeline step by step on a single dataset so that
# every intermediate result (a .. h) stays available for inspection.
save = True  # every step also calls save_checkpoint()
print_type = 'record'  # output layout, interpreted by save_output()
paths = ['./IO_YO/young', './IO_YO/old', './IO_P/p1-y-ph1', './IO_P/p100-y-ph8']
# `path` has to remain a notebook-level name: save_checkpoint() reads it as a global
path =  paths[1]

a = load_data(path)  # raw text with double quotes removed
b = tokenize(a, save)  # result of nlp(a)
c = remove_punctuation(b, save)  # tokens without punctuation
d = lowerize (c, save)  # lower-cased token texts
e = reduce_noise(d, save)  # operator labels, digits and single characters removed
f = lemmatize (e, save)  # lemmas, 'ß' written as 'ss'
g = remove_stopwords (f, save)  # lemmas that are not stop words
h = save_output (g, path, print_type)  # writes <path>_norm.csv
#print(d)


data load is done
tokenization and pos-tagging are done
punctuation removal is done
case folding is done
noise reduction is done
lemmatization is done
stop word removal is done
output is saved
