In [None]:
def preprocessing(series, lang, pontuation = False, lemmatize=False, stemmer=False, stop_words = False):
    """Clean every text in ``series`` and return the results as a list of strings.

    Each text is always lower-cased, stripped of markup with BeautifulSoup and
    re-joined with single spaces. The remaining steps are optional and are
    applied in this order: punctuation removal, lemmatization, stemming,
    stop-word removal.

    Parameters
    ----------
    series : iterable of str
        Texts to process (e.g. a list or a pandas Series). Items are consumed
        by iteration, so the index labels of a Series do not matter.
    lang : str
        ``'eng'``, ``'finnish'`` or ``'zh'``. Selects the lemmatizer, stemmer
        and stop-word list; ``'zh'`` additionally segments the text with
        ``jieba.cut_for_search``. Lemmatization exists only for ``'eng'`` and
        stemming only for ``'eng'`` / ``'finnish'``; any other value falls back
        to the English stop-word list.
    pontuation, lemmatize, stemmer, stop_words : bool or str
        Switches for the optional steps. Booleans are the intended values; the
        legacy string ``'True'`` is still accepted.

    Returns
    -------
    list of str
        One processed, space-separated string per input text, in input order.
    """
    def _enabled(flag):
        # The original code compared the flags against the string 'True', so
        # real booleans never switched anything on. Accept both spellings.
        if isinstance(flag, str):
            return flag.strip().lower() == 'true'
        return bool(flag)

    # Everything below is loop-invariant: build it once instead of per text.
    punctuation_re = None
    if _enabled(pontuation):
        punctuation_re = re.compile(
            r'[\s+\.\!\/_,$%^*(+\"\']+|[+——！，？、~@#￥%……&*（）:：；《）《》“”()»〔〕-]+')

    lemmatizer = None
    if _enabled(lemmatize) and lang == 'eng':
        lemmatizer = WordNetLemmatizer()

    snowball_stemmer = None
    if _enabled(stemmer):
        if lang == 'finnish':
            snowball_stemmer = SnowballStemmer('finnish')
        elif lang == 'eng':
            snowball_stemmer = SnowballStemmer('english')

    # Kept in its own name: rebinding the ``stop_words`` parameter (as the
    # original did) disabled the step for every text after the first one.
    stop_word_set = None
    if _enabled(stop_words):
        if lang == 'finnish':
            stop_word_set = set(get_stop_words('finnish'))
        elif lang == 'zh':
            with open('stopwords-zh.txt', encoding='utf8') as stop_file:
                stop_word_set = {line.strip() for line in stop_file}
        else:
            stop_word_set = set(stopwords.words('english')).union(STOP_WORDS)

    processed_corpus = []

    for text in tqdm(series):
        # LOWERCASE
        text = text.lower()

        # REMOVE TAGS
        text = BeautifulSoup(text).get_text()

        # REMOVE PUNCTUATION
        if punctuation_re is not None:
            text = punctuation_re.sub(' ', text)

        # LEMMATIZATION (English only)
        if lemmatizer is not None:
            text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())

        # STEMMER (English / Finnish only)
        if snowball_stemmer is not None:
            text = ' '.join(snowball_stemmer.stem(word) for word in text.split())

        # REMOVE STOP WORDS
        # NOTE(review): for 'zh' this filter runs on whitespace-separated
        # chunks *before* jieba segmentation below - confirm that is intended.
        if stop_word_set is not None:
            text = ' '.join(word for word in text.split() if word not in stop_word_set)

        # For the en-zh language pair use jieba to cut the text into words;
        # otherwise split on whitespace. Either way the tokens are re-joined
        # with single spaces.
        if lang == 'zh':
            tokens = jieba.cut_for_search(text)
        else:
            tokens = text.split()

        processed_corpus.append(" ".join(tokens))

    return processed_corpus

In [None]:
def prep(ref_trans, pontuation = True, lemmatize=True, stemmer=True, stop_words = True):
    """Run ``preprocessing`` over the six language-pair corpora.

    Parameters
    ----------
    ref_trans : str
        Which side to process. ``'reference'`` (the historical misspelling
        ``'referece'`` is still accepted) reads the module-level reference
        corpora; ``'translation'`` reads the translation corpora.
    pontuation, lemmatize, stemmer, stop_words : bool
        Forwarded unchanged to ``preprocessing`` for every corpus.

    Returns
    -------
    tuple
        Six lists of processed strings in the order
        ``(cs, de, en_fi, en_zh, ru, zh)``.

    Raises
    ------
    ValueError
        If ``ref_trans`` is not one of the accepted values.
    """
    if ref_trans in ('referece', 'reference'):
        corpora = (references_cs, references_de, scores_en_fi['reference'],
                   scores_en_zh['reference'], references_ru, references_zh)
    elif ref_trans == 'translation':
        corpora = (translation_cs, translation_de, scores_en_fi['translation'],
                   scores_en_zh['translation'], translation_ru, translation_zh)
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on the first corpus variable.
        raise ValueError(
            "ref_trans must be 'reference' (or 'referece') or 'translation', "
            "got {!r}".format(ref_trans))

    # Language passed to ``preprocessing`` for each corpus, aligned with
    # ``corpora``: only en-fi and en-zh use a non-English setting.
    langs = ('eng', 'eng', 'finnish', 'zh', 'eng', 'eng')

    return tuple(
        preprocessing(corpus, lang, pontuation = pontuation,
                      lemmatize=lemmatize, stemmer=stemmer, stop_words = stop_words)
        for corpus, lang in zip(corpora, langs)
    )

In [None]:
# Baseline run: every optional preprocessing step is switched off.
# NOTE: several names rebound here (e.g. references_cs, translation_cs) are
# also read by prep(), so this cell consumes whatever an earlier cell left.

(references_cs, references_de, references_en_fi,
 references_en_zh, references_ru, references_zh) = prep(
    'referece',
    pontuation=False,
    lemmatize=False,
    stemmer=False,
    stop_words=False,
)

(translation_cs, translation_de, translation_en_fi,
 translation_en_zh, translation_ru, translation_zh) = prep(
    'translation',
    pontuation=False,
    lemmatize=False,
    stemmer=False,
    stop_words=False,
)

In [None]:
# Punctuation removal only (pontuation=True, every other step off).
# NOTE: several names rebound here (e.g. references_cs, translation_cs) are
# also read by prep(), so this cell consumes whatever an earlier cell left.

(references_cs, references_de, references_en_fi,
 references_en_zh, references_ru, references_zh) = prep(
    'referece',
    pontuation=True,
    lemmatize=False,
    stemmer=False,
    stop_words=False,
)

(translation_cs, translation_de, translation_en_fi,
 translation_en_zh, translation_ru, translation_zh) = prep(
    'translation',
    pontuation=True,
    lemmatize=False,
    stemmer=False,
    stop_words=False,
)

In [None]:
# Lemmatization only (lemmatize=True, every other step off).
# NOTE: several names rebound here (e.g. references_cs, translation_cs) are
# also read by prep(), so this cell consumes whatever an earlier cell left.

(references_cs, references_de, references_en_fi,
 references_en_zh, references_ru, references_zh) = prep(
    'referece',
    pontuation=False,
    lemmatize=True,
    stemmer=False,
    stop_words=False,
)

(translation_cs, translation_de, translation_en_fi,
 translation_en_zh, translation_ru, translation_zh) = prep(
    'translation',
    pontuation=False,
    lemmatize=True,
    stemmer=False,
    stop_words=False,
)

In [None]:
# Stemming only (stemmer=True, every other step off).
# NOTE: several names rebound here (e.g. references_cs, translation_cs) are
# also read by prep(), so this cell consumes whatever an earlier cell left.

(references_cs, references_de, references_en_fi,
 references_en_zh, references_ru, references_zh) = prep(
    'referece',
    pontuation=False,
    lemmatize=False,
    stemmer=True,
    stop_words=False,
)

(translation_cs, translation_de, translation_en_fi,
 translation_en_zh, translation_ru, translation_zh) = prep(
    'translation',
    pontuation=False,
    lemmatize=False,
    stemmer=True,
    stop_words=False,
)

In [None]:
# Stop-word removal only (stop_words=True, every other step off).
# NOTE: several names rebound here (e.g. references_cs, translation_cs) are
# also read by prep(), so this cell consumes whatever an earlier cell left.

(references_cs, references_de, references_en_fi,
 references_en_zh, references_ru, references_zh) = prep(
    'referece',
    pontuation=False,
    lemmatize=False,
    stemmer=False,
    stop_words=True,
)

(translation_cs, translation_de, translation_en_fi,
 translation_en_zh, translation_ru, translation_zh) = prep(
    'translation',
    pontuation=False,
    lemmatize=False,
    stemmer=False,
    stop_words=True,
)

In [None]:
# All True: every optional preprocessing step enabled for BOTH sides.
# The translation call previously passed False for every flag, so references
# and translations were preprocessed differently.
# NOTE: several names rebound here (e.g. references_cs, translation_cs) are
# also read by prep(), so this cell consumes whatever an earlier cell left.

references_cs, references_de, references_en_fi, references_en_zh,references_ru, references_zh = prep(
    'referece', pontuation = True, lemmatize=True, stemmer=True, stop_words = True)

translation_cs, translation_de, translation_en_fi, translation_en_zh, translation_ru, translation_zh = prep(
    'translation', pontuation = True, lemmatize=True, stemmer=True, stop_words = True)