In [None]:
import pandas as pd
import numpy as np
import os
import translate

# Dataset 1a: German Argumentation Reviews (Thiemo)

1. Train a German GloVe embedding model on the text in column 'review'
2. WEAT Analysis
3. Word Co-occurrence analysis

In [None]:
# Directory that holds the raw input files for this dataset.
data_path = 'background/'
# Load the student-review workbook; header=1 takes the column labels from the
# second sheet row (0-indexed) and index_col=0 uses the first column as index.
student_reviews_data = pd.read_excel(
    os.path.join(data_path, '201901_Studenten Reviews BI2 full.xlsx'),
    header=1,
    index_col=0,
)
student_reviews_data

In [None]:
# Build the GloVe training corpus from the German reviews.
# Reviews are separated by a newline so that the last word of one review is
# not fused with the first word of the next one (which would create spurious
# tokens in the corpus).
full_review = '\n'.join(student_reviews_data['review'])
# Explicit UTF-8 so umlauts and other non-ASCII characters are written the same
# way regardless of the platform's default encoding.
with open("reviews_glove.txt", "w", encoding="utf-8") as text_file:
    text_file.write(full_review)

# Dataset 1b: German → English Argumentation Reviews (Thiemo)

0. Translate text to English
1. Train an English GloVe embedding model on the translated text in column 'review'
2. WEAT Analysis
3. Word Co-occurrence analysis

benefits: direct comparison to baselines from the original paper  
cons: how much bias can be attributed to the content, and how much to the translator?

In [None]:
import deep_translator
from deep_translator import GoogleTranslator

# Shared translator instance configured with source language 'de' (German) and
# target language 'en' (English); translate_5000 calls its translate() method.
translator = GoogleTranslator(source='de', target='en')

def translate_5000(text, chunk_size=3000, translate_fn=None):
    """Translate ``text`` chunk by chunk and return the concatenated result.

    The text is cut into consecutive, non-overlapping slices of at most
    ``chunk_size`` characters. Each slice is translated on its own and the
    translations are joined back together in their original order, so every
    character of ``text`` is sent to the translator exactly once.

    Args:
        text: The string to translate.
        chunk_size: Maximum number of characters per translation call.
            Defaults to 3000 (despite the function name), the chunk length
            this function has always used.
        translate_fn: Callable that maps one chunk (``str``) to its
            translation. Defaults to the ``translate`` method of the
            notebook-level ``translator``.

    Returns:
        The translated chunks joined without a separator. An empty ``text``
        returns ``''`` without calling the translator.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if translate_fn is None:
        # Looked up at call time so the notebook-level translator is used.
        translate_fn = translator.translate

    translations = []
    # NOTE(review): chunks are cut at fixed character offsets, so a word or
    # sentence can be split across two translation calls.
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        if not chunk.strip():
            # Nothing to translate; keep the whitespace untouched instead of
            # sending a blank payload to the translator.
            translations.append(chunk)
            continue
        translations.append(translate_fn(chunk))
    return ''.join(translations)

# Translate every entry of the 'review' column. The column is overwritten in
# place, so re-running this cell without reloading the DataFrame would feed the
# already-translated text back into the translator.
student_reviews_data['review'] = student_reviews_data['review'].apply(translate_5000)

In [None]:
# Write the DataFrame (including its index) to a CSV file in the working directory.
student_reviews_data.to_csv(path_or_buf='english_translated_student_reviews.csv')

# Dataset 2: German Annotated Essays

1. Train a German GloVe embedding model on the text of the student essays
2. WEAT Analysis
3. Word Co-occurrence analysis
4. Analyze using annotations

In [None]:
# Load the German annotated essays: one .txt file per essay, named by a
# numeric essay id.
data_path = 'background/Coprus Coling/Annotations/'
# endswith() instead of a substring test, so stray files such as "12.txt~" or
# "12.txt.bak" are not picked up and cannot overwrite the real essay 12.
# sorted() because os.listdir() returns entries in arbitrary order, which
# would make the order of the resulting corpus non-reproducible.
coling_essay_files = sorted(
    name for name in os.listdir(data_path) if name.endswith(".txt")
)
essays = {}
for essay_file in coling_essay_files:
    with open(os.path.join(data_path, essay_file), "r") as myfile:
        data = myfile.read().splitlines()
    # The file stem is the integer essay id. Only line index 1 (the second
    # line of the file) is kept — presumably the essay body following a
    # title/header line; TODO confirm against the corpus layout.
    essays[int(os.path.splitext(essay_file)[0])] = data[1]
essays

In [None]:
# Concatenate all essay texts, separated by newlines, and write the resulting
# corpus to essays_glove.txt.
all_essays = '\n'.join(essays[essay_id] for essay_id in essays)
with open("essays_glove.txt", mode="w") as text_file:
    text_file.write(all_essays)

# Dataset 3: English Annotated Essays

1. Train an English GloVe embedding model on the text of the student essays
2. WEAT Analysis
3. Word Co-occurrence analysis
4. Analyze using annotations

In [None]:
# Load the English annotated essays: one "essay<N>.txt" file per essay.
data_path = 'background/ArgumentAnnotatedEssays-2.0/brat-project-final/'
# endswith() instead of a substring test, so stray files such as
# "essay001.txt.bak" are not picked up and cannot overwrite the real essay.
# sorted() because os.listdir() returns entries in arbitrary order, which
# would make the order of the resulting corpus non-reproducible.
eng_essay_files = sorted(
    name for name in os.listdir(data_path) if name.endswith(".txt")
)
# NOTE: rebinds the name `essays`, replacing any dictionary bound to it before.
essays = {}
for essay_file in eng_essay_files:
    with open(os.path.join(data_path, essay_file), "r") as myfile:
        data = myfile.read()
    # Key: the integer that follows "essay" in the file stem; value: the
    # complete file content.
    essays[int(os.path.splitext(essay_file)[0].split('essay')[1])] = data
essays

In [None]:
# Join every essay text with a newline separator and store the corpus in
# eng_essays_glove.txt.
all_essays = '\n'.join(essay_text for essay_text in essays.values())
with open("eng_essays_glove.txt", mode="w") as text_file:
    text_file.write(all_essays)

# WEAT Co-occurrence Analysis

In [None]:
import spacy

In [None]:
# Run the external script weat_cooccurrence_analysis.py in a shell, passing
# eng_essays_glove.txt as --data and output/weat_cooccurrence_analysis as --output.
# NOTE(review): --processing_cores 7 is hard-coded — presumably the number of
# worker processes; confirm it suits the machine running the notebook.
# NOTE(review): --tests 1 ... 10 presumably selects which WEAT tests to run;
# confirm against the script's argument parser.
!python weat_cooccurrence_analysis.py \
    --data "eng_essays_glove.txt" \
    --output "output/weat_cooccurrence_analysis" \
    --processing_cores 7 \
    --tests 1 2 3 4 5 6 7 8 9 10