## Data Preparation

Get word vectors from the data using the Natural Language Toolkit (NLTK).

In [12]:
import nltk
import json

The stopwords corpus contains common, uninformative words that we don't want to take into account.

In [13]:
from nltk.corpus import stopwords
# German stopwords for the bulk of the corpus, English ones because the more
# recent articles contain English text.
sw_de, sw_en = (stopwords.words(language) for language in ('german', 'english'))

The stemmer reduces words to their word stem, so that we don't count each inflection of a word as a different word.

In [3]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for German; shared by all cells below
stemmer = SnowballStemmer(language="german")

Get text data in list format

In [15]:
import re
def prepare_data(filename, stop_words=None, stem_fn=None):
    """Load an article dump and turn every text field into a list of stemmed tokens.

    The JSON file must contain a top-level ``"matches"`` list of dicts. Every
    string-valued field of every match becomes one token list in the result,
    which is the input format the ``Word2Vec`` cells below expect.

    Per field: digits and periods are stripped, the text is split on non-word
    characters, stopwords are dropped (before and after stemming), the
    remaining words are stemmed and single-character tokens are removed.

    Args:
        filename: Path of the JSON file to read (decoded as UTF-8).
        stop_words: Optional iterable of lower-case stopwords. Defaults to the
            union of the module-level ``sw_de`` and ``sw_en`` lists.
        stem_fn: Optional callable mapping a word to its stem. Defaults to
            ``stemmer.stem`` of the module-level stemmer.

    Returns:
        A list with one list of tokens per processed text field, in file
        order. A field without any remaining token yields an empty list.
    """
    # Sets give constant-time membership tests; the filters run once per token.
    if stop_words is None:
        stop_words = set(sw_de) | set(sw_en)
    else:
        stop_words = set(stop_words)
    if stem_fn is None:
        stem_fn = stemmer.stem

    # JSON is UTF-8 encoded; do not rely on the platform default encoding,
    # otherwise umlauts can be mis-decoded.
    with open(filename, "r", encoding="utf-8") as read_file:
        data = json.load(read_file)

    wordlist = []
    for element in data['matches']:
        # we don't want to vectorize the release date
        skipped = {'release_date'}
        # sometimes the teaser title repeats the title, or the teaser text
        # repeats the subtitle; we don't want to count the same content twice
        for original, duplicate in (('title', 'teaser_title'), ('subtitle', 'teaser_text')):
            if original in element and duplicate in element and element[original] == element[duplicate]:
                skipped.add(duplicate)

        for key, val in element.items():
            # non-text values (numbers, lists, null) cannot be tokenised
            if key in skipped or not isinstance(val, str):
                continue
            # remove numbers, since they won't be particularly telling without context
            val = re.sub(r'[0-9\.]+', '', val)
            # split the text into words, dropping punctuation
            words = re.sub(r"[^\w]", " ", val).split()
            # drop German *and* English stopwords before stemming: the German
            # stemmer would otherwise mangle English stopwords so that they no
            # longer match the stopword list afterwards
            kept = [w for w in words if w.lower() not in stop_words]
            # replace words with their stem
            stems = [stem_fn(w) for w in kept]
            # a stem can itself be a stopword; also remove single letters
            wordlist.append([s for s in stems if len(s) > 1 and s not in stop_words])

    return wordlist

## Train Models

To get the corresponding vectors, we use gensim's Word2Vec implementation.

In [2]:
from gensim.models import Word2Vec

In [17]:
# 1946 to 1960: tokenise the corpus, train Word2Vec and save the word vectors.
wordlist = prepare_data("1946_1960.json")
# min_count was left empty here (a syntax error); 5 is the value used for
# every other period, which keeps the models comparable.
model_1946_1960 = Word2Vec(wordlist, min_count=5)
vecs_1946_1960 = model_1946_1960.wv
vecs_1946_1960.save('vecs_1946_1960.kv')

In [19]:
# Period 1960-1990: build the token lists, fit Word2Vec and store the word vectors
wordlist = prepare_data("1960_1990.json")
model_1960_1990 = Word2Vec(sentences=wordlist, min_count=5)
vecs_1960_1990 = model_1960_1990.wv
vecs_1960_1990.save("vecs_1960_1990.kv")

In [25]:
# Period 1990-2000: build the token lists, fit Word2Vec and store the word vectors
wordlist = prepare_data("1990_2000.json")
model_1990_2000 = Word2Vec(sentences=wordlist, min_count=5)
vecs_1990_2000 = model_1990_2000.wv
vecs_1990_2000.save("vecs_1990_2000.kv")

In [28]:
# Period 2000-2010: build the token lists, fit Word2Vec and store the word vectors
wordlist = prepare_data("2000_2010.json")
model_2000_2010 = Word2Vec(sentences=wordlist, min_count=5)
vecs_2000_2010 = model_2000_2010.wv
vecs_2000_2010.save("vecs_2000_2010.kv")

In [30]:
# Period 2010-2021: build the token lists, fit Word2Vec and store the word vectors
wordlist = prepare_data("2010_2021.json")
model_2010_2021 = Word2Vec(sentences=wordlist, min_count=5)
vecs_2010_2021 = model_2010_2021.wv
vecs_2010_2021.save("vecs_2010_2021.kv")

In [1]:
from gensim.models import KeyedVectors

# Reload the word vectors stored in vecs_2010_2021.kv
reloaded_word_vectors = KeyedVectors.load("vecs_2010_2021.kv")

In [2]:
# Ten nearest neighbours of the stem "mann" (topn=10 is gensim's default)
reloaded_word_vectors.most_similar("mann", topn=10)

[('madch', 0.7156395316123962),
 ('jungfraulich', 0.6510637402534485),
 ('jahrig', 0.6335220336914062),
 ('vergewaltigt', 0.6326545476913452),
 ('mutt', 0.6281175017356873),
 ('frau', 0.595895528793335),
 ('begrapscht', 0.5917845964431763),
 ('vat', 0.5873454213142395),
 ('mannlich', 0.5871764421463013),
 ('studentin', 0.5830567479133606)]

## Save vectors to csv files

In [22]:
import pandas as pd

In [23]:
# Export the 1946-1960 vectors: one row per vector, labelled with its word
df = (
    pd.DataFrame(vecs_1946_1960.vectors)
    .assign(word=vecs_1946_1960.index_to_key)
    .set_index("word")
)
df.to_csv("vecs_1946_1960.csv")

In [24]:
# Export the 1960-1990 vectors: one row per vector, labelled with its word
df = (
    pd.DataFrame(vecs_1960_1990.vectors)
    .assign(word=vecs_1960_1990.index_to_key)
    .set_index("word")
)
df.to_csv("vecs_1960_1990.csv")

In [26]:
# Export the 1990-2000 vectors: one row per vector, labelled with its word
df = (
    pd.DataFrame(vecs_1990_2000.vectors)
    .assign(word=vecs_1990_2000.index_to_key)
    .set_index("word")
)
df.to_csv("vecs_1990_2000.csv")

In [29]:
# Export the 2000-2010 vectors: one row per vector, labelled with its word
df = (
    pd.DataFrame(vecs_2000_2010.vectors)
    .assign(word=vecs_2000_2010.index_to_key)
    .set_index("word")
)
df.to_csv("vecs_2000_2010.csv")

In [31]:
# Export the 2010-2021 vectors: one row per vector, labelled with its word
df = (
    pd.DataFrame(vecs_2010_2021.vectors)
    .assign(word=vecs_2010_2021.index_to_key)
    .set_index("word")
)
df.to_csv("vecs_2010_2021.csv")

### Words with positive and negative connotations

In [46]:
# Space-separated German adjectives describing negative traits.
# NOTE(review): "herablassend", "überheblich" and "eingebildet" each appear
# twice in this list - confirm whether the duplicates are intended.
negative_eigenschaften ='affektiert gekünstelt geziert aggressiv angeberisch anmaßend arglistig argwöhnisch arrogant aufdringlich herablassend überheblich eingebildet boshaft cholerisch reizbar jähzornig dekadent hetzerisch dreist egozentrisch eifersüchtig einfältig eingebildet eitel elitär fies garstig großspurig herablassend hinterhältig hochmütig hysterisch ignorant intrigant langweilig manipulativ narzisstisch neurotisch oberflächlich protzig reserviert resigniert rücksichtslos scheinheilig schlampig selbstgefällig selbstgerecht selbstsüchtig selbstverliebt skrupellos spießig stur überheblich unnahbar unsozial verbohrt verlogen verschlagen versnobt willkürlich zynisch'

In [47]:
# Space-separated German adjectives describing positive traits.
# NOTE(review): "Mutig" and "Bescheiden" each appear twice in this list -
# confirm whether the duplicates are intended.
positive_eigenschaften = 'Liebevoll Freundlich Bescheiden Respektvoll Aufrecht Sorgfältig Nett Ambitioniert Zielbewusst Ehrlich Verlässlich Gerecht Mutig Warmherzig Intelligent Sympathisch Geduldig Beständig Ordentlich Präzise Hilfsbereit Kommunikativ Selbstbewusst Mutig Vernünftig Flexibel Ehrgeizig Verantwortlich Demütig Friedliebend Sensibel Aktiv Ausgeglichen Witzig Angenehm Attraktiv Anpassungsfähig Arbeitsam Empfindlich Sachlich Heroisch Unterhaltsam Schön Kreativ Erfinderisch Verantwortungsbewusst Schlau Gutmütig Lustig Bescheiden'

In [48]:
# Split the negative adjectives into single words and stem them with the same
# stemmer that prepare_data applies to the corpus. The isinstance guard keeps
# the cell re-runnable: without it a second run would call .split() on the
# already-split list and raise AttributeError.
if isinstance(negative_eigenschaften, str):
    negative_eigenschaften = negative_eigenschaften.split()
stemmed_NEG = [stemmer.stem(w) for w in negative_eigenschaften]

In [49]:
# Split the positive adjectives into single words and stem them with the same
# stemmer that prepare_data applies to the corpus. The isinstance guard keeps
# the cell re-runnable: without it a second run would call .split() on the
# already-split list and raise AttributeError.
if isinstance(positive_eigenschaften, str):
    positive_eigenschaften = positive_eigenschaften.split()
stemmed_POS = [stemmer.stem(w) for w in positive_eigenschaften]

In [50]:
# Write the stemmed adjectives side by side into adjectives.csv. DataFrame
# columns must have equal length, so both lists are cut to the length of the
# shorter one instead of a hard-coded 50; surplus entries of the longer list
# are not exported.
n_pairs = min(len(stemmed_POS), len(stemmed_NEG))
df_adjectives = pd.DataFrame({
    'pos': stemmed_POS[:n_pairs],
    'neg': stemmed_NEG[:n_pairs],
})
df_adjectives.to_csv('adjectives.csv')