In [None]:
%pip install spacy
%pip install vaderSentiment
%pip install pyarrow
%pip install torch
%pip install nltk
%pip install textblob
%pip install -U textblob-de
%pip install transformers
!{sys.executable} -m spacy download de_core_news_lg
!{sys.executable} -m spacy download de_core_news_md

In [22]:
import os
import re
import pandas as pd
from pyarrow import feather
import numpy as np

#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import sys
sys.path.append("GERVader")
from vaderSentimentGER import SentimentIntensityAnalyzer # -- https://github.com/KarstenAMF/GerVADER
from nltk.tag import pos_tag
import sys
import nltk
#nltk.download("vader_lexicon")

from scipy.special import softmax
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# GERVader

In [23]:
pd.set_option('display.max_colwidth', None)
post_data = pd.read_feather('reddit_res/posts.ftr')
comment_data = pd.read_feather('reddit_res/comments.ftr')

In [24]:
def perform_sentiment_analysis_vader(data):
    data_temp = data.copy()
    sid = SentimentIntensityAnalyzer()
    data_temp["Negative"] = data_temp["text"].apply(lambda x:sid.polarity_scores(x)["neg"])
    data_temp["Positive"] = data_temp["text"].apply(lambda x:sid.polarity_scores(x)["pos"])
    data_temp["Neutral"] = data_temp["text"].apply(lambda x:sid.polarity_scores(x)["neu"])
    data_temp["Compound"] = data_temp["text"].apply(lambda x:sid.polarity_scores(x)["compound"])
    data_temp["Overall"] = ['Positive' if score >= 0.05 else 'Negative' if score <= -0.05 else 'Neutral' for score in data_temp["Compound"]]
    return data_temp

In [25]:
post_data_vader = perform_sentiment_analysis_vader(post_data)
comment_data_vader = perform_sentiment_analysis_vader(comment_data)

In [26]:
overall_sentiments_vader = pd.concat([post_data_vader["Overall"].value_counts(), comment_data_vader["Overall"].value_counts()], axis=1)
overall_sentiments_vader.columns = ["Post Data Sentiment", "Comment Data Sentiment"]
overall_sentiments_vader

Unnamed: 0,Post Data Sentiment,Comment Data Sentiment
Positive,38,2929
Negative,6,2174
Neutral,6,2246


# TextblobDE

In [27]:
from textblob_de import TextBlobDE

In [28]:
def perform_sentiment_analysis_blob(data):
    data_temp = data.copy()
    data_temp["Polarity"] = data_temp.apply(lambda x: TextBlobDE(x["text"]).sentiment.polarity, axis=1)
    data_temp["Subjectivity"] = data_temp.apply(lambda x: TextBlobDE(x["text"]).sentiment.subjectivity, axis=1)
    data_temp["Overall"] = ['Negative' if score < 0.0 else 'Neutral' if score == 0.0 else 'Positive' for score in data_temp["Polarity"]]
    return data_temp

In [29]:
post_data_textblob = perform_sentiment_analysis_blob(post_data)
comment_data_textblob = perform_sentiment_analysis_blob(comment_data)

In [30]:
overall_sentiments_blob = pd.concat([post_data_textblob["Overall"].value_counts(), comment_data_textblob["Overall"].value_counts()], axis=1)
overall_sentiments_blob.columns = ["Post Data Sentiment", "Comment Data Sentiment"]

In [31]:
overall_sentiments_blob

Unnamed: 0,Post Data Sentiment,Comment Data Sentiment
Positive,19,1773
Neutral,16,3761
Negative,15,1815


# Pre-trained GeRiBERTa model

In [32]:
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = 'oliverguhr/german-sentiment-bert'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [33]:
def perform_sentiment_analysis(text):
    # Tokenize and truncate the text
    tokens = tokenizer.tokenize(text)
    max_length = tokenizer.model_max_length  # Maximum sequence length for the model
    tokens = tokens[:max_length-2]  # Account for special tokens [CLS] and [SEP]
    tokens = ['[CLS]'] + tokens + ['[SEP]']

    # Convert tokens to input IDs
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension

    # Feed the tokenized input to the pre-trained model
    with torch.no_grad():
        logits = model(input_ids)[0]

    scores = softmax(logits)
    sentiment_scores = scores.tolist()
    rounded_scores = [[round(score, 4) if isinstance(score, float) else score for score in sublist] for sublist in sentiment_scores]
    
    # Get the predicted sentiment label
    predicted_label = logits.argmax().item()

    sentiment_labels = ['negative', 'neutral', 'positive']
    sentiment = sentiment_labels[predicted_label]

    return sentiment, rounded_scores

In [34]:
post_data_geriberta = post_data.copy()
comment_data_geriberta = comment_data.copy()

post_data_geriberta[["Sentiment", "Scores"]] = post_data_geriberta["text"].apply(perform_sentiment_analysis).apply(pd.Series)
post_data_geriberta["Sentiment"] = post_data_geriberta["Sentiment"].apply(lambda x: x.capitalize())

comment_data_geriberta[["Sentiment", "Scores"]] = comment_data_geriberta["text"].apply(perform_sentiment_analysis).apply(pd.Series)
comment_data_geriberta["Sentiment"] = comment_data_geriberta["Sentiment"].apply(lambda x: x.capitalize())

Token indices sequence length is longer than the specified maximum sequence length for this model (1039 > 512). Running this sequence through the model will result in indexing errors


In [35]:
overall_sentiments_geriberta = pd.concat([post_data_geriberta["Sentiment"].value_counts(), comment_data_geriberta["Sentiment"].value_counts()], axis=1)
overall_sentiments_geriberta.columns = ["Post Data Sentiment", "Comment Data Sentiment"]

In [36]:
overall_sentiments_geriberta 

Unnamed: 0,Post Data Sentiment,Comment Data Sentiment
Positive,28,3316
Neutral,18,3272
Negative,4,761


In [38]:
from nltk.tokenize import word_tokenize
from collections import Counter

word_counts = Counter()
for corpus in post_data_geriberta["text"]:
    words = word_tokenize(corpus)
    word_counts.update(words)

sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
print("Most frequently used sentiment words:")
for word, count in sorted_words:
    print(word, ":", count)

Most frequently used sentiment words:
wien : 15
linz : 14
studium : 11
russland : 11
österreich : 10
leute : 10
stadt : 10
klimaticket : 9
thema : 9
fragen : 8
stellen : 8
graz : 8
radfahrer : 7
ukraine : 7
spö : 7
fahren : 7
sowieso : 7
radweg : 7
post : 6
gerne : 6
1 : 6
anfang : 6
erfahrungen : 6
x200b : 6
leben : 6
usa : 6
meinung : 6
– : 6
land : 6
regierung : 6
politik : 6
ned : 6
komplett : 6
günstiger : 5
deutschland : 5
klimabonus : 5
2 : 5
woche : 5
generation : 5
energie : 5
grad : 5
opposition : 5
gelesen : 5
misstrauensantrag : 5
straße : 5
ziemlich : 5
finde : 5
probleme : 5
egal : 5
krieg : 5
teilweise : 4
deutschen : 4
einkaufen : 4
gilt : 4
4 : 4
kontakt : 4
bekommen : 4
bekomme : 4
3 : 4
super : 4
straßen : 4
klima : 4
scheinbar : 4
miklleitner : 4
fpö : 4
partner : 4
offenbar : 4
bitte : 4
themen : 4
doof : 4
gruppe : 4
unternehmen : 4
eindruck : 4
auto : 4
anfeindungen : 4
zumindest : 4
fragt : 4
liegen : 4
genau : 4
mistkübeln : 4
fahrrad : 4
10 : 4
letzte : 4
beko

In [45]:
with open("stopwords/stopwords-de.txt", "r") as file:
    additional_stop_words = set(file.read().splitlines())
    
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(post_data_vader['text'])

def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

n_top_words = 10
print_topics(lda, count_vectorizer, n_top_words)


Topic #0:
studium spö wiederverwendbarer regierung graz misstrauensantrag opposition energie günstiger klimaticket

Topic #1:
linz stadt mistkübeln erfahrungen österrechische liegen gackisackerlspender bratislava klimaticket wiederverwendbarer

Topic #2:
russland ukraine komplett usa krieg linz wiederverwendbarer gerne politik land

Topic #3:
radweg radfahrer fahren ned stellen österrechische sowieso stadt deutschland linz

Topic #4:
ausländer eu möglichkeit frage kaufen geographie raus grünen vortrag würds
