**Imports**

In [1]:
# utilities
import re
import numpy as np # linear algebra
import pandas as pd # data processing
import os, json
from glob import glob
import pycountry

# plotting
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

#german sentiment
from germansentiment import SentimentModel

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
#nltk.download('wordnet')
german_stopword = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to /Users/tobi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Read Twitter Data**

In [50]:
# import all files from folder which ends with .json.gz
path_to_json = '/Volumes/SSD_1/Bachelor_Twitter_Data/2022/german-tweet-sample-2022-12'
json_files = glob(os.path.join(path_to_json, '*.json.gz'))

# concat all files to dataframe
df = pd.concat((pd.read_json(f) for f in json_files))

In [23]:
df.head()

Unnamed: 0,twista,type,id,user,created_at,recorded_at,source,retweets,favourites,lang,...,name,screen_name,location,description,url,verified,followers,friends,listed,statuses
0,0.3.4b1,reply,1509429520171945984,9.711414e+17,2022-03-31 07:16:56+00:00,2022-03-31 07:17:10.543530+00:00,"<a href=""http://twitter.com/download/android"" ...",0.0,0,de,...,,,,,,,,,,
1,0.3.4b1,reply,1509429523326062592,1.214901e+18,2022-03-31 07:16:56+00:00,2022-03-31 07:17:10.547527+00:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",0.0,0,de,...,,,,,,,,,,
2,0.3.4b1,reply,1509429530015977472,1.07262e+18,2022-03-31 07:16:58+00:00,2022-03-31 07:17:10.555420+00:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",0.0,0,de,...,,,,,,,,,,
3,0.3.4b1,reply,1509429532767526912,1.459965e+18,2022-03-31 07:16:59+00:00,2022-03-31 15:52:49.046375+00:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1.0,7,de,...,,,,,,,,,,
4,0.3.4b1,reply,1509427159034703872,1.290304e+18,2022-03-31 07:07:33+00:00,2022-03-31 07:48:22.352719+00:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",3.0,3,de,...,,,,,,,,,,


In [16]:
unique_values = df['type'].unique()

print(unique_values)

['status' 'user' 'retweet' 'reply' 'quote']


In [51]:
#only quotes and replys
df = df[df['type'].isin(['quote', 'reply'])]

# Reset the index of the DataFrame
df.reset_index(drop=True, inplace=True)

In [52]:
# Focus on columns 'created_at', 'hashtags', 'text'
columns_to_move = ('created_at', 'hashtags', 'text')

# Create a new data frame with the selected columns
new_df = df.loc[:, columns_to_move]

In [53]:
new_df.shape

(2203612, 3)

**Filtering for War related Words**

In [54]:
#Filtering with a selection of keywords
keywords = ['Krieg', 'Russland', 'Ukraine', 'Bomben', 'Frieden', 'Angriff', 'Konflikt', 'Waffen', 
            'Unruhe', 'Putin', 'Selenskyj', 'Soldaten', 'Panzer', 'Europa'] 

new_df = new_df[new_df['text'].str.contains('|'.join(keywords), case=True)]

In [55]:
new_df.shape

(73236, 3)

In [12]:
# Reset the index of the DataFrame
new_df.reset_index(drop=True, inplace=True)

In [13]:
new_df.head()

Unnamed: 0,created_at,hashtags,text
0,2022-01-31 07:29:17+00:00,[],@ardmoma @ardmoma wo ist das Versprechen gegen...
1,2022-01-31 07:31:18+00:00,[],Aber es gab mehrfach dokumentierte Ausbrüche n...
2,2022-01-31 07:31:36+00:00,[],"@ernst_klaus Das ist ein Narrativ, das sich do..."
3,2022-01-31 07:33:59+00:00,[],@ProfiWahl @valentina081989 @TheaJDoro @ABaerb...
4,2022-01-31 07:42:05+00:00,"[kidsfreedomday, UnmaskOurKids]","@UllmannMdB ""Stufenweise"", ""diskutieren""... Ko..."


***Preprocessing for Visualization***

In [20]:
# preprocess text
def preprocess(text):
    
    # lower case
    text = text.strip().lower()
    # stopword removal
    text = [word for word in text.split(' ') if word not in german_stopword]
    text=" ".join(text)
    # remove handles
    text = re.sub('@[\w]+', '',text)
    # remove links
    text = re.sub('((www.[^s]+)|(https?://[^s]+))',' ',text)
    # remove punctuations
    punc ='''.?!,:;-_—[](){}'"`~|\/@#$%^&+=*'''
    for i in text:
        if i in punc:
            text = text.replace(i, '')
    # cleaning numbers
    text = re.sub('[0-9]+', '', text)
    # remove new lines
    text = text.replace('\n', ' ')
    
    return text

# remove promotional tweets (with words like 'abonnieren' and 'abonnement')
def no_spam(text):
    if 'abonn' in text:
        text=''
    return text

# Remove non ascii
def demoji(text):
    for word in text:    
    # remove all non ascii characters
        text=text.replace(word, re.sub('[\u2000-\uFFFF🙂-🙏]','', word)).strip() 
    return text

In [21]:
# apply functions
new_df['preprocessed_text']= new_df['text']
new_df['preprocessed_text']= new_df['preprocessed_text'].apply(preprocess)
new_df['preprocessed_text']= new_df['preprocessed_text'].apply(no_spam)
new_df['preprocessed_text']= new_df['preprocessed_text'].apply(demoji)

In [22]:
new_df.head()

Unnamed: 0,created_at,hashtags,text,preprocessed_text
0,2022-04-30 12:08:39+00:00,[],@realTomBohn Was denn? Sind die Argumente ausg...,denn argumente ausgegangen gehen held ukraine ...
1,2022-04-30 12:08:47+00:00,[],@ZeroCovidTH @renzsch @me_for_earth @NiemaMova...,ja russland u westen streiten geostrategische ...
2,2022-04-30 12:09:12+00:00,"[Russland, LumpenPazifisten]","@nikitheblogger Ich schließe mich auch an, abe...",schließe an bedingung russland reisen kreml kr...
3,2022-04-30 12:09:14+00:00,[],"@_FriedrichMerz Lieber Herr März,wir hatten so...",lieber herr märzwir gehofft führender stelle c...
4,2022-04-30 12:09:44+00:00,[],@ReinhardH12 @MargotBla1 mir fehlt es an Worte...,fehlt worten wohl darüber usa einzelnen soldat...


**German Sentiment Model (includes) own PreProcessing**

In [23]:
# Def--------------------------------------------------------
def batch_generator(df, batch_size):
    num_batches = len(df) // batch_size
    remaining_rows = len(df) % batch_size

    for i in range(num_batches):
        start_index = i * batch_size
        end_index = start_index + batch_size
        yield df[start_index:end_index]

    # Yield the remaining rows as the last batch (if any)
    if remaining_rows != 0:
        yield df[-remaining_rows:]

# predict function
def predict(batch):
    predictions, probabilities = model.predict_sentiment(batch['text'], output_probabilities=True)
    return predictions, probabilities

# concatenating the batch outputs into a result dataframe
def concatenate_outputs(df, outputs):
    result_df = pd.concat([df, outputs], ignore_index=True)
    return result_df
# DefEnd-----------------------------------------------------
model = SentimentModel()
batch_size = 500

batch_generator = batch_generator(new_df, batch_size)
# Initialize an empty dataframe to store the results
result_df = pd.DataFrame()

# Iterate over batches and predict using BERT model
for batch in batch_generator:
    predictions, probabilities = predict(batch)
    batch = batch.assign(sentiment=predictions)
    batch = batch.assign(probability=probabilities)
    result_df = concatenate_outputs(result_df, batch)

In [24]:
# Extract the probability value based on the corresponding sentiment in each row
probabilities = result_df.apply(lambda row: [p for s, p in row['probability'] if s.lower() == row['sentiment'].lower()][0], axis=1)

# Add the probabilities as a new column in the result_df DataFrame
result_df['probability_value'] = probabilities

In [25]:
pos = sum(result_df['sentiment'] == 'positive')
neu = sum(result_df['sentiment'] == 'neutral')
neg = sum(result_df['sentiment'] == 'negative')

In [26]:
print(pos, neg, neu)

7298 75049 119784


**Save Result Dataframe to json File**

In [27]:
result_df.to_json('/Volumes/SSD_1/ProcessedData/Twitter_PrePro_2022_04.json', orient='records')