In [None]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw review export; the relative path is resolved against the working directory.
reviews = pd.read_csv(filepath_or_buffer='reviews.csv')

In [None]:
# Number of employee reviews (first rows, in file order) kept for the analysis.
SAMPLE_SIZE = 20000

# Keep only rows whose 'position' is exactly 'employee' and renumber them from 0.
reviews = reviews[reviews['position'].eq('employee')].reset_index(drop=True)
reviews_sample = reviews.iloc[:SAMPLE_SIZE, :]

# .copy() detaches the working frame from `reviews`: later cells add columns to it,
# which on a slice would raise SettingWithCopyWarning and may not write reliably.
performance_reviews = reviews_sample[['uuid', 'type', 'company_uuid', 'company_name', 'image_text', 'positive_text', 'suggestion_text',
                                      'negative_text']].copy()

In [None]:
# calculate word frequencies
def count_freq(text):
    """Count how often each non-stop-word occurs across the given documents.

    Parameters
    ----------
    text : iterable of str
        Raw documents to tokenize with ``CountVectorizer`` (German NLTK stop
        words removed). Counts are summed over all documents, so passing a
        single joined document gives that document's counts.

    Returns
    -------
    dict
        ``{word: count}`` ordered by descending count; words with equal counts
        keep the vectorizer's feature order. An empty dict is returned when no
        vocabulary can be built (no documents, or only stop words).
    """
    count_vectorizer = CountVectorizer(stop_words=stopwords.words('german'))

    try:
        count_matrix = count_vectorizer.fit_transform(text)
    except ValueError as err:
        # scikit-learn signals "nothing left to count" with a ValueError; any
        # other ValueError (e.g. a bare string instead of an iterable) is a
        # caller mistake and must still surface.
        if 'empty vocabulary' not in str(err):
            raise
        return {}

    feature_names = count_vectorizer.get_feature_names_out()

    # Column sums on the sparse matrix: total occurrences per word without
    # materialising a dense documents x vocabulary array.
    totals = np.asarray(count_matrix.sum(axis=0)).ravel()

    word_frequencies = {
        word: int(total)
        for word, total in zip(feature_names, totals)
        if total > 0
    }

    # sorted() is stable, so ties stay in feature order.
    return dict(sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True))

# Build one word-frequency table per free-text column (column names ending in '_text').
freq_dict = {}
text_columns = [name for name in performance_reviews.columns if name.endswith('_text')]
for column in text_columns:
    # All non-missing entries of the column are merged into one single document.
    text = [' '.join(performance_reviews[column].dropna())]
    freq = count_freq(text)
    freq_dict[column] = freq
# Example lookup: freq_dict['image_text']

# Keyword list for firm performance; label_related_cells matches these as
# case-insensitive word prefixes.
firm_performance = [
    'firma', 'unternehmen', 'filiale', 'fluktuation', 'leistungen', 'betriebliche',
    'branche', 'tarif', 'überdurchschnittlich', 'unterdurchschnittlich', 'durchschnittlich',
    'umwelt', 'umweltbewusstsein', 'sozialbewusstsein', 'nachhaltigkeit', 'wert',
    'ausstattung', 'technik', 'equipment', 'technische', 'software', 'hardware',
    'image', 'bewertungen', 'qualität', 'performance',
    'wachstum', 'umsatz', 'gewinn', 'wettbewerb', 'innovation',
    'tendenz', 'fallend', 'steigend',
]

In [None]:
# label all the cells related to the keyword list
def label_related_cells(df, keywords_list):
    """Flag, for every ``*_text`` column, the cells that mention a keyword.

    A cell counts as related when any whitespace-separated word in it starts
    (case-insensitively) with one of the keywords. Missing cells (None / NaN)
    are never related.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place: for each column ``X`` whose
        name ends in ``_text`` a boolean column ``X_labels`` is added or
        overwritten.
    keywords_list : iterable of str
        Keyword prefixes to look for.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    # Lower-case the prefixes once; str.startswith accepts a tuple of candidates
    # (and returns False for an empty tuple, matching an empty keyword list).
    prefixes = tuple(prefix.lower() for prefix in keywords_list)

    def _mentions_keyword(cell):
        if not isinstance(cell, str):
            # str(nan) == 'nan' would otherwise match prefixes such as 'na'.
            if pd.api.types.is_scalar(cell) and pd.isna(cell):
                return False
            cell = str(cell)
        return any(word.lower().startswith(prefixes) for word in cell.split())

    # Snapshot the text columns first so the columns added below are not revisited.
    for column in [name for name in df.columns if name.endswith('_text')]:
        df[column + '_labels'] = df[column].apply(_mentions_keyword)
    return df

# Add a boolean '<column>_labels' flag next to every '*_text' column.
performance_reviews = label_related_cells(
    df=performance_reviews,
    keywords_list=firm_performance,
)

In [None]:
# NLP model

# BERT checkpoint for German sentiment analysis; tokenizer and classifier are
# both loaded from the same identifier so that they match.
MODEL_NAME = "oliverguhr/german-sentiment-bert"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

def _class_index(id2label, wanted):
    """Return the output index whose configured label equals ``wanted`` (case-insensitive)."""
    for index, label in id2label.items():
        if str(label).lower() == wanted:
            return int(index)
    raise ValueError(f"Model config has no '{wanted}' class: {id2label!r}")


def get_sentiment_probabilities(text):
    """Return ``(negative, positive)`` probabilities for ``text``, renormalised to sum to 1.

    The classifier's softmax output is reduced to the negative and positive
    classes; the remaining class mass is dropped and the two values are
    rescaled by their sum.

    Parameters
    ----------
    text : str
        Text to classify. Input longer than 512 tokens is truncated.

    Returns
    -------
    tuple
        ``(negative_prob, positive_prob)`` as Python floats, or ``(None, None)``
        when ``text`` is not a string (e.g. NaN).

    Raises
    ------
    ValueError
        If the loaded model's config does not define a 'negative' or a
        'positive' class.
    """
    if not isinstance(text, str):  # NaN or any other non-string value
        return None, None

    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = softmax(logits, dim=1)[0]

    # Resolve class positions by name instead of assuming a fixed order.
    # NOTE(review): the previous positional unpacking assumed
    # (negative, neutral, positive); this checkpoint's config is believed to
    # list positive, negative, neutral - confirm against the model card.
    id2label = model.config.id2label
    negative_prob = probs[_class_index(id2label, 'negative')].item()
    positive_prob = probs[_class_index(id2label, 'positive')].item()

    total = negative_prob + positive_prob
    return negative_prob / total, positive_prob / total

def analyze_sentiment(df):
    """Score the keyword-related text cells with the sentiment model.

    For every ``X_text_labels`` column, each row whose label is true and whose
    ``X_text`` cell is a non-empty string is passed to
    ``get_sentiment_probabilities``. Three columns are written per text column:
    ``X_text_sentiment`` ('positive' / 'negative'), ``X_text_negative_prob`` and
    ``X_text_positive_prob``. Rows that are not scored hold NaN.

    The columns are always created, even when no row qualifies, so the
    resulting schema does not depend on the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``*_text`` columns and their ``*_text_labels`` flags.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    label_suffix = '_labels'
    label_columns = [name for name in df.columns if name.endswith('_text' + label_suffix)]

    for column in label_columns:
        # Strip only the trailing suffix; str.replace would also remove an
        # earlier '_labels' occurring inside the column name.
        text_column = column[:-len(label_suffix)]

        sentiments, negative_probs, positive_probs = [], [], []
        for is_related, text_content in zip(df[column], df[text_column]):
            sentiment = negative_prob = positive_prob = np.nan
            scorable = (
                pd.notna(is_related)
                and bool(is_related)
                and isinstance(text_content, str)
                and bool(text_content)
            )
            if scorable:
                negative_prob, positive_prob = get_sentiment_probabilities(text_content)
                # Ties go to 'negative', as before.
                sentiment = 'positive' if positive_prob > negative_prob else 'negative'
            sentiments.append(sentiment)
            negative_probs.append(negative_prob)
            positive_probs.append(positive_prob)

        # Positional assignment (plain arrays) is independent of the index
        # labels; object dtype lets the sentiment column hold strings and NaN.
        df[text_column + '_sentiment'] = np.array(sentiments, dtype=object)
        df[text_column + '_negative_prob'] = np.array(negative_probs, dtype=float)
        df[text_column + '_positive_prob'] = np.array(positive_probs, dtype=float)
    return df

In [None]:
# Score every keyword-related text cell, then show the enriched frame.
performance_reviews = analyze_sentiment(df=performance_reviews)
performance_reviews

In [None]:
def calculate_final_sentiment(df):
    """Aggregate the per-column sentiment probabilities into one verdict per row.

    Adds (or overwrites) three columns:

    * ``average_negative_prob`` - row mean of all ``*_negative_prob`` columns,
      ignoring NaN;
    * ``average_positive_prob`` - row mean of all ``*_positive_prob`` columns,
      ignoring NaN;
    * ``final_sentiment`` - 'negative' when the negative average is strictly
      larger, otherwise 'positive'; NaN when both averages are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the per-text-column probability columns. It is modified
        in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # The aggregate columns share the suffixes of their inputs
    # ('average_negative_prob' ends with '_negative_prob'), so they are
    # excluded explicitly to keep them from feeding back into the mean.
    aggregate_columns = {'average_negative_prob', 'average_positive_prob'}
    negative_columns = [
        name for name in df.columns
        if name.endswith('_negative_prob') and name not in aggregate_columns
    ]
    positive_columns = [
        name for name in df.columns
        if name.endswith('_positive_prob') and name not in aggregate_columns
    ]

    def _row_mean(columns):
        # Row mean ignoring NaN; all-NaN rows (or no input columns) give NaN
        # without NumPy's "Mean of empty slice" warning.
        if not columns:
            return np.full(len(df), np.nan)
        return df[columns].astype(float).mean(axis=1).to_numpy()

    average_negative = _row_mean(negative_columns)
    average_positive = _row_mean(positive_columns)

    df['average_negative_prob'] = average_negative
    df['average_positive_prob'] = average_positive

    # Build the verdict as an object array so strings and NaN can coexist;
    # writing strings into a float NaN column is an incompatible-dtype setitem.
    with np.errstate(invalid='ignore'):
        final = np.where(average_negative > average_positive, 'negative', 'positive').astype(object)
    final[np.isnan(average_negative) & np.isnan(average_positive)] = np.nan
    df['final_sentiment'] = final

    return df

# Collapse the per-column probabilities into one overall verdict per review.
performance_reviews = calculate_final_sentiment(df=performance_reviews)

# Display the resulting DataFrame

performance_reviews

In [None]:
#test