<a href="https://colab.research.google.com/github/PascalBreuer/inl-meet-ir-v2/blob/Pascal/Bert_and_sentiment_dict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [59]:
#pip install transformers

In [60]:
# Uncomment the following line when running in Google Colab
#pip install transformers

In [61]:
# Uncomment the following lines when running in Google Colab
#from google.colab import drive
#drive.mount('/content/gdrive')

In [82]:
import re
import sklearn
import string
import nltk
from nltk import tokenize as tk
import transformers as ppb
import pandas as pd
import numpy as np
import torch
import joblib
import re
import os
import time
from enum import Enum
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.pipeline import Pipeline
from transformers import pipeline
from datetime import datetime
from typing import List

# Fetch the NLTK resources used further down: the stop-word corpus is read by
# PreprocessorTransformer and the 'punkt' models back tk.sent_tokenize.
nltk.download('stopwords')
nltk.download('punkt')


# Smoke test: build the default Hugging Face sentiment-analysis pipeline and
# print its result for a single example sentence.
print(pipeline('sentiment-analysis')('we love you'))


# Several inputs can also be processed one after another

class ColumnUser:
    """Interface for pipeline steps that work on a configurable column."""

    def set_column_to_use(self, column_name):
        """Select the column the step should use.

        The base implementation ignores ``column_name`` and returns ``None``;
        subclasses override it to store the name.
        """
        return None


class ColumnTransformer(ColumnUser):
    """Interface for steps that additionally read from a configurable source column."""

    def set_column_to_transform(self, column_to_transform):
        """Select the source column to transform.

        The base implementation ignores ``column_to_transform`` and returns
        ``None``; subclasses override it to store the name.
        """
        return None


class TextToSentenceTransformer(BaseEstimator, TransformerMixin, ColumnTransformer):
    """Pipeline step that splits the texts of one column into single sentences.

    ``transform`` returns a new one-column DataFrame with one row per sentence.
    If the configured source column is missing, the problem is written to the
    error log and the input is returned unchanged.
    """

    def __init__(self, column_to_transform, new_column_name, filename='/content/gdrive/MyDrive/Praxisprojekt/logs/text_to_sentence_transformer.error'):
        """Store the configuration.

        Args:
            column_to_transform: Name of the column holding the raw texts.
            new_column_name: Name of the sentence column in the result.
            filename: Path of the error log that ``log_error`` appends to.
        """
        self.column_to_transform = column_to_transform
        self.new_column_name = new_column_name
        # scikit-learn's get_params()/clone()/repr read every __init__ argument
        # back from an attribute with the same name, so it must exist.
        self.filename = filename
        # Original attribute name, kept so existing code and saved pipelines
        # that rely on ``log_file`` continue to work.
        self.log_file = filename

    def set_column_to_transform(self, column_to_transform):
        """Change the column the texts are read from."""
        self.column_to_transform = column_to_transform

    def set_column_to_use(self, column_name):
        """Change the name of the sentence column that is produced."""
        self.new_column_name = column_name

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the transformer API."""
        return self

    def transform(self, data):
        """Return a DataFrame of sentences, or ``data`` itself if the column is missing."""
        if self.column_to_transform and self.column_to_transform in data.columns:
            return pd.DataFrame({self.new_column_name: self.split_text_in_sentences(data)})

        self.log_error(
            f'no column with name {self.column_to_transform} in dataframe.\nGiving back original dataframe.')
        return data

    def split_text_in_sentences(self, data) -> List[str]:
        """Tokenize every text of the source column and return all sentences as one flat list."""
        sentences = []
        for text in data[self.column_to_transform].tolist():
            # str() makes non-string cells tokenizable; empty fragments are dropped.
            sentences.extend(s for s in tk.sent_tokenize(str(text)) if s)

        # Debug output of the first sentence; guarded because an empty input
        # used to raise IndexError here.
        if sentences:
            print(sentences[0])
        return sentences

    def log_error(self, description):
        """Append a timestamped error entry to the log file.

        The parent directory is created first, so reporting a problem cannot
        itself fail just because the log folder does not exist yet.
        """
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self.log_file, 'a', encoding='utf-8') as file:
            file.write('#------------------------------------------------------------------------------------------\n')
            file.write(f'{datetime.now().strftime("%b-%d-%Y %H:%M:%S")}\n')
            file.write(f'\t{description}\n')
            file.write(
                '#------------------------------------------------------------------------------------------\n\n\n')


class BertTransformer(BaseEstimator, TransformerMixin, ColumnUser):
    """Pipeline step that turns the texts of one column into DistilBERT feature vectors.

    The pretrained ``distilbert-base-uncased`` tokenizer and model are loaded
    in the constructor. ``transform`` returns a tuple ``(data, features)``.
    """

    def __init__(self, column, batchsize=10):
        """Store the configuration and load tokenizer and model.

        Args:
            column: Name of the column holding the texts to embed.
            batchsize: Number of rows sent through the model per forward pass.
        """
        self.column = column
        # scikit-learn's get_params()/clone()/repr read every __init__ argument
        # back from an attribute with the same name, so it must exist.
        self.batchsize = batchsize
        # Original attribute name, kept for backward compatibility.
        self.batch_size = batchsize
        model_class, tokenizer_class, pretrained_weights = (
            ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

        # Load pretrained model/tokenizer
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.model = model_class.from_pretrained(pretrained_weights)

    def set_column_to_use(self, column_name):
        """Change the column the texts are read from."""
        self.column = column_name

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the transformer API."""
        return self

    def transform(self, data):
        """Embed all rows of ``data`` batch by batch.

        Returns:
            The tuple ``(data, features)`` where ``features`` is a list with
            one feature vector per row, in row order.
        """
        features = []
        row_count = data.shape[0]
        # At least one row per batch, otherwise range() would reject the step.
        step = max(1, int(self.batch_size))
        for start in range(0, row_count, step):
            # Positional slicing: independent of the index labels (works after
            # filtering/shuffling) and yields exactly ``step`` rows, whereas the
            # former label-based ``.loc`` slice was end-inclusive.
            batch = data.iloc[start:start + step]
            features.extend(self.embedding(batch))

        return (data, features)

    def embedding(self, data):
        """Run one batch through the model and return its feature matrix.

        Returns:
            NumPy array with one row per input text, taken from position 0 of
            the model's first output.
        """
        texts = [str(value) for value in data[self.column].tolist()]

        # truncation=True caps each sequence at the tokenizer's maximum length
        # so over-long texts do not make the model call fail.
        tokenized = [
            self.tokenizer.encode(text, add_special_tokens=True, truncation=True)
            for text in texts
        ]

        # Pad every sequence with 0 up to the longest one in this batch.
        max_len = max((len(ids) for ids in tokenized), default=0)
        padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized])

        # Attention mask so the padding is ignored during processing.
        # NOTE(review): this treats token id 0 as padding — confirm it matches
        # the tokenizer's pad_token_id.
        mask = np.where(padded != 0, 1, 0)

        # Convert the padded ids and the mask to integer tensors.
        input_ids = torch.tensor(padded).to(torch.long)
        attention_mask = torch.tensor(mask).to(torch.long)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            output = self.model(input_ids, attention_mask=attention_mask)

        # Keep only the vector at sequence position 0 (the special token that
        # add_special_tokens=True puts in front of every text).
        return output[0][:, 0, :].numpy()


class PreprocessorTransformer(BaseEstimator, TransformerMixin, ColumnUser):
    """Pipeline step that normalizes the sentences of one column into token lists."""

    def __init__(self, column):
        """Remember the name of the column that holds the sentences."""
        self.column = column

    def set_column_to_use(self, column_name):
        """Change the column the sentences are read from."""
        self.column = column_name

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the transformer API."""
        return self

    def transform(self, data_and_features, y=None):
        """Lower-case, strip punctuation, mask numbers and drop English stop words.

        Returns:
            A list with one list of remaining words per input row.
        """
        print('Starting with preprocessing')

        strip_punctuation = str.maketrans('', '', string.punctuation)
        normalized = []
        for raw in data_and_features[self.column].tolist():
            # Cells are stringified first so non-text values can be processed.
            text = str(raw).lower().translate(strip_punctuation)
            # Every run of digits is replaced by the placeholder 'num'.
            normalized.append(re.sub(r'\d+', 'num', text))

        english_stopwords = set(nltk.corpus.stopwords.words('english'))
        return [
            [token for token in text.split() if token not in english_stopwords]
            for text in normalized
        ]


class SentimentOpinionValueCalculatorSingleValueTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that scores each tokenized sentence with a word-value dictionary."""

    def __init__(self, dict_name):
        """Load the dictionary from a ';'-separated CSV with columns ``word`` and ``value``."""
        self.dict_name = dict_name
        lexicon = pd.read_csv(dict_name, sep=';')
        self.value_dict = pd.Series(lexicon.value.values, index=lexicon.word).to_dict()

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the transformer API."""
        return self

    def transform(self, sentences_and_features):
        """Return one single-element list per sentence holding its score.

        The score is the sum of the dictionary values of all words found in
        the dictionary, divided by the total number of words in the sentence.
        Sentences without words get the score 0.
        """
        print('Starting with sentiment value calculation')
        lookup = self.value_dict
        scores = []
        for tokens in sentences_and_features:
            token_count = len(tokens)
            if token_count > 0:
                matched_total = sum(lookup[token] for token in tokens if token in lookup)
                score = matched_total / token_count
            else:
                score = 0
            scores.append([score])

        # TEST
        print(len(scores))

        print('starting with classification')
        return scores


class SentimentOpinionValueCounterTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that adds the number of dictionary words per sentence to its features."""

    def __init__(self, value_file_name):
        """Load the dictionary from a ';'-separated CSV with columns ``word`` and ``value``."""
        # scikit-learn's get_params()/clone()/repr read every __init__ argument
        # back from an attribute with the same name, so it must exist.
        self.value_file_name = value_file_name
        df = pd.read_csv(value_file_name, sep=';')
        self.value_dict = pd.Series(df.value.values, index=df.word).to_dict()

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` to satisfy the transformer API."""
        return self

    def transform(self, sentences_and_features):
        """Combine per-sentence dictionary hit counts with the given features.

        Args:
            sentences_and_features: Tuple ``(sentences, features)`` where
                ``sentences`` is a sequence of word lists and ``features``
                holds one entry per sentence.

        Returns:
            A copy of ``features`` in which entry ``i`` has the hit count of
            sentence ``i`` added; the caller's ``features`` is left untouched.
        """
        sentences, features = sentences_and_features
        hit_counts = [
            sum(1 for word in sentence if word in self.value_dict)
            for sentence in sentences
        ]

        # Work on a copy so the input is not modified as a side effect.
        combined = features.copy() if isinstance(features, np.ndarray) else list(features)
        # NOTE(review): when an entry is a NumPy vector, ``+ count`` adds the
        # count to every component; if the count was meant to be an extra
        # feature column it has to be appended instead — confirm the intent.
        for index, count in enumerate(hit_counts):
            combined[index] = combined[index] + count
        return combined


class PipelineRunner:
    """Build, fit, evaluate, persist and apply a scikit-learn pipeline.

    On construction the training and test workbooks are loaded. ``make_pipeline``
    wraps the given transformers and a grid-searched estimator into one
    pipeline, logs the evaluation metrics and stores the fitted pipeline with
    joblib so that later runs can load it instead of fitting again.
    """

    def __init__(self, dict_file, training_file, test_file, log_file='/content/gdrive/MyDrive/Praxisprojekt/results/results_with_correct_input.log'):
        """Load the data sets and define the short names used in file names.

        Args:
            dict_file: Path of the sentiment dictionary (only stored).
            training_file: Excel workbook; sheet ``sentences`` is the training set.
            test_file: Excel workbook; sheet ``sentences`` is the test set.
            log_file: File that evaluation results are appended to.
        """
        self.dict_file = dict_file
        self.log_file = log_file
        self.data_training = pd.read_excel(training_file, sheet_name='sentences')
        self.data_test = pd.read_excel(test_file, sheet_name='sentences')
        self.pipeline = None
        # Abbreviations used for pipeline step names and classifier file names.
        self.transformer_name_dict = {
            TextToSentenceTransformer.__name__: "ts",
            BertTransformer.__name__: "bert",
            PreprocessorTransformer.__name__: "prepro",
            SentimentOpinionValueCalculatorSingleValueTransformer.__name__: "sentval",
            SentimentOpinionValueCounterTransformer.__name__: "sentcount",
        }

        self.estimator_name_dict = {
            LogisticRegression.__name__: 'log_reg',
            GaussianNB.__name__: 'gau_nb',
            BernoulliNB.__name__: 'ber_nb',
        }

    def prepare_pipeline(self, data_column, estimator_type, transformer_types_list):
        """Fit and evaluate the current pipeline and log the metrics.

        Raises:
            RuntimeError: If no pipeline has been created via ``make_pipeline``.
        """
        if self.pipeline is None:
            raise RuntimeError("Don't call this function directly use make_pipeline")

        print(f'Starting fitting: {estimator_type}')
        accuracy, f1, recall, precision = self.fit_and_predict_and_calculate_accuracy_pipe(data_column)
        description = f'\tUsed estimator: {estimator_type}\n'
        description += f'\tUsed transformers: {", ".join(transformer_types_list)}\n'
        description += f'\tColumn: {data_column}\n'
        self.write_result_to_file(accuracy, f1, recall, precision, description)

    def prepare_pipeline_confidence(self, data_column, estimator_type, transformer_types_list):
        """Like ``prepare_pipeline`` but scores only predictions with confidence above 0.6.

        Raises:
            RuntimeError: If no pipeline has been created via ``make_pipeline``.
        """
        if self.pipeline is None:
            raise RuntimeError("Don't call this function directly use make_pipeline")

        print(f'Starting fitting: {estimator_type}')
        accuracy, f1, recall, precision = self.fit_and_predict_and_calculate_confidence_pipe(data_column)
        description = f'\tUsed estimator: {estimator_type}\n'
        description += f'\tUsed transformers: {", ".join(transformer_types_list)}\n'
        description += f'\tonly 60percent confidence \n'
        description += f'\tColumn: {data_column}\n'
        self.write_result_to_file(accuracy, f1, recall, precision, description)

    def make_pipeline(self, transformer_list, estimator,
                      data_column, param_gird, classifier_description='',
                      dir_path='', force_fitting=False):
        """Return a fitted pipeline, loading it from disk when one already exists.

        Args:
            transformer_list: Transformer instances, in execution order.
            estimator: Estimator that is tuned with ``GridSearchCV``.
            data_column: Name of the label column in the training/test data.
            param_gird: Parameter grid handed to ``GridSearchCV``.
            classifier_description: Optional tag that becomes part of the file name.
            dir_path: Optional prefix for the classifier file path.
            force_fitting: Fit and overwrite even if a saved pipeline exists.

        Returns:
            The fitted (or loaded) pipeline, also available as ``self.pipeline``.
        """
        classifier_file = self.create_pipe_line_name(transformer_list, estimator, classifier_description, data_column, dir_path)
        if force_fitting or not os.path.exists(classifier_file):
            print('No classfier found for your configuration or force_fitting=True. Creating new one and saving it.')
            clf = GridSearchCV(estimator=estimator, param_grid=param_gird, n_jobs=-1, scoring='accuracy')

            steps = [
                (f'stage: {self.transformer_name_dict[type(transformer).__name__]}', transformer)
                for transformer in transformer_list
            ]
            steps.append(('clf', clf))
            self.pipeline = Pipeline(steps)

            self.prepare_pipeline(data_column, type(estimator).__name__, [type(t).__name__ for t in transformer_list])
            print(f'Saving classifier to file {classifier_file}')
            self.save_classifier(classifier_file)
        else:
            print(f'Classifier exists. Loaded from file {classifier_file}')
            self.load_classifier(classifier_file)

        return self.pipeline

    def create_pipe_line_name(self, transformer_list, estimator, classifier_description, data_column, dir_path=''):
        """Build the file path under which this pipeline configuration is stored.

        The name combines the estimator abbreviation, the transformer
        abbreviations, the optional description and the label column.
        A non-empty ``dir_path`` is used as a prefix (with a trailing slash
        added if missing) in front of the fixed classifier folder.
        """
        names = [self.transformer_name_dict[type(transformer).__name__] for transformer in transformer_list]
        description = f'_{classifier_description}' if classifier_description else ''
        if dir_path and not dir_path.endswith(('/', '\\')):
            dir_path += '/'
        return dir_path + f'/content/gdrive/MyDrive/Praxisprojekt/classifier/{self.estimator_name_dict[type(estimator).__name__]}_with_{"_".join(names)}{description}_{data_column}_pipeline.joblib.plk'

    def load_classifier(self, filename):
        """Load a previously saved pipeline into ``self.pipeline``."""
        # NOTE: joblib files are pickle-based and can execute code on load;
        # only load classifier files from a trusted location.
        self.pipeline = joblib.load(filename)

    def save_classifier(self, filename):
        """Store ``self.pipeline`` with joblib, creating the target folder if needed."""
        # Without this, a missing folder would lose the result of a long fit.
        target_dir = os.path.dirname(filename)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        joblib.dump(self.pipeline, filename)

    def fit_and_predict_and_calculate_accuracy_pipe(self, data_column):
        """Fit on the training set, predict the test set and compute the metrics.

        Returns:
            Tuple ``(accuracy, f1, recall, precision)``; the last three are
            weighted averages.
        """
        print('Start fiting')
        self.pipeline.fit(self.data_training, self.data_training[data_column].to_numpy())
        print('Start prediction')
        start = time.time()
        y_pred_pipe = self.pipeline.predict(self.data_test)
        end = time.time()
        print(f'time needed: {end - start}')

        y_true = self.data_test[data_column].to_numpy()
        acc = accuracy_score(y_true, y_pred_pipe)
        f1 = f1_score(y_true, y_pred_pipe, average='weighted')
        rec = recall_score(y_true, y_pred_pipe, average='weighted')
        precision = precision_score(y_true, y_pred_pipe, average='weighted')
        return acc, f1, rec, precision

    def fit_and_predict_and_calculate_confidence_pipe(self, data_column):
        """Fit, predict and score only the test rows predicted with confidence above 0.6.

        Three Excel files are written to the results folder: the prediction
        with its confidence, the true labels next to prediction and
        confidence, and the subset above the threshold. The two hard-coded
        probability columns mean this expects a two-class problem.

        Returns:
            Tuple ``(accuracy, f1, recall, precision)`` computed on the
            confident subset; the last three are weighted averages.
        """
        print('Start fiting')
        self.pipeline.fit(self.data_training, self.data_training[data_column].to_numpy())
        print('Start prediction')

        probability_columns = ['confidence class 0', 'confidence class 1']

        # Class probabilities for every item.
        df = pd.DataFrame(data=self.pipeline.predict_proba(self.data_test), columns=probability_columns)

        # Predicted class for every item.
        df['prediction'] = self.pipeline.predict(self.data_test)

        # Confidence of the predicted class = the larger of the two probabilities.
        df['max confidence'] = df[probability_columns].max(axis=1)

        # drop() returns a new frame; the result has to be assigned, otherwise
        # the per-class columns are never removed.
        df = df.drop(probability_columns, axis=1)

        # True labels next to the predicted classes and their confidences.
        df2 = pd.DataFrame(data=self.data_test[data_column].to_numpy(), columns=['real sentiment label'])
        df2['prediction'] = df['prediction']
        df2['confidence'] = df['max confidence']

        # Keep only rows above the threshold. A boolean filter replaces the
        # former row-by-row DataFrame.append loop (append was removed in pandas 2.0).
        df3 = df2[df2['confidence'] > 0.6].reset_index(drop=True)

        # Finally store everything as Excel files.
        df.to_excel('/content/gdrive/MyDrive/Praxisprojekt/results/max_confidence.xlsx')
        df2.to_excel('/content/gdrive/MyDrive/Praxisprojekt/results/classes_and_confidence.xlsx')
        df3.to_excel('/content/gdrive/MyDrive/Praxisprojekt/results/confidence_60.xlsx')

        acc = accuracy_score(df3['real sentiment label'], df3['prediction'])
        f1 = f1_score(df3['real sentiment label'], df3['prediction'], average='weighted')
        rec = recall_score(df3['real sentiment label'], df3['prediction'], average='weighted')
        precision = precision_score(df3['real sentiment label'], df3['prediction'], average='weighted')
        return acc, f1, rec, precision

    def write_result_to_file(self, accuracy, f1, recall, precision, description):
        """Append a timestamped result entry to ``self.log_file``.

        The parent folder of the log file is created if it does not exist.
        """
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self.log_file, 'a', encoding='utf-8') as file:
            file.write('#------------------------------------------------------------------------------------------\n')
            file.write(f'{datetime.now().strftime("%b-%d-%Y %H:%M:%S")}\n')
            file.write(f'{description}\n')
            file.write(f'\t\tAccuracy: {accuracy}\n')
            file.write(f'\t\tF1 Score: {f1}\n')
            file.write(f'\t\tRecall Score: {recall}\n')
            file.write(f'\t\tPrecision Score: {precision}\n')
            file.write('#------------------------------------------------------------------------------------------\n')

    def predict_data(self, data_file_name, column_to_transform=None, new_column_name=None, batch_size=1, result_for_column='', log_file=None):
        """Predict the rows of a CSV file with the current pipeline and log the result.

        Args:
            data_file_name: CSV file with the data to classify.
            column_to_transform: If given, passed to every step that is a
                ``ColumnTransformer`` via ``set_column_to_transform``.
            new_column_name: If given, passed to every step that is a
                ``ColumnUser`` via ``set_column_to_use``.
            batch_size: Unused; kept for backward compatibility (batching
                happens inside ``BertTransformer``).
            result_for_column: Label written into the log entry.
            log_file: Target log file. Defaults to
                ``results/results_<timestamp>.log`` with the timestamp taken
                at call time.

        Raises:
            RuntimeError: If no pipeline has been created or loaded yet.
        """
        if self.pipeline is None:
            raise RuntimeError('No pipeline available; call make_pipeline first')

        if log_file is None:
            # Computed per call: a timestamp in the signature's default would
            # be evaluated only once, when the class is defined.
            log_file = f'results/results_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log'

        data_validation = pd.read_csv(data_file_name)

        if column_to_transform is not None:
            for obj in self.pipeline.named_steps.values():
                if isinstance(obj, ColumnTransformer):
                    obj.set_column_to_transform(column_to_transform)

        if new_column_name is not None:
            for obj in self.pipeline.named_steps.values():
                if isinstance(obj, ColumnUser):
                    obj.set_column_to_use(new_column_name)

        # The whole file is predicted in one call.
        output = self.pipeline.predict(data_validation)

        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(log_file, 'a', encoding='utf-8') as f:
            # Newlines added so separators, header and result each sit on
            # their own line instead of being glued together.
            f.write('#-------------------------------------------------\n')
            f.write(f'result for column {result_for_column}:\n')
            f.write(f'{output.tolist()}\n')
            f.write('#-------------------------------------------------\n\n\n\n\n')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[{'label': 'POSITIVE', 'score': 0.9998704791069031}]


In [83]:
# Driver cell: configures the input files, builds the dictionary-only pipeline
# (no BERT step) and fits or loads the classifier for the label column
# 'SUBJlang01'.

# Optional prefix that is put in front of every path below; empty here.
dir_path = ''
# dict_file = dir_path + 'AFINN-both-abs.csv'
dict_file = dir_path + '/content/gdrive/MyDrive/Praxisprojekt/sentiment_dict.csv'

training_file = dir_path + '/content/gdrive/MyDrive/Praxisprojekt/Trainingdata_train.xlsx'

test_file = dir_path + '/content/gdrive/MyDrive/Praxisprojekt/Trainingdata_test.xlsx'

# Alternative configuration that includes the BERT step (currently disabled):
#transformers_list = [TextToSentenceTransformer('text', 'Sentence'),
 #                    BertTransformer('Sentence'),
  #                   PreprocessorTransformer('Sentence'),
   #                  SentimentOpinionValueCalculatorSingleValueTransformer(dict_file)]
# Active configuration: split into sentences, preprocess, score with the dictionary.
transformers_list = [TextToSentenceTransformer('text', 'Sentence'),
                     PreprocessorTransformer('Sentence'),
                     SentimentOpinionValueCalculatorSingleValueTransformer(dict_file)]

pipeline_runner = PipelineRunner(dict_file, training_file, test_file, log_file=dir_path + '/content/gdrive/MyDrive/Praxisprojekt/results/results_with_dict_only.log')
# Grid of 200 candidate values for the regularization parameter C,
# logarithmically spaced between 1e-6 and 1e6.
Cs = np.logspace(-6, 6, 200)
max_iter = 500
log_reg_subjopin = LogisticRegression(max_iter=max_iter)
#pipeline_runner.make_pipeline(transformers_list, log_reg_subjopin, 'SUBJopin01', dict(C=Cs), dir_path=dir_path)

#pipeline_runner.predict_data(data_file_name=dir_path + 'MBFC_Dataset_Sample.csv',
#                             result_for_column='SUBJopin',
#                             log_file=dir_path + f'results/mbfc_results_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_SUBJopin.log')

log_reg_subjlang = LogisticRegression(max_iter=max_iter)
# Fits and saves the pipeline, or loads it if a saved classifier file exists.
pipeline_runner.make_pipeline(transformers_list, log_reg_subjlang, 'SUBJlang01', dict(C=Cs), dir_path=dir_path)

# pipeline_runner.predict_data(data_file_name=dir_path + 'MBFC_Dataset_Sample.csv',
#                              result_for_column='SUBJlang',
#                              log_file=dir_path + f'results/mbfc_results_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}_SUBJlang.log')

Classifier exists. Loaded from file /content/gdrive/MyDrive/Praxisprojekt/classifier/log_reg_with_ts_prepro_sentval_SUBJlang01_pipeline.joblib.plk




Pipeline(memory=None,
         steps=[('stage: ts',
                 TextToSentenceTransformer(column_to_transform='text',
                                           filename=None,
                                           new_column_name='Sentence')),
                ('stage: prepro', PreprocessorTransformer(column='Sentence')),
                ('stage: sentval',
                 SentimentOpinionValueCalculatorSingleValueTransformer(dict_name='/content/gdrive/MyDrive/Praxisprojekt/sentiment_dict.csv')),
                ('clf',
                 GridS...
       7.14942899e+04, 8.21434358e+04, 9.43787828e+04, 1.08436597e+05,
       1.24588336e+05, 1.43145894e+05, 1.64467618e+05, 1.88965234e+05,
       2.17111795e+05, 2.49450814e+05, 2.86606762e+05, 3.29297126e+05,
       3.78346262e+05, 4.34701316e+05, 4.99450512e+05, 5.73844165e+05,
       6.59318827e+05, 7.57525026e+05, 8.70359136e+05, 1.00000000e+06])},
                              pre_dispatch='2*n_jobs', refit=True,
              