<a href="https://colab.research.google.com/github/PascalBreuer/inl-meet-ir-v2/blob/Pascal/Bert_and_sentiment_dict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 6.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 36.8MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 32.7MB/s 
Installing c

In [2]:
# Mount Google Drive at /content/gdrive; the data and dictionary files read further down
# are addressed through paths below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [24]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import re
import sklearn
import string
import nltk
from collections import Counter
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB, ComplementNB, BernoulliNB, CategoricalNB, MultinomialNB

# Runs the library's default 'sentiment-analysis' pipeline on one sample sentence and prints
# the result. NOTE(review): presumably only a smoke test that transformers works — the
# pipeline object is not used anywhere else in this cell.
from transformers import pipeline 
print(pipeline('sentiment-analysis')('we love you'))

import transformers as ppb
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score

# Fetch the NLTK stopword corpus; PreprocessorTransformer reads
# nltk.corpus.stopwords.words('english') from it.
nltk.download('stopwords')

from sklearn.base import BaseEstimator, TransformerMixin


# Several transformers can also be chained one after another

class BertTransformer(BaseEstimator, TransformerMixin):
    """Embed the ``'Sentence'`` column of a DataFrame with pretrained DistilBERT.

    For every sentence the hidden state of the first token (``[CLS]``) of the
    last layer is used as a fixed-size feature vector.

    Parameters
    ----------
    batch_size : int, default=32
        Number of sentences sent through the model at once. Processing the
        data in batches keeps memory usage bounded; the whole dataset is no
        longer padded and evaluated as one giant tensor.

    Notes
    -----
    The tokenizer and the model are loaded once and shared by all instances
    (class-level cache), so several pipelines that each call ``transform``
    during ``fit`` and ``predict`` do not reload the weights every time.
    """

    _PRETRAINED_WEIGHTS = 'distilbert-base-uncased'
    # (tokenizer, model) tuple, filled lazily on first use and shared by all instances.
    _shared_tokenizer_and_model = None

    def __init__(self, batch_size=32):
        self.batch_size = batch_size

    def fit(self, X, y=None):
        """No-op: the pretrained model is used as-is. Returns ``self``."""
        return self

    @staticmethod
    def _get_tokenizer_and_model():
        """Return the shared (tokenizer, model) pair, loading it on first use."""
        if BertTransformer._shared_tokenizer_and_model is None:
            weights = BertTransformer._PRETRAINED_WEIGHTS
            tokenizer = ppb.DistilBertTokenizer.from_pretrained(weights)
            model = ppb.DistilBertModel.from_pretrained(weights)
            # Inference only: make sure dropout is disabled.
            model.eval()
            BertTransformer._shared_tokenizer_and_model = (tokenizer, model)
        return BertTransformer._shared_tokenizer_and_model

    def transform(self, data):
        """Compute one DistilBERT feature vector per sentence.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a ``'Sentence'`` column; every entry is converted
            with ``str`` before tokenization.

        Returns
        -------
        tuple
            ``(data, features)`` where ``data`` is the unchanged input and
            ``features`` is a 2-D NumPy array with one row per sentence.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if self.batch_size < 1:
            raise ValueError(f'batch_size must be >= 1, got {self.batch_size}')

        tokenizer, model = self._get_tokenizer_and_model()
        sentences = [str(s) for s in data['Sentence'].tolist()]

        batch_features = []
        # no_grad: gradients are not needed for feature extraction, which saves memory and time.
        with torch.no_grad():
            for start in range(0, len(sentences), self.batch_size):
                batch = sentences[start:start + self.batch_size]
                # The tokenizer pads each batch to its longest sentence, builds the matching
                # attention mask (so padding is ignored by the model) and truncates sentences
                # that exceed the model's maximum input length instead of crashing.
                encoded = tokenizer(batch,
                                    add_special_tokens=True,
                                    padding=True,
                                    truncation=True,
                                    return_tensors='pt')
                output = model(encoded['input_ids'],
                               attention_mask=encoded['attention_mask'])
                # output[0] is the last hidden state; position 0 is the [CLS] token.
                batch_features.append(output[0][:, 0, :].numpy())

        if batch_features:
            features = np.concatenate(batch_features, axis=0)
        else:
            # No sentences at all: return an empty matrix with the right number of columns.
            features = np.empty((0, model.config.hidden_size), dtype=np.float32)

        return (data, features)


class PreprocessorTransformer(BaseEstimator, TransformerMixin):
    """Normalise raw sentences into lists of words and pass the features through.

    Each sentence is lower-cased, stripped of ``string.punctuation``, has every
    run of digits replaced by the token ``num``, is split on whitespace and
    finally filtered against the English NLTK stopword list.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, data_and_features, y=None):
        """Tokenise the ``'Sentence'`` column.

        Parameters
        ----------
        data_and_features : tuple
            ``(data, features)``; ``data`` must provide a ``'Sentence'`` column.
        y : ignored

        Returns
        -------
        tuple
            ``(sentences, features)`` where ``sentences`` is a list of word
            lists and ``features`` is handed on unchanged.
        """
        data, features = data_and_features

        punctuation_table = str.maketrans('', '', string.punctuation)
        normalised = []
        for raw in data['Sentence'].tolist():
            # Same order as before: str -> lower -> drop punctuation -> digits to 'num'.
            text = str(raw).lower().translate(punctuation_table)
            normalised.append(re.sub(r'\d+', 'num', text))

        english_stopwords = set(nltk.corpus.stopwords.words('english'))
        tokenised = [
            [token for token in text.split() if token not in english_stopwords]
            for text in normalised
        ]
        return (tokenised, features)


class SentimentOpinionValueCalculatorSingleValueTransformer(BaseEstimator, TransformerMixin):
    """Append a dictionary-based sentiment score to each sentence's feature vector.

    The score of a sentence is the sum of the dictionary values of its words
    divided by the number of words in the sentence (0 for an empty sentence).

    Parameters
    ----------
    value_file_name : str
        Path of a ``;``-separated CSV file with the columns ``word`` and
        ``value``. The file is read when the transformer is constructed.
    """

    def __init__(self, value_file_name):
        # Kept as an attribute so BaseEstimator.get_params()/clone()/repr() can find it.
        self.value_file_name = value_file_name
        df = pd.read_csv(value_file_name, sep=';')
        # word -> value lookup table
        self.value_dict = pd.Series(df.value.values, index=df.word).to_dict()

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences_and_features):
        """Compute the score per sentence and append it as one extra feature column.

        Parameters
        ----------
        sentences_and_features : tuple
            ``(sentences, features)``: ``sentences`` is a sequence of word
            lists, ``features`` a 2-D array with one row per sentence.

        Returns
        -------
        numpy.ndarray
            ``features`` with one additional last column holding the score.
            (Previously the score was computed and then thrown away, so the
            returned features never contained it.)

        Raises
        ------
        ValueError
            If the number of sentences and feature rows differ.
        """
        sentences, features = sentences_and_features
        features = np.asarray(features)

        sentiment_opinion_scores = []
        for sentence in sentences:
            word_count = len(sentence)
            sentiment_opinion_score = 0
            if word_count > 0:
                # Words missing from the dictionary contribute 0.
                total = sum(self.value_dict.get(word, 0) for word in sentence)
                sentiment_opinion_score = total / word_count
            sentiment_opinion_scores.append(sentiment_opinion_score)

        if features.shape[0] != len(sentiment_opinion_scores):
            raise ValueError(
                f'got {len(sentiment_opinion_scores)} sentences but {features.shape[0]} feature rows')

        score_column = np.asarray(sentiment_opinion_scores, dtype=float).reshape(-1, 1)
        return np.hstack([features, score_column])



class SentimentOpinionValueCounterTransformer(BaseEstimator, TransformerMixin):
    """Append the number of dictionary words per sentence to its feature vector.

    Parameters
    ----------
    value_file_name : str
        Path of a ``;``-separated CSV file with the columns ``word`` and
        ``value``. The file is read when the transformer is constructed; only
        the presence of a word in the dictionary matters here, not its value.
    """

    def __init__(self, value_file_name):
        # Kept as an attribute so BaseEstimator.get_params()/clone()/repr() can find it.
        self.value_file_name = value_file_name
        df = pd.read_csv(value_file_name, sep=';')
        # word -> value lookup table
        self.value_dict = pd.Series(df.value.values, index=df.word).to_dict()

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, sentences_and_features):
        """Count dictionary words per sentence and append the count as one extra column.

        Parameters
        ----------
        sentences_and_features : tuple
            ``(sentences, features)``: ``sentences`` is a sequence of word
            lists, ``features`` a 2-D array with one row per sentence.

        Returns
        -------
        numpy.ndarray
            ``features`` with one additional last column holding the count.
            (Previously the count was computed and then thrown away, so the
            returned features never contained it.)

        Raises
        ------
        ValueError
            If the number of sentences and feature rows differ.
        """
        sentences, features = sentences_and_features
        features = np.asarray(features)

        # The count is deliberately not normalised by sentence length.
        sentiment_opinion_scores = [
            sum(1 for word in sentence if word in self.value_dict)
            for sentence in sentences
        ]

        if features.shape[0] != len(sentiment_opinion_scores):
            raise ValueError(
                f'got {len(sentiment_opinion_scores)} sentences but {features.shape[0]} feature rows')

        count_column = np.asarray(sentiment_opinion_scores, dtype=float).reshape(-1, 1)
        return np.hstack([features, count_column])



def fit_and_predict_and_calculate_accuracy_pipe(pipe, train_input, train_ouput, test_input, test_output):
    """Fit ``pipe`` on the training data and return its accuracy on the test data.

    Parameters
    ----------
    pipe : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    train_input, train_ouput
        Training samples and their labels.
    test_input, test_output
        Held-out samples and their true labels.

    Returns
    -------
    The value returned by ``accuracy_score`` for the test predictions.
    """
    pipe.fit(train_input, train_ouput)

    predictions = pipe.predict(test_input)

    # accuracy_score expects the true labels first and the predictions second.
    return accuracy_score(y_true=test_output, y_pred=predictions)



# --- Paths ---------------------------------------------------------------------
dir_path = '/content/gdrive/MyDrive/Praxisprojekt/'
dict_file = f'{dir_path}AFINN-both-abs.csv'
# Alternative dictionary: dict_file = dir_path + '/sentiment_dict.csv'

# Label columns that are removed from both workbooks right after loading.
_dropped_label_columns = ['SUBJindl', 'SUBJsrce', 'SUBJrhet', 'SUBJster',
                          'SUBJspee', 'SUBJinspe', 'SUBJprop', 'SUBJpolit']

# --- Training data ---------------------------------------------------------------
test_file = f'{dir_path}Trainingdata_train.xlsx'
data_fit = pd.read_excel(test_file, sheet_name='sentences').drop(columns=_dropped_label_columns)

y_opin = data_fit['SUBJopin'].to_numpy()
y_lang = data_fit['SUBJlang'].to_numpy()

# --- Test data -------------------------------------------------------------------
test_file = f'{dir_path}Trainingdata_test.xlsx'
data_test = pd.read_excel(test_file, sheet_name='sentences').drop(columns=_dropped_label_columns)

y_opin_test = data_test['SUBJopin'].to_numpy()
y_lang_test = data_test['SUBJlang'].to_numpy()


# --- Hyper-parameter search spaces -------------------------------------------------
Cs = np.logspace(-6, -1, 10)
# Seeded so the sampled elastic-net mixing ratios (and therefore the results) are reproducible.
l1_ratio = np.random.RandomState(42).uniform(size=20)
penalty = ['l1', 'l2', 'elasticnet']
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Only solver/penalty combinations that LogisticRegression actually supports are searched,
# and l1_ratio is varied only where it has an effect (elastic net). The previous full cross
# product contained 3000 candidates, most of them invalid or exact duplicates.
log_reg_param_grid = [
    dict(C=Cs, penalty=['l2'], solver=solver),
    dict(C=Cs, penalty=['l1'], solver=['liblinear', 'saga']),
    dict(C=Cs, penalty=['elasticnet'], solver=['saga'], l1_ratio=l1_ratio),
]


def _make_sentiment_dict_pipeline(classifier):
    """Build BERT features -> preprocessing -> sentiment score -> ``classifier``.

    SentimentOpinionValueCounterTransformer(dict_file) can be used in place of the
    single-value calculator to use word counts instead of averaged values.
    """
    return sklearn.pipeline.make_pipeline(BertTransformer(),
                                          PreprocessorTransformer(),
                                          SentimentOpinionValueCalculatorSingleValueTransformer(dict_file),
                                          classifier)


def _non_negative(nb_estimator):
    """Prefix ``nb_estimator`` with a MinMaxScaler.

    MultinomialNB and ComplementNB reject negative feature values, which BERT
    embeddings contain; scaling inside the searched estimator keeps the scaler
    fitted on the training folds only.
    """
    return sklearn.pipeline.make_pipeline(sklearn.preprocessing.MinMaxScaler(), nb_estimator)


# --- Logistic regression -----------------------------------------------------------
log_reg = sklearn.linear_model.LogisticRegression()
clf_log_reg_sentiment_dict = GridSearchCV(estimator=log_reg,
                                          param_grid=log_reg_param_grid,
                                          n_jobs=-1, scoring='accuracy', verbose=0)
pipe_log_reg_sentiment_dict = _make_sentiment_dict_pipeline(clf_log_reg_sentiment_dict)

# --- Gaussian naive Bayes ----------------------------------------------------------
gnb2 = GaussianNB()
clf_gaussian_nb_sentiment_dict = GridSearchCV(estimator=gnb2, param_grid=dict(var_smoothing=Cs), n_jobs=-1,
                                              scoring='accuracy')
pipe_gaussian_nb_sentiment_dict = _make_sentiment_dict_pipeline(clf_gaussian_nb_sentiment_dict)

# --- Complement naive Bayes (needs non-negative input) -----------------------------
cnb = ComplementNB()
clf_complement_nb_sentiment_dict = GridSearchCV(estimator=_non_negative(cnb),
                                                param_grid=dict(complementnb__alpha=Cs), n_jobs=-1,
                                                scoring='accuracy')
pipe_complement_nb_sentiment_dict = _make_sentiment_dict_pipeline(clf_complement_nb_sentiment_dict)

# --- Multinomial naive Bayes (needs non-negative input) ----------------------------
mnb = MultinomialNB()
clf_multinomial_nb_sentiment_dict = GridSearchCV(estimator=_non_negative(mnb),
                                                 param_grid=dict(multinomialnb__alpha=Cs), n_jobs=-1,
                                                 scoring='accuracy')
pipe_multinomial_nb_sentiment_dict = _make_sentiment_dict_pipeline(clf_multinomial_nb_sentiment_dict)

# --- Bernoulli naive Bayes ---------------------------------------------------------
bnb = BernoulliNB()
clf_bernoulli_nb_sentiment_dict = GridSearchCV(estimator=bnb, param_grid=dict(alpha=Cs, binarize=Cs), n_jobs=-1,
                                               scoring='accuracy')
pipe_bernoulli_nb_sentiment_dict = _make_sentiment_dict_pipeline(clf_bernoulli_nb_sentiment_dict)

print('Results for Sentiment-dict:')

# The SUBJlang labels are the prediction target for every classifier below.
y_test = y_lang_test
y_fit = y_lang


def _evaluate_and_report(classifier_label, pipe):
    """Fit ``pipe`` on the training split, print its test accuracy and return it."""
    accuracy = fit_and_predict_and_calculate_accuracy_pipe(pipe, data_fit, y_fit, data_test, y_test)
    print(f'Accuracy {classifier_label} for Sentiment-dict: {accuracy}')
    return accuracy


# The labels reproduce the previously printed text verbatim.
accuracy_log_reg_sentiment_dict = _evaluate_and_report(
    'Logistic Regression', pipe_log_reg_sentiment_dict)

accuracy_gaussian_nb_sentiment_dict = _evaluate_and_report(
    'Gaussian Navie Bayes', pipe_gaussian_nb_sentiment_dict)

accuracy_complement_nb_sentiment_dict = _evaluate_and_report(
    'Complement Navie Bayes', pipe_complement_nb_sentiment_dict)

accuracy_multinomial_nb_sentiment_dict = _evaluate_and_report(
    'Multinomial Navie Bayes', pipe_multinomial_nb_sentiment_dict)

accuracy_bernoulli_nb_sentiment_dict = _evaluate_and_report(
    'Bernoulie Navie Bayes', pipe_bernoulli_nb_sentiment_dict)


[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Results for Sentiment-dict:


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyboardInterrupt: ignored