In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import re
import sklearn
import string
import nltk
from collections import Counter
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB, ComplementNB, BernoulliNB, CategoricalNB, MultinomialNB

from sklearn.metrics import accuracy_score

nltk.download('stopwords')

from sklearn.base import BaseEstimator, TransformerMixin


# Man kann auch mehrere hintereinander machen


class PreprocessorTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, data, y=None):
        sentences = data['Sentence'].tolist()
        sentences = list((str(s) for s in sentences))

        # muss vom generator object zurück zur liste gemacht werden
        sentences = list((s.lower() for s in sentences))

        table = str.maketrans('', '', string.punctuation)
        sentences = [s.translate(table) for s in sentences]

        sentences = [re.sub(r'\d+', 'num', s) for s in sentences]

        stopwords = set(nltk.corpus.stopwords.words('english'))
        sentences = [[word for word in s.split() if word not in stopwords] for s in sentences]
        return sentences


class BagOfWordsForEachSentenceTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, sentences, y=None):
        print('transformed called')
        # Hier können wir dann unsere Sätze bearbeiten und zu numerischen Werten machen
        dics = []

        # für jeden Satz wird ein Dictionary konstruiert
        for s in sentences:

            c = Counter(s)
            dic = []
            for key in c:
                dic[key] = (c[key])
            dics.append(dic)

        return (sentences, dics)


class BagOfWordsTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        # Kann man eventuell besser machen und muss man nicht so machen
        self.bigdict = {}
        self.training = True

    def fit(self, X, y=None):
        return self

    def transform(self, sentences, y=None):
        if self.training:
            # Hier können wir dann unsere Sätze bearbeiten und zu numerischen Werten machen
            self.bigdict = {}
            for s in sentences:
                c = Counter(s)
                for key in c:
                    if key in self.bigdict:
                        self.bigdict[key] = self.bigdict[key] + c[key]
                    else:
                        self.bigdict[key] = 1
            self.training = False

        return (sentences, self.bigdict)


class SentimentOpinionValueCalculatorTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, value_file_name):
        df = pd.read_csv(value_file_name, sep=';')
        self.value_dict = pd.Series(df.value.values, index=df.word).to_dict()
        # for key in self.value_dict.keys():
        #     print(f'{key} - {self.value_dict[key]}')
        # print(f'unique values: {df.nunique()}')
        # print(self.value_dict)
        # print(len(self.value_dict))
        # d = {}
        # for word in df.word:
        #     d[word] = df.value[df.word == word]
        # for key in d.keys():
        #     print(f'{key} - {d[key]}')

    def fit(self, X, y=None):
        return self

    def transform(self, sentences_and_dict):
        sentiment_opinion_scores = []
        sentences, d = sentences_and_dict
        for sentence in sentences:
            word_count = len(sentence)
            # print(f'length of sentence {sentence} = {word_count}')
            sentiment_opinion_score = 0
            if word_count > 0:
                for word in sentence:
                    if word in self.value_dict:
                        sentiment_opinion_score = sentiment_opinion_score + self.value_dict[word]
                sentiment_opinion_score = sentiment_opinion_score / word_count
            sentiment_opinion_scores.append([sentiment_opinion_score])
        return (sentences, d, sentiment_opinion_scores)


class SentimentOpinionValueCalculatorSingleValueTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, value_file_name):
        df = pd.read_csv(value_file_name, sep=';')
        self.value_dict = pd.Series(df.value.values, index=df.word).to_dict()
        # for key in self.value_dict.keys():
        #     print(f'{key} - {self.value_dict[key]}')
        # print(f'unique values: {df.nunique()}')
        # print(self.value_dict)
        # print(len(self.value_dict))
        # d = {}
        # for word in df.word:
        #     d[word] = df.value[df.word == word]
        # for key in d.keys():
        #     print(f'{key} - {d[key]}')

    def fit(self, X, y=None):
        return self

    def transform(self, sentences_and_dict):
        sentiment_opinion_scores = []
        sentences = sentences_and_dict
        for sentence in sentences:
            word_count = len(sentence)
            # print(f'length of sentence {sentence} = {word_count}')
            sentiment_opinion_score = 0
            if word_count > 0:
                for word in sentence:
                    if word in self.value_dict:
                        sentiment_opinion_score = sentiment_opinion_score + self.value_dict[word]
                sentiment_opinion_score = sentiment_opinion_score / word_count
            sentiment_opinion_scores.append([sentiment_opinion_score])
        return sentiment_opinion_scores


class SentimentOpinionValueCounterTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, value_file_name):
        df = pd.read_csv(value_file_name, sep=';')
        self.value_dict = pd.Series(df.value.values, index=df.word).to_dict()

    def fit(self, X, y=None):
        return self

    def transform(self, sentences_and_dict):
        sentiment_opinion_scores = []
        sentences = sentences_and_dict
        for sentence in sentences:
            word_count = len(sentence)
            # print(f'length of sentence {sentence} = {word_count}')
            sentiment_opinion_score = 0
            if word_count > 0:
                for word in sentence:
                    if word in self.value_dict:
                        sentiment_opinion_score += 1
                # sentiment_opinion_score = sentiment_opinion_score / word_count
            sentiment_opinion_scores.append([sentiment_opinion_score])
        return sentiment_opinion_scores


class SentenceToVectorTransformer(BaseEstimator, TransformerMixin):

    def __init__(self):
        pass

    def fit(self, X, y=None):
        print('Vec Trans  fit called')
        return self

    def transform(self, sentence_dict_tuple, y=None):
        print('Vec Trans trans called')
        # Hier können wir dann unsere Sätze bearbeiten und zu numerischen Werten machen

        # Tupel zerlegen
        sentence_fragments, dicts, sentiment_val_list = sentence_dict_tuple
        sorted_keys = [key for key in dicts.keys()]
        retval = []
        for sentence_fragment in sentence_fragments:
            vec = []
            for key in sorted_keys:
                if key in sentence_fragment:
                    vec.append(1)
                else:
                    vec.append(0)
            retval.append(vec)
        for index in range(len(sentiment_val_list)):
            # print(f'adding sentiment_val {sentiment_val_list[index]}')
            retval[index].append(int(sentiment_val_list[index][0]))
        print('added all sentiment_vals')
        return retval


def fit_and_predict_and_calculate_accuracy_pipe(pipe, train_input, train_ouput, test_input, test_output):
    pipe.fit(train_input, train_ouput)

    y_pred_pipe = pipe.predict(test_input)

    return accuracy_score(y_pred_pipe, test_output)



dir_path = '/content/gdrive/MyDrive/Praxisprojekt/'
dict_file = dir_path + 'AFINN-both-abs.csv'
# dict_file = dir_path + '/sentiment_dict.csv'

test_file = dir_path + 'Trainingdata_train.xlsx'
data_fit = pd.read_excel(test_file, sheet_name='sentences')
data_fit.drop(['SUBJindl', 'SUBJsrce', 'SUBJrhet', 'SUBJster', 'SUBJspee', 'SUBJinspe', 'SUBJprop', 'SUBJpolit'],
              axis=1,
              inplace=True)

y_opin = data_fit.SUBJopin.to_numpy()
y_lang = data_fit.SUBJlang.to_numpy()

test_file = dir_path + 'Trainingdata_test.xlsx'
data_test = pd.read_excel(test_file, sheet_name='sentences')
data_test.drop(['SUBJindl', 'SUBJsrce', 'SUBJrhet', 'SUBJster', 'SUBJspee', 'SUBJinspe', 'SUBJprop', 'SUBJpolit'],
                axis=1, inplace=True)


y_opin_test = data_test.SUBJopin.to_numpy()
y_lang_test = data_test.SUBJlang.to_numpy()


Cs = np.logspace(-6, -1, 10)
l1_ratio = np.random.uniform(size=20)
penalty = ['l1', 'l2', 'elasticnet']
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

log_reg = sklearn.linear_model.LogisticRegression()
clf_log_reg_sentiment_dict = GridSearchCV(estimator=log_reg,
                                          param_grid=dict(C=Cs, penalty=penalty, solver=solver, l1_ratio=l1_ratio),
                                          n_jobs=-1, scoring='accuracy', verbose=0)

pipe_log_reg_sentiment_dict = sklearn.pipeline.make_pipeline(PreprocessorTransformer(),
                                                              SentimentOpinionValueCalculatorSingleValueTransformer(dict_file),
                                                              # SentimentOpinionValueCounterTransformer(dict_file),
                                                              clf_log_reg_sentiment_dict)

gnb2 = GaussianNB()
clf_gaussian_nb_sentiment_dict = GridSearchCV(estimator=gnb2, param_grid=dict(var_smoothing=Cs), n_jobs=-1,
                                              scoring='accuracy')

pipe_gaussian_nb_sentiment_dict = sklearn.pipeline.make_pipeline(PreprocessorTransformer(),
                                                                  SentimentOpinionValueCalculatorSingleValueTransformer( dict_file),
                                                                  # SentimentOpinionValueCounterTransformer(dict_file),
                                                                  clf_gaussian_nb_sentiment_dict)

cnb = ComplementNB()
clf_complement_nb_sentiment_dict = GridSearchCV(estimator=cnb, param_grid=dict(alpha=Cs), n_jobs=-1,
                                                scoring='accuracy')

pipe_complement_nb_sentiment_dict = sklearn.pipeline.make_pipeline(PreprocessorTransformer(),
                                                                    SentimentOpinionValueCalculatorSingleValueTransformer(dict_file),
                                                                    # SentimentOpinionValueCounterTransformer(dict_file),
                                                                    clf_complement_nb_sentiment_dict)

mnb = MultinomialNB()
clf_multinomial_nb_sentiment_dict = GridSearchCV(estimator=mnb, param_grid=dict(alpha=Cs), n_jobs=-1,
                                                  scoring='accuracy')

pipe_multinomial_nb_sentiment_dict = sklearn.pipeline.make_pipeline(PreprocessorTransformer(),
                                                                    SentimentOpinionValueCalculatorSingleValueTransformer(dict_file),
                                                                    # SentimentOpinionValueCounterTransformer(dict_file),
                                                                    clf_multinomial_nb_sentiment_dict)

bnb = BernoulliNB()
clf_bernoulli_nb_sentiment_dict = GridSearchCV(estimator=bnb, param_grid=dict(alpha=Cs, binarize=Cs), n_jobs=-1,
                                                scoring='accuracy')

pipe_bernoulli_nb_sentiment_dict = sklearn.pipeline.make_pipeline(PreprocessorTransformer(),
                                                                  SentimentOpinionValueCalculatorSingleValueTransformer(dict_file),
                                                                  # SentimentOpinionValueCounterTransformer(dict_file),
                                                                  clf_bernoulli_nb_sentiment_dict)

print('Results for Sentiment-dict:')

y_test = y_lang_test
y_fit = y_lang

accuracy_log_reg_sentiment_dict = fit_and_predict_and_calculate_accuracy_pipe(pipe_log_reg_sentiment_dict,
                                                                              data_fit, y_fit, data_test, y_test)
print(f'Accuracy Logistic Regression for Sentiment-dict: {accuracy_log_reg_sentiment_dict}')

accuracy_gaussian_nb_sentiment_dict = fit_and_predict_and_calculate_accuracy_pipe(pipe_gaussian_nb_sentiment_dict,
                                                                                  data_fit, y_fit, data_test,
                                                                                  y_test)
print(f'Accuracy Gaussian Navie Bayes for Sentiment-dict: {accuracy_gaussian_nb_sentiment_dict}')

accuracy_complement_nb_sentiment_dict = fit_and_predict_and_calculate_accuracy_pipe(
    pipe_complement_nb_sentiment_dict,
    data_fit, y_fit, data_test, y_test)
print(f'Accuracy Complement Navie Bayes for Sentiment-dict: {accuracy_complement_nb_sentiment_dict}')

accuracy_multinomial_nb_sentiment_dict = fit_and_predict_and_calculate_accuracy_pipe(
    pipe_multinomial_nb_sentiment_dict,
    data_fit, y_fit, data_test, y_test)
print(f'Accuracy Multinomial Navie Bayes for Sentiment-dict: {accuracy_multinomial_nb_sentiment_dict}')

accuracy_bernoulli_nb_sentiment_dict = fit_and_predict_and_calculate_accuracy_pipe(pipe_bernoulli_nb_sentiment_dict,
                                                                                    data_fit, y_fit, data_test,
                                                                                    y_test)
print(f'Accuracy Bernoulie Navie Bayes for Sentiment-dict: {accuracy_bernoulli_nb_sentiment_dict}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Results for Sentiment-dict:
Accuracy Logistic Regression for Sentiment-dict: 0.6171875
Accuracy Gaussian Navie Bayes for Sentiment-dict: 0.625
Accuracy Complement Navie Bayes for Sentiment-dict: 0.267578125
Accuracy Multinomial Navie Bayes for Sentiment-dict: 0.611328125
Accuracy Bernoulie Navie Bayes for Sentiment-dict: 0.611328125
