<a href="https://colab.research.google.com/github/PascalBreuer/inl-meet-ir-v2/blob/main/ReadingDatasheet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import pandas and mount Google Drive at /content/gdrive.
import pandas as pd
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).



# Daten in Training-, Test- und Validation-Menge aufteilen


In [None]:
# Import pandas under its conventional alias.
import pandas as pd

# Directory that holds the project files. Kept in one variable so the path
# only has to be written (and changed) in a single place.
file_path = '/content/gdrive/MyDrive/Praxisprojekt/'

# Google Drive is mounted again here so that this cell can be run on its own.
from google.colab import drive
drive.mount('/content/gdrive')

# Read the workbook; only the sheet named 'sentences' is loaded.
test_data = pd.read_excel(file_path + 'Trainingdata.xlsx', sheet_name='sentences')

# Distinct article IDs. The data is split by article, not by sentence, so
# the number of IDs is the number of units being split.
ids = test_data.ID.unique()

# One DataFrame per article: all rows whose ID column equals that article's
# ID (e.g. every sentence of article '1A-B') end up in the same list element.
texts = [test_data[test_data.ID == article_id] for article_id in ids]

# Number of articles.
id_count = len(texts)

# Split ratios, kept as variables so a different split only needs a change here.
train_percent = .7
test_percent = .15
validation_percent = .15

# Number of articles per split. int() truncates because the products are floats.
# validation_count is informational only: the validation split below takes
# every article left over after the training and test splits.
train_count = int(id_count * train_percent)
test_count = int(id_count * test_percent)
validation_count = int(id_count * validation_percent)

# Slice the article list into the three consecutive splits
# (see https://docs.python.org/3/tutorial/datastructures.html for slicing).
train_set = texts[:train_count]
test_set = texts[train_count:train_count + test_count]
validation_set = texts[train_count + test_count:]


def _concat_articles(article_frames):
    """Stack per-article DataFrames into one DataFrame, keeping row order.

    Uses pd.concat because DataFrame.append was removed in pandas 2.0.
    An empty split yields an empty frame with the source columns instead of
    raising, so very small datasets can still be written out.
    """
    if not article_frames:
        return test_data.iloc[:0]
    return pd.concat(article_frames)


# One DataFrame per split so that each can be written with pandas.
train_frame = _concat_articles(train_set)
test_frame = _concat_articles(test_set)
validation_frame = _concat_articles(validation_set)

# Write the three splits next to the source workbook.
train_frame.to_excel(file_path + 'Trainingdata_train.xlsx', index=False, sheet_name='sentences')
test_frame.to_excel(file_path + 'Trainingdata_test.xlsx', index=False, sheet_name='sentences')
validation_frame.to_excel(file_path + 'Trainingdata_validation.xlsx', index=False, sheet_name='sentences')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Pipeline erzeugen und customized Transformer

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin

# Man kann auch mehrere hintereinander machen

class SentenceTransformer(BaseEstimator, TransformerMixin):
    """Pass-through transformer that prints a message on each lifecycle call.

    It is a skeleton for a custom pipeline step: transform() currently
    returns its input unchanged.
    """

    # Was misspelled `__init`, so Python never called it on construction.
    def __init__(self):
        print('init() called')

    def fit(self, X, y=None):
        """Print a trace message and return self; nothing is learned."""
        print('fit() called')
        return self

    def transform(self, X, y=None):
        """Print a trace message and return X unchanged."""
        print('transformed called')
        # This is where the sentences can later be processed and turned
        # into numeric values.
        return X
    
    

import sklearn.pipeline
from sklearn import datasets, linear_model

print('create Pipline')

# Variant 1: Pipeline with explicitly named steps.
named_steps = [
    ('sentence_trans', SentenceTransformer()),
    ('sgd', linear_model.SGDClassifier()),
]
pipe = Pipeline(steps=named_steps)

# Variant 2: make_pipeline derives the step names from the class names.
pipe2 = sklearn.pipeline.make_pipeline(
    SentenceTransformer(),
    linear_model.LinearRegression(),
)

# Fit both pipelines on the iris data set to exercise the custom step.
iris = datasets.load_iris()
features, labels = iris.data, iris.target

pipe.fit(features, labels)
pipe2.fit(features, labels)


create Pipline
fit() called
transformed called
fit() called
transformed called


Pipeline(memory=None,
         steps=[('sentencetransformer', SentenceTransformer()),
                ('linearregression',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

# Daten preprocessen

In [None]:
#https://blog.cambridgespark.com/tutorial-preprocessing-text-data-a8969189b779
import pandas as pd
import re
import sklearn
import string
import nltk
import spacy
nltk.download('stopwords')

# Load the training split and turn every cell of the 'Sentence' column into a str.
# NOTE(review): str() presumably guards against non-string cells such as NaN - confirm.
data = pd.read_excel('/content/gdrive/MyDrive/Praxisprojekt/Trainingdata_train.xlsx', sheet_name = 'sentences')
sentences = [str(raw) for raw in data['Sentence'].tolist()]
print("Sentences:")
print(sentences)

# Lower-case everything.
sentences = [text.lower() for text in sentences]
print("Lower-case Sentences:")
print(sentences)

# Strip every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)
sentences = [text.translate(table) for text in sentences]
print("Sentences without punctuation:")
print(sentences)

# Numbers are not replaced separately; the named-entity step below is
# relied on for that (per the original author's note).

# Remove named entities: keep only tokens spaCy assigns no entity type.
# (Alternative considered by the author: replace them with the token 'ne'.)
nlp = spacy.load("en_core_web_sm")
sentences2 = []
for text in sentences:
    kept_tokens = [token.text for token in nlp(text) if not token.ent_type]
    sentences2.append(" ".join(kept_tokens))
print("Sentences without named entities")
print(sentences2)

# Remove English stop words. These three names are not recognised as named
# entities, so they are filtered out here as well.
extra_stopwords = ['liam', 'hemsworth', 'cyrus']
stopwords = set(nltk.corpus.stopwords.words('english') + extra_stopwords)
sentences2 = [[word for word in text.split() if word not in stopwords] for text in sentences2]
print("Sentences without stopwords:")
print(sentences2)

# Reduce every remaining word to its Porter stem.
stemmer = nltk.stem.PorterStemmer()
sentences2 = [list(map(stemmer.stem, words)) for words in sentences2]
print("Sentences with stemming:")
print(sentences2)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Sentences:
Lower-case Sentences:
Sentences without punctuation:
Sentences without named entities
Sentences without stopwords:
Sentences with stemming:
[['fight', 'fals', 'rumor', 'swirl', 'feud', 'suppos', 'prenup'], ['fight'], ['new', 'report', 'claim', 'huge', 'fight', 'suppos', 'prenup', 'make', 'wed', 'plan'], ['accord', 'report', 'plan', 'wed', 'summer'], ['reportedli', 'tension', 'prenup'], ['reportedli', 'told', 'want', 'sign', 'prenup', 'got', 'marri', 'could', 'protect', 'net', 'worth'], ['reportedli', 'shock', 'annoy', 'accus', 'trust'], ['get', 'marri', 'summer', 'click', 'detail'], ['sourc', 'say', 'fight', 'prenup', 'photo', 'splash', 'news'], ['problem'], ['report', 'true'], ['sourc', 'told', 'gossip', 'cop', 'nt', 'drama', 'prenup'], ['nt', 'even', 'plan', 'get', 'marri', 'summer'], ['far', 'time', 'face', 'fals', 'rumor', 'relationship', 'face', 'consta

# Bag-of-Words: Dictionary aus Text erzeugen

In [None]:
#INPUT: a list of sentences that have already been split into words
#WANTED OUTPUT: a list of dictionaries
#PREDICTED CODE LENGTH: Like 5 lines or something idk
import pandas as pd
data = pd.read_excel('/content/gdrive/MyDrive/Praxisprojekt/Trainingdata_train.xlsx', sheet_name = 'sentences')     # only the training split (Trainingdata_train) is used
sentences = data['Sentence'].tolist()

#primitive Funktion, um Sätze in Wörter (ohne Satzzeichen) zu splitten
#tötet Umlaute und andere Sonderzeichen
#sollte definitiv noch ersetzt werden
import re
def makeWordList(s):
  """Split the string s into a list of words.

  Every character that is not a word character (regex \\w: letters, digits,
  underscore) is replaced by a space, then the result is split on
  whitespace. Returns an empty list when s contains no word characters.
  """
  # Raw string: "\w" inside a normal string literal is an invalid escape
  # sequence and triggers a warning on recent Python versions.
  return re.sub(r"[^\w]", " ", s).split()

#Variante, die eine Liste von Sätzen annimmt
def makeWordListMulti(s):
  """Return the words of all sentences in s as one flat list.

  s is an iterable of strings; each one is split with makeWordList and the
  resulting words are concatenated in order.
  """
  result = []
  for sentence in s:
    # extend() grows the list in place; `result = result + ...` copied the
    # whole list on every iteration.
    result.extend(makeWordList(sentence))
  return result

#Dictionary aus Wörterliste erstellen
from collections import Counter
def makeDictionary(words):
    """Return a plain dict mapping each word in words to its number of occurrences.

    Keys appear in order of first occurrence. An empty input gives {}.
    """
    # Counter does the counting; dict() converts it to a plain dict.
    return dict(Counter(words))



#vvvvvvvvvvvvvvvvvv AB HIER TESTING vvvvvvvvvvvvvvvvvv

# Try the helpers on the first sentence of the training split.
wordList = makeWordList(sentences[0])
print("Created list of words:")
print(wordList)

dic = makeDictionary(wordList)
print("Created dictionary:")
print(dic)

print("Teste Variante mit ganzen Artikeln...")

# Group by article ID and keep only the sentence strings of each article.
# The loop variable is named article_id instead of `id` so that the builtin
# id() is not overwritten in the notebook's global namespace.
ids = data.ID.unique()
articles = [data[data.ID == article_id]['Sentence'].tolist() for article_id in ids]

# Again try it on the first article of the training split.
wordList = makeWordListMulti(articles[0])
dic = makeDictionary(wordList)
print("Large Dictionary incoming:")
print(dic)

Created list of words:
['miley', 'and', 'liam', 'fighting', 'false', 'rumors', 'swirl', 'that', 'theyre', 'in', 'a', 'feud', 'over', 'a', 'supposed', 'prenup']
Created dictionary:
{'miley': 1, 'and': 1, 'liam': 1, 'fighting': 1, 'false': 1, 'rumors': 1, 'swirl': 1, 'that': 1, 'theyre': 1, 'in': 1, 'a': 2, 'feud': 1, 'over': 1, 'supposed': 1, 'prenup': 1}
Teste Variante mit ganzen Artikeln...
Large Dictionary incoming:
{'miley': 9, 'and': 29, 'liam': 7, 'fighting': 5, 'false': 2, 'rumors': 6, 'swirl': 1, 'that': 8, 'theyre': 4, 'in': 5, 'a': 12, 'feud': 1, 'over': 6, 'supposed': 2, 'prenup': 6, 'are': 4, 'cyrus': 15, 'hemsworth': 17, 'new': 1, 'report': 3, 'claims': 1, 'had': 1, 'huge': 1, 'fight': 1, 'their': 8, 'while': 2, 'they': 10, 'were': 3, 'making': 1, 'wedding': 6, 'plans': 2, 'according': 1, 'to': 10, 'the': 14, '24': 1, '27': 1, 'have': 4, 'been': 3, 'planning': 1, 'himalayan': 2, 'mountains': 2, 'this': 5, 'summer': 3, 'but': 3, 'there': 3, 'has': 1, 'reportedly': 3, 'tensio

# Teile zusammenführen

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import re
import sklearn
import string
import nltk
nltk.download('stopwords')
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score



class PreprocessorTransformer(BaseEstimator, TransformerMixin):
    """Turn the 'Sentence' column of a DataFrame into lists of word stems.

    transform() applies, in order: str() conversion, lower-casing, removal of
    the characters in string.punctuation, removal of tokens that spaCy tags
    with an entity type, removal of English stop words plus three hard-coded
    names, and Porter stemming.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def _get_nlp(self):
        """Return the spaCy pipeline, loading it only on first use."""
        # Local import so this cell does not rely on another cell having
        # imported spacy.
        import spacy

        nlp = getattr(self, '_nlp', None)
        if nlp is None:
            # Loading the model is costly; keep it on the instance instead of
            # reloading it on every transform() call.
            nlp = self._nlp = spacy.load("en_core_web_sm")
        return nlp

    def transform(self, data, y=None):
        """Return one list of stemmed tokens per row of data['Sentence']."""
        # Convert to str, lower-case and strip punctuation in one pass.
        table = str.maketrans('', '', string.punctuation)
        sentences = [str(s).lower().translate(table) for s in data['Sentence'].tolist()]

        # Numbers are not replaced by a placeholder here.

        # Remove named entities: keep only tokens without an entity type.
        # NOTE(review): entity recognition runs on lower-cased text without
        # punctuation; presumably that is why some names must be filtered by
        # hand below - confirm.
        nlp = self._get_nlp()
        sentences2 = []
        for s in sentences:
            kept_tokens = [item.text for item in nlp(s) if not item.ent_type]
            sentences2.append(" ".join(kept_tokens))

        # Remove stop words (NLTK English list plus three names), then stem
        # every remaining word.
        stopwords = set(nltk.corpus.stopwords.words('english') + ['liam', 'hemsworth', 'cyrus'])
        stemmer = nltk.stem.PorterStemmer()
        return [
            [stemmer.stem(word) for word in s.split() if word not in stopwords]
            for s in sentences2
        ]
    



class BagOfWordsForEachSentenceTransformer(BaseEstimator, TransformerMixin):
    """Build one word-count dictionary per tokenised sentence."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, sentences, y=None):
        """Return (sentences, dics), where dics[i] maps each token of
        sentences[i] to its number of occurrences in that sentence.
        """
        # Local import so this cell does not depend on Counter having been
        # imported by another cell.
        from collections import Counter

        print('transformed called')

        # One dictionary per sentence. The original initialised each entry as
        # a list (`dic = []`), which fails on `dic[key] = ...`.
        dics = [dict(Counter(s)) for s in sentences]

        return (sentences, dics)
    
    
class BagOfWordsTransformer(BaseEstimator, TransformerMixin):
    """Collect one corpus-wide word-count dictionary from tokenised sentences.

    The dictionary is learned in fit(), so refitting on new data refreshes
    it. transform() passes the sentences through together with the
    dictionary.
    """

    def __init__(self):
        # Corpus-wide token counts, and a flag telling whether they still
        # have to be learned.
        self.bigdict = {}
        self.training = True

    @staticmethod
    def _count_tokens(sentences):
        """Return a dict of total occurrences per token, in first-seen order."""
        from collections import Counter

        totals = Counter()
        for s in sentences:
            # update() adds the full per-sentence count. The original stored 1
            # for a token's first sentence even if it occurred several times.
            totals.update(s)
        return dict(totals)

    def fit(self, X, y=None):
        """Learn the token counts from X (an iterable of token lists)."""
        self.bigdict = self._count_tokens(X)
        self.training = False
        return self

    def transform(self, sentences, y=None):
        """Return (sentences, bigdict).

        If fit() was never called, the counts are learned from the first
        batch passed here, as in the original implementation.
        """
        if self.training:
            self.bigdict = self._count_tokens(sentences)
            self.training = False

        return (sentences, self.bigdict)


class SentenceToVectorTransformer(BaseEstimator, TransformerMixin):
    """Encode tokenised sentences as 0/1 vectors over a vocabulary dict."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns self."""
        return self

    def transform(self, sentence_dict_tuple, y=None):
        """Return one list of 0/1 flags per sentence.

        sentence_dict_tuple is a pair (sentences, vocabulary dict). Position
        j of each vector is 1 if the j-th vocabulary key occurs in the
        sentence, else 0. Keys are used in the dict's iteration order; they
        are not sorted.
        """
        # Unpack the tuple.
        sentence_fragments, dicts = sentence_dict_tuple
        vocabulary = list(dicts)
        retval = []
        for sentence_fragment in sentence_fragments:
            # Set lookup instead of scanning the token list once per key.
            # Assumes each sentence is a collection of hashable tokens, as
            # produced by the preprocessing step.
            present = set(sentence_fragment)
            retval.append([1 if key in present else 0 for key in vocabulary])
        return retval


# `import sklearn` alone does not guarantee that these submodules are loaded,
# so import them explicitly to keep this cell runnable on its own.
import sklearn.linear_model
import sklearn.pipeline

# Build the pipelines: same preprocessing steps, different final classifier.
pipe_LR = sklearn.pipeline.make_pipeline(PreprocessorTransformer(), BagOfWordsTransformer(), SentenceToVectorTransformer(), sklearn.linear_model.LogisticRegression())
pipe_NB = sklearn.pipeline.make_pipeline(PreprocessorTransformer(), BagOfWordsTransformer(), SentenceToVectorTransformer(), GaussianNB())

# Load the data sets and drop the label columns that are not used here.
unused_label_columns = ['SUBJindl', 'SUBJsrce', 'SUBJrhet', 'SUBJster', 'SUBJspee', 'SUBJinspe', 'SUBJprop', 'SUBJpolit']

data_train = pd.read_excel('/content/gdrive/MyDrive/Praxisprojekt/Trainingdata_train.xlsx', sheet_name = 'sentences')
data_train = data_train.drop(columns=unused_label_columns)

data_test = pd.read_excel('/content/gdrive/MyDrive/Praxisprojekt/Trainingdata_test.xlsx', sheet_name = 'sentences')
data_test = data_test.drop(columns=unused_label_columns)


# Target column SUBJlang. accuracy_score is called as (y_true, y_pred), the
# documented argument order.
print("results for sentiment:")
y_lang = data_train.SUBJlang.to_numpy().astype(int)
pipe_LR.fit(data_train, y_lang)
y_lang_test = data_test.SUBJlang.to_numpy()
y_pred_LR = pipe_LR.predict(data_test)
accuracy_LR = accuracy_score(y_lang_test, y_pred_LR)
print(f'Accuracy logistic Regression: {accuracy_LR}')
pipe_NB.fit(data_train, y_lang)
y_pred_NB = pipe_NB.predict(data_test)
accuracy_NB = accuracy_score(y_lang_test, y_pred_NB)
print(f'Accuracy Naive Bayes: {accuracy_NB}')


# Target column SUBJopin; both pipelines are refitted on the new labels.
print("results for opinion:")
y_opin = data_train.SUBJopin.to_numpy().astype(int)
pipe_LR.fit(data_train, y_opin)
y_opin_test = data_test.SUBJopin.to_numpy()
y_pred_LR = pipe_LR.predict(data_test)
accuracy_LR = accuracy_score(y_opin_test, y_pred_LR)
print(f'Accuracy logistic Regression: {accuracy_LR}')
pipe_NB.fit(data_train, y_opin)
y_pred_NB = pipe_NB.predict(data_test)
accuracy_NB = accuracy_score(y_opin_test, y_pred_NB)
print(f'Accuracy Naive Bayes: {accuracy_NB}')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
results for sentiment:
Accuracy logistic Regression: 0.529296875
Accuracy Naive Bayes: 0.4453125
results for opinion:
Accuracy logistic Regression: 0.650390625
Accuracy Naive Bayes: 0.580078125
