In [1]:
import pandas as pd

# Load the CSV files into DataFrames
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
test_labels = pd.read_csv('data/test_labels.csv')

In [2]:
# Showing the first five elements of the DataFrame
training_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Showing the last five elements of the DataFrame
training_data.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \r\n\r\nThat...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \r\n\r\nUmm, theres no actual article ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\r\nAnd ... I really don't think you understa...",0,0,0,0,0,0


In [4]:
# Showing all unique values in the classification rows
print(f"Unique values in the row \"toxic\": {training_data['toxic'].unique()}")
print(f"Unique values in the row \"severe_toxic\": {training_data['severe_toxic'].unique()}")
print(f"Unique values in the row \"obscene\": {training_data['obscene'].unique()}")
print(f"Unique values in the row \"threat\": {training_data['threat'].unique()}")
print(f"Unique values in the row \"insult\": {training_data['insult'].unique()}")
print(f"Unique values in the row \"indentity_hate\": {training_data['identity_hate'].unique()}")

Unique values in the row "toxic": [0 1]
Unique values in the row "severe_toxic": [0 1]
Unique values in the row "obscene": [0 1]
Unique values in the row "threat": [0 1]
Unique values in the row "insult": [0 1]
Unique values in the row "indentity_hate": [0 1]


In [5]:
import nltk

# Ensure necessary NLTK resources are downloaded
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tompr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

def preprocess_text(text):
    "Proprocessing the comments of the raw data"

    # Convert text to lowercase
    text = text.lower()
    # removing usernames starting with '@'
    text = re.sub(r'@ ?\w+', '', text)
    # removing URL's
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # removing the substring "quot", which is an HTML entity for double quotation ("")
    text = re.sub(r'&quot', '', text)
    # removing all special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # removing all digits
    text = re.sub(r'\d', '', text)

    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a single string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [7]:
# applies the preprocessing function to all comments 
training_data['comment_text'] = training_data['comment_text'].apply(preprocess_text)

In [8]:
print(training_data)

                      id                                       comment_text  \
0       0000997932d777bf  explanation edits made username hardcore metal...   
1       000103f0d9cfb60f  aww match background colour seemingly stuck th...   
2       000113f07ec002fd  hey man really trying edit war guy constantly ...   
3       0001b41b1c6bb37e  make real suggestion improvement wondered sect...   
4       0001d958c54c6e35                      sir hero chance remember page   
...                  ...                                                ...   
159566  ffe987279560d7ff  second time asking view completely contradicts...   
159567  ffea4adeee384e90               ashamed horrible thing put talk page   
159568  ffee36eab5c267c9  spitzer umm there actual article prostitution ...   
159569  fff125370e4aaaf3  look like actually put speedy first version de...   
159570  fff46fc426af1f9a  really think understand came idea bad right aw...   

        toxic  severe_toxic  obscene  threat  insul

In [9]:
!pip install gensim




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
!pip install scikit-learn pandas




[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


def create_tfidf_vector(data : pd.core.frame.DataFrame)->any:
    "Creates the tf-idf vector and returns it"
    print(data["comment_text"].to_list()[0:5])
    vectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.8)
    tfidf_matrix = vectorizer.fit_transform(data["comment_text"].to_list())

    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

    return tfidf_df



In [12]:
#create_tfidf_vector(training_data)

In [13]:
# APPLY TF-IDF
#tfidf_vector = create_tfidf_vector(training_data)
#X = tfidf_vector.transform(training_data["comment_text"].to_list())

In [14]:
print(type(training_data.iloc[0]["comment_text"]))

<class 'str'>


In [15]:
""" TRAIN LINEAR REGRESSION MODEL WITH WORD2VEC VECTORIZATION"""
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

def train_word2vec_model(training_data : pd.core.frame.DataFrame)->any:
    "Trains the word2vec model and returns it"
    # converting datafram column into a list of strings
    tokenized_sentences = []
    for sentence in training_data["comment_text"].to_list():
        tokenized_sentences.append(sentence.split())

    # train word2vec model
    word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=10, window=5, min_count=1, workers=4)

    return word2vec_model

#def apply_word2vec(sentence : str)->list:
#    "uses word2vec to vectorize a sentence"
#    word_vectors = [word2vec_model.wv[word] for word in sentence.split() if word in word2vec_model.wv]
#    return word_vectors


def train_word2vec_model2(training_data: pd.DataFrame) -> np.ndarray:
    """Returns a 2D array of comment vectors."""
    # Konvertieren der DataFrame-Spalte in eine Liste von tokenisierten Sätzen
    tokenized_sentences = []
    for sentence in training_data["comment_text"].to_list():
        tokenized_sentences.append(sentence.split())

    # Trainiere das Word2Vec-Modell
    word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

    word_vectors = [word2vec_model.wv[word] for word in sentence.split() if word in word2vec_model.wv]

    print(word_vectors[0])
    return word_vectors

def train_word2vec_model3(training_data: pd.DataFrame) -> np.ndarray:
    """Trains the word2vec model and returns a 2D array of comment vectors."""
    
    # Konvertieren der DataFrame-Spalte in eine Liste von tokenisierten Sätzen
    tokenized_sentences = []
    for sentence in training_data["comment_text"].to_list():
        tokenized_sentences.append(sentence.split())

    # Trainiere das Word2Vec-Modell
    word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=10, window=5, min_count=1, workers=4)

    # Vektorisierung der Kommentare
    comment_vectors = []
    for sentence in tokenized_sentences:
        # Berechne den Durchschnitt der Wortvektoren für den Kommentar
        vector = np.mean([word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv], axis=0)
        if vector:  # Wenn es gültige Vektoren gibt
            vector = np.mean(vector, axis=0)
        else:
            vector = np.zeros(word2vec_model.vector_size)  # Nullvektor für leere Kommentare
        comment_vectors.append(vector)

    print(comment_vectors[0])
    print(type(comment_vectors))
    return comment_vectors

#word2vec_model = train_word2vec_model(training_data)

# APPLY WORD2VEC
#training_data['comment_text'] = training_data['comment_text'].apply(apply_word2vec)
#print(training_data)

y = training_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
x = train_word2vec_model2(training_data)

#print(x)
print(y)
# MultiOutputRegressor mit LinearRegression verwenden
model = MultiOutputRegressor(LinearRegression())

# Modell trainieren
model.fit(x, y)

[ 1.8169767  -0.6108579   0.16979116 -0.09903484 -0.09708491 -0.5431244
  0.4283447  -0.3792121   0.23857391 -1.2174037   1.4979075   0.56397825
  2.2040966   2.1479175   1.4826068   1.5506642   0.52407914 -1.4308376
  0.5862966  -0.766785   -0.7480898   0.2721963  -0.8174974  -0.3980106
 -0.76800126  1.3581961  -0.842817    0.5724889   1.196324    0.15859877
  2.3226752   4.214538   -0.32575265  1.115664    0.3816831   1.3138293
 -1.9307041   0.17833807 -1.4641396  -1.9683695  -1.1547443   1.1573691
  1.4563439   0.540093   -1.2101793   0.15461409  0.19593291 -0.65950453
 -0.06724482  1.3475893   0.34847748  1.4955986  -0.6007019  -1.0355235
  0.2700083   0.00491568 -0.2537499   0.23817182 -0.39250898  1.1629595
 -0.52304244  0.5311758   1.3853235   1.3904791   0.5483542  -1.5834527
  0.730506    2.0408247  -1.4593054   1.1617817  -0.16343307 -1.4585619
  0.4541922   0.43012965 -1.1717057  -0.01722904  1.2939589  -0.9045109
 -1.56305    -0.87455106  0.32462138  0.26582918 -1.158273   

ValueError: Found input variables with inconsistent numbers of samples: [18, 159571]

In [42]:
"""TRAIN LOGISTIC REGRESSION MODEL WITH WORD2VEC VECTORIZATION AND SPLITTING THE TRAINING DATA"""
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Tokenisieren der Texte (jeder Kommentar in Wörter zerlegen)
def tokenize(text):
    return word_tokenize(text.lower())

# Funktion, um den Vektor eines Textes zu berechnen (durch Mittelwert der Wortvektoren)
def vectorize_text(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)  # Falls keine Wörter im Modell vorkommen, Null-Vektor zurückgeben
    return np.mean(word_vectors, axis=0)

# Text und Labels trennen
y = training_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
X = training_data['comment_text']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenisierung auf alle Kommentare anwenden
X_train_tokens = X_train.apply(tokenize)
X_test_tokens = X_test.apply(tokenize)

# Word2Vec-Modell trainieren
model_w2v = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Texte vektorisieren
X_train_vec = np.array([vectorize_text(tokens, model_w2v) for tokens in X_train_tokens])
X_test_vec = np.array([vectorize_text(tokens, model_w2v) for tokens in X_test_tokens])

# Multiklassige Klassifikation (eine Klassifikation für jede Kategorie)
model_lr_w2v = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model_lr_w2v.fit(X_train_vec, y_train)

# Vorhersagen machen
y_pred = model_lr_w2v.predict(X_test_vec)

# Performance auswerten
for i, column in enumerate(y.columns):
    print(f'Accuracy für {column}: {accuracy_score(y_test[column], y_pred[:, i]):.4f}')


Accuracy für toxic: 0.9407
Accuracy für severe_toxic: 0.9898
Accuracy für obscene: 0.9669
Accuracy für threat: 0.9975
Accuracy für insult: 0.9621
Accuracy für identity_hate: 0.9905


In [45]:
"""Test model_lr_w2v with an example comment"""

def get_output_from_model_lr_w2v(model, model_vec, comment)->str:
    input = [vectorize_text(tokenize(comment),model_vec)]
    output = model.predict(input)
    print(output)
    return output

example_comment = "You are low key stupid"
get_output_from_model_lr_w2v(model_lr_w2v, model_w2v, example_comment)

[[1 0 0 0 0 0]]


In [17]:
"""TRAIN SVM WITH WORD2VEC VECTORIZATION AND SPLITTING THE TRAINING DATA"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

def tokenize(text):
    return word_tokenize(text.lower())

def vectorize_text(tokens, model):
    "Use word2vec model for vectorization"
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)  # Falls keine Wörter im Modell vorkommen, Null-Vektor zurückgeben
    return np.mean(word_vectors, axis=0)

# Text und Labels trennen
y = training_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
X = training_data['comment_text']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenisierung auf alle Kommentare anwenden
X_train_tokens = X_train.apply(tokenize)
X_test_tokens = X_test.apply(tokenize)

# Word2Vec-Modell trainieren
model_w2v = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Texte vektorisieren
X_train_vec = np.array([vectorize_text(tokens, model_w2v) for tokens in X_train_tokens])
X_test_vec = np.array([vectorize_text(tokens, model_w2v) for tokens in X_test_tokens])

# Multiklassige Klassifikation mit SVM (eine Klassifikation für jede Kategorie)
model_svm_w2v = MultiOutputClassifier(SVC(kernel='linear', C=1))  # SVM mit linearem Kernel und Regularisierung C=1
model_svm_w2v.fit(X_train_vec, y_train)

# Vorhersagen machen
y_pred = model_svm_w2v.predict(X_test_vec)

# Performance auswerten
for i, column in enumerate(y.columns):
    print(f'Accuracy für {column}: {accuracy_score(y_test[column], y_pred[:, i]):.4f}')


Accuracy für toxic: 0.9407
Accuracy für severe_toxic: 0.9899
Accuracy für obscene: 0.9674
Accuracy für threat: 0.9977
Accuracy für insult: 0.9625
Accuracy für identity_hate: 0.9908


In [None]:
"""Test model_svm_w2v with an example comment"""

def get_output_from_model_swm_w2v(model, model_vec, comment)->str:
    input = [vectorize_text(tokenize(comment),model_vec)]
    output = model.predict(input)
    print(output)
    return output

example_comment = "You are low key stupid"
get_output_from_model_swm_w2v(model_svm_w2v, model_w2v, example_comment)

NameError: name 'model_svm_w2v' is not defined

In [48]:
"""TRAIN LOGISTIC REGRESSION MODEL WITH TF-IDF VECTORIZATION"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report


def create_tfidf_vector(data : pd.core.frame.DataFrame)->any:
    "Creates the tf-idf vector and returns it"
    vectorizer = TfidfVectorizer(max_features=15000, min_df=2, max_df=0.8)
    tfidf_matrix = vectorizer.fit_transform(data["comment_text"].to_list())

    return tfidf_matrix

# Schritt 1: Vektorisierung der Kommentare mit TF-IDF
X = create_tfidf_vector(training_data)

# Schritt 2: Zielvariablen definieren
y = training_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# Schritt 3: Datensatz aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Schritt 4: Erstelle und trainiere das Modell
model_lr_tfidf = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))

# Trainiere das Modell
model_lr_tfidf.fit(X_train, y_train)

# Schritt 5: Vorhersagen treffen
y_pred = model_lr_tfidf.predict(X_test)

# Schritt 6: Evaluierung der Modellleistung
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))


Classification Report:
                precision    recall  f1-score   support

        toxic       0.91      0.62      0.74      3056
 severe_toxic       0.59      0.26      0.36       321
      obscene       0.91      0.64      0.75      1715
       threat       0.64      0.12      0.20        74
       insult       0.82      0.51      0.63      1614
identity_hate       0.74      0.16      0.26       294

    micro avg       0.88      0.56      0.68      7074
    macro avg       0.77      0.38      0.49      7074
 weighted avg       0.87      0.56      0.67      7074
  samples avg       0.06      0.05      0.05      7074



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [52]:
"""Test model_lr_tfidf with an example comment"""
def get_output_from_model_lr_tfidf(model_lr_tfidf, data : pd.core.frame.DataFrame, comment)->any:
    "Creates the tf-idf vector and returns it"
    vectorizer = TfidfVectorizer(max_features=15000, min_df=2, max_df=0.8)
    tfidf_vectorizer = vectorizer.fit(data["comment_text"].to_list())
    tfidf_matrix_comment = tfidf_vectorizer.transform([comment])

    output = model_lr_tfidf.predict(tfidf_matrix_comment)
    print(output)
    return output

# Example usage:
comment = "You're so stupid and worthless!"
result = get_output_from_model_lr_tfidf(model_lr_tfidf, training_data , comment)


[[1 0 0 0 0 0]]


In [None]:
"""TRAIN SVM MODEL WITH TF-IDF VECTORIZATION"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report

def create_tfidf_vector(data: pd.core.frame.DataFrame) -> any:
    "Creates the tf-idf vector and returns it"
    vectorizer = TfidfVectorizer(max_features=15000, min_df=2, max_df=0.8)
    tfidf_matrix = vectorizer.fit_transform(data["comment_text"].to_list())

    return tfidf_matrix

# Step 1: Vectorization of comments using TF-IDF
X = create_tfidf_vector(training_data)

# Step 2: Define the target variables
y = training_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

# Step 3: Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Create and train the model
model_svm_tfidf = MultiOutputClassifier(SVC(kernel='linear', C=1, random_state=42))

# Train the model
model_svm_tfidf.fit(X_train, y_train)

# Step 5: Make predictions
y_pred = model_svm_tfidf.predict(X_test)

# Step 6: Evaluate the model's performance
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))


In [53]:
"""Test model_svm_tfidf with an example comment"""
def get_output_from_model_svm_tfidf(model_svm_tfidf, data : pd.core.frame.DataFrame, comment)->any:
    "Creates the tf-idf vector and returns it"
    vectorizer = TfidfVectorizer(max_features=15000, min_df=2, max_df=0.8)
    tfidf_vectorizer = vectorizer.fit(data["comment_text"].to_list())
    tfidf_matrix_comment = tfidf_vectorizer.transform([comment])

    output = model_svm_tfidf.predict(tfidf_matrix_comment)
    print(output)
    return output

# Example usage:
comment = "You're so stupid and worthless!"
result = get_output_from_model_svm_tfidf(model_svm_tfidf, training_data , comment)


NameError: name 'model_svm_tfidf' is not defined

In [22]:
"""TRAIN LOGISTIC REGRESSION MODEL WITH WORD2VEC VECTORIZATION"""
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Funktion, um den Vektor eines Textes zu berechnen (durch Mittelwert der Wortvektoren)
def vectorize_text(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)  # Falls keine Wörter im Modell vorkommen, Null-Vektor zurückgeben
    return np.mean(word_vectors, axis=0)

# Tokenisieren der Texte (jeder Kommentar in Wörter zerlegen)
def tokenize(text):
    return word_tokenize(text.lower())

# Text und Labels trennen
y_train = training_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
X_train = training_data['comment_text']


# Tokenisierung auf alle Kommentare anwenden
X_train_tokens = X_train.apply(tokenize)

# Word2Vec-Modell trainieren
model_w2v = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Texte vektorisieren
X_train_vec = np.array([vectorize_text(tokens, model_w2v) for tokens in X_train_tokens])

# Multiklassige Klassifikation (eine Klassifikation für jede Kategorie)
model = MultiOutputClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_vec, y_train)


In [None]:
"""TRAIN SUPPORT VECTOR MACHINE WITH WORD2VEC VECTORIZATION"""
from sklearn.svm import SVC

# Funktion, um den Vektor eines Textes zu berechnen (durch Mittelwert der Wortvektoren)
def vectorize_text(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)  # Falls keine Wörter im Modell vorkommen, Null-Vektor zurückgeben
    return np.mean(word_vectors, axis=0)

# Tokenisieren der Texte (jeder Kommentar in Wörter zerlegen)
def tokenize(text):
    return word_tokenize(text.lower())

# Text und Labels trennen
y_train = training_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
X_train = training_data['comment_text']


# Tokenisierung auf alle Kommentare anwenden
X_train_tokens = X_train.apply(tokenize)

# Word2Vec-Modell trainieren
model_w2v = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Texte vektorisieren
X_train_vec = np.array([vectorize_text(tokens, model_w2v) for tokens in X_train_tokens])

# Multiklassige Klassifikation mit SVM (eine Klassifikation für jede Kategorie)
svm_model_w2v = MultiOutputClassifier(SVC(kernel='linear', C=1))  # SVM mit linearem Kernel und Regularisierung C=1
svm_model_w2v.fit(X_train_vec, y_train)

