!pip install yake
!pip install matplotlib
!pip install spacy
!pip install nltk
!python -m spacy download pt_core_news_sm

In [None]:
!pip install yake
!pip install matplotlib
!pip install spacy
!pip install nltk
!python -m spacy download pt_core_news_sm

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import yake
import re
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import nltk
import spacy
from collections import Counter
# Load the Portuguese spaCy pipeline once; lemmatizar_tokens uses this
# module-level `nlp` object.
nlp = spacy.load('pt_core_news_sm')
# Fetch the NLTK resources used by this notebook.
# NOTE(review): WordNetLemmatizer is imported but not used in the visible
# cells, so the 'wordnet' download may be unnecessary -- confirm.
nltk.download('wordnet')
# Provides the list read by stopwords.words('portuguese') in remove_stopwords.
nltk.download('stopwords')
# Presumably required by word_tokenize (called in tokenizar) -- confirm
# against the installed NLTK version.
nltk.download('punkt')

In [None]:
_stop_words_cache = None


def _get_stop_words():
    """Return the stop-word set used by ``remove_stopwords``, built once.

    The set is the NLTK Portuguese stop-word list plus the two extra tokens
    ``"user"`` and ``"the"``. It is created on the first call and reused
    afterwards, so applying ``remove_stopwords`` to every row of a DataFrame
    does not rebuild the set for each row.
    """
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = (
            frozenset(stopwords.words('portuguese')) | {"user", "the"}
        )
    return _stop_words_cache


def remove_stopwords(text):
    """Strip URLs, punctuation and stop words from ``text``.

    Steps, in order:
      1. remove substrings starting with ``http``, ``www`` or ``https`` up
         to the next whitespace;
      2. remove every character that is neither a word character nor
         whitespace, then lowercase the result;
      3. split on whitespace and keep only words longer than one character
         that are not in the stop-word set.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving words joined by single spaces (``""`` if none remain).
    """
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'[^\w\s]', '', text).lower()
    stop_words = _get_stop_words()
    return ' '.join(
        word for word in text.split()
        if len(word) > 1 and word not in stop_words
    )

In [None]:
def tokenizar(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize(text)`` returns for the given string,
        called with its default arguments.
    """
    return word_tokenize(text)

In [None]:
def lemmatizar_tokens(tokens):
    """Return the lemma of every token in ``tokens``.

    The tokens are wrapped in a spaCy ``Doc`` that shares the vocabulary of
    the module-level ``nlp`` pipeline, and that ``Doc`` is then passed
    through ``nlp``.

    Args:
        tokens: A sequence of token strings.

    Returns:
        A list with the ``lemma_`` of each token in the processed ``Doc``.
    """
    processed = nlp(spacy.tokens.Doc(nlp.vocab, words=tokens))
    return [item.lemma_ for item in processed]

In [None]:
def preprocess_data(data_path):
    """Load the comment dataset and run the text-cleaning pipeline on it.

    The CSV must contain a ``text`` column (the raw comment) and a ``toxic``
    column (the label). Each comment goes through ``remove_stopwords``,
    ``tokenizar`` and ``lemmatizar_tokens``, and the resulting lemmas are
    joined back into a single space-separated string.

    Args:
        data_path: Path (or any source accepted by ``pd.read_csv``) of the
            CSV file.

    Returns:
        A tuple ``(text_final, toxic)`` of two pandas Series sharing the same
        index: the preprocessed text and the label column.
    """
    df = pd.read_csv(data_path)
    # Only the two columns used below decide whether a row is usable; a
    # missing value in an unrelated column must not discard a labelled
    # comment, which a bare dropna() would do.
    df.dropna(subset=['text', 'toxic'], inplace=True)
    df['text_processed'] = df['text'].apply(remove_stopwords)
    df['text_tokens'] = df['text_processed'].apply(tokenizar)
    df['text_lemmatized'] = df['text_tokens'].apply(lemmatizar_tokens)
    df['text_final'] = df['text_lemmatized'].apply(' '.join)
    return df['text_final'], df['toxic']

In [None]:
def train_and_evaluate_model(X, y, test_size=0.2, random_state=None):
    """Train a bag-of-words Naive Bayes classifier and report its accuracy.

    The data is split into train and test parts, a ``CountVectorizer`` is
    fitted on the training text only, and a ``MultinomialNB`` model is
    trained on the resulting counts. Accuracy on the held-out part is
    printed and returned.

    Args:
        X: Iterable of text documents.
        y: Labels aligned with ``X``.
        test_size: Passed to ``train_test_split``; defaults to ``0.2``.
        random_state: Passed to ``train_test_split``. ``None`` (the default)
            gives a different split on each run; pass an int for a
            reproducible split.

    Returns:
        A tuple ``(clf, vectorizer, accuracy)`` with the fitted classifier,
        the fitted vectorizer and the test-set accuracy.
    """
    # The original passed the undefined name `none`, which raised NameError
    # before any training could happen.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    vectorizer = CountVectorizer()
    # Fit the vocabulary on the training split only, then reuse it for the
    # test split.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    clf = MultinomialNB()
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print("Acurácia do modelo:", accuracy)
    return clf, vectorizer, accuracy

In [None]:
# Run the preprocessing pipeline on the dataset: X receives the cleaned
# text and y the `toxic` column. Later cells reuse X.
X, y = preprocess_data('comentarios_toxicos_ptBR.csv')
# Train and score the classifier; the accuracy is printed by the function.
train_and_evaluate_model(X, y)

In [None]:
def plot_keyword_scores(keywords):
    """Draw a bar chart of ``(label, score)`` pairs in ascending score order.

    The input is not modified: a sorted copy is plotted, so the caller's
    list keeps its order and any iterable of pairs (including a tuple) is
    accepted.

    Args:
        keywords: Iterable of two-item sequences whose first item is the bar
            label and whose second item is the numeric bar height.

    Returns:
        None. Nothing is drawn when ``keywords`` is empty.
    """
    ordered = sorted(keywords, key=lambda pair: pair[1])
    if not ordered:
        # zip(*[]) yields nothing to unpack; skip plotting instead of
        # failing with a ValueError.
        return
    labels, scores = zip(*ordered)
    plt.figure(figsize=(20, 10))
    plt.bar(labels, scores)
    plt.xlabel("Keywords")
    plt.ylabel("Scores")
    plt.title("Keyword Scores")
    plt.show()

In [None]:
def extract_keywords(data, language='pt', n=1, k=20):
    """Extract keywords from a collection of documents with YAKE.

    All entries of ``data`` are converted to strings and joined with single
    spaces into one text, which is handed to a ``yake.KeywordExtractor``.

    Args:
        data: Object exposing ``astype(str)`` whose result is iterable
            (e.g. a pandas Series of documents).
        language: Passed to the extractor as ``lan``.
        n: Passed to the extractor as ``n``.
        k: Passed to the extractor as ``top``.

    Returns:
        The extractor's results as a list sorted with Python's default
        ordering. Presumably these are ``(keyword, score)`` tuples, which
        would sort alphabetically by keyword -- confirm against the YAKE
        documentation.
    """
    corpus = ' '.join(data.astype(str))
    extractor = yake.KeywordExtractor(lan=language, n=n, top=k)
    return sorted(extractor.extract_keywords(corpus))

In [None]:
# Extract keywords from the whole preprocessed corpus using the function's
# defaults (language='pt', n=1, k=20) and print each returned entry.
keywords = extract_keywords(X)
for kw in keywords:
    print(kw)

In [None]:
# Bar chart of the extracted keywords, ordered by score.
plot_keyword_scores(keywords)

In [None]:
# Flatten every preprocessed document into one list of lowercase words
# (documents are split on whitespace).
words = [word.lower() for sentence in X for word in sentence.split()]
# Count occurrences and keep the 20 most frequent (word, count) pairs.
word_freq = Counter(words)
most_common_words = word_freq.most_common(20)
for m_w in most_common_words:
    print(m_w)

In [None]:
# Reuse the bar-chart helper for the (word, count) pairs; the bar heights
# here are raw counts even though the helper labels the y-axis "Scores".
plot_keyword_scores(most_common_words)