In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import spacy
import re
import yake
from textblob import TextBlob
from autocorrect import Speller
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
# Shared NLP resources, loaded once and reused by the helper functions in this notebook.
# spaCy's small Portuguese pipeline (used for tokens, stop-word flags and lemmas).
nlp = spacy.load('pt_core_news_sm')
# The corpus is Portuguese, so the spell checker must use the Portuguese
# dictionary; Speller() with no arguments falls back to English.
spell = Speller(lang='pt')

In [None]:
def remove_stopwords(text):
    """Lower-case `text` and drop stop words, punctuation and noise tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is discarded when spaCy flags it as a stop word or as punctuation, or when
    it consists solely of the characters listed in the pattern below. The
    surviving tokens are lower-cased and joined with single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        str: The cleaned, space-separated, lower-cased text.
    """
    # Tokens made up only of these characters are treated as noise
    # (presumably encoding residue in the source data - TODO confirm).
    noise_pattern = r'^[@¦\x9f\x92â\x80]+$'

    kept = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        if re.match(noise_pattern, token.text):
            continue
        kept.append(token.text.lower())
    return ' '.join(kept)


In [None]:
def tokenizar(text):
    """Split `text` into tokens using the module-level spaCy pipeline ``nlp``.

    Args:
        text: Text to tokenise.

    Returns:
        list[str]: The text of every token, in document order.
    """
    return [tok.text for tok in nlp(text)]

In [None]:
def lemmatizar_tokens(tokens):
    """Return the lemma of each entry in `tokens`.

    The words are wrapped in a spaCy ``Doc`` that shares the vocabulary of the
    module-level pipeline ``nlp``; that ``Doc`` is then passed through ``nlp``
    and the ``lemma_`` of every resulting token is collected.

    Args:
        tokens: Sequence of token strings (already split).

    Returns:
        list[str]: One lemma per token, in the same order.
    """
    pretokenized = spacy.tokens.Doc(nlp.vocab, words=tokens)
    # NOTE(review): calling nlp() on a Doc (rather than a str) presumably
    # requires a spaCy release that accepts Doc input - confirm the version.
    annotated = nlp(pretokenized)

    lemmas = []
    for tok in annotated:
        lemmas.append(tok.lemma_)
    return lemmas

In [None]:
def correct_spelling(text):
    """Run `text` through the module-level ``spell`` corrector.

    Args:
        text: Text to correct.

    Returns:
        Whatever ``spell(text)`` returns (the corrected text).
    """
    return spell(text)

In [None]:
def correct_grammar(text):
    """Return `text` after applying ``TextBlob(text).correct()``.

    NOTE(review): ``TextBlob.correct()`` is documented as a spelling corrector
    and appears to rely on an English model - confirm it is appropriate for
    the Portuguese text handled elsewhere in this notebook.

    Args:
        text: Text to correct.

    Returns:
        str: The corrected text.
    """
    return str(TextBlob(text).correct())

In [None]:
def correct_text(text):
    """Apply `correct_spelling` and then `correct_grammar` to `text`.

    Args:
        text: Text to correct.

    Returns:
        The text after both correction passes, spelling first.
    """
    return correct_grammar(correct_spelling(text))

In [None]:
def preprocess_data(data_path):
    """Load the tweet CSV and return cleaned texts with integer emotion labels.

    Rows containing any missing value are discarded, the ``Texto`` column is
    cleaned with `remove_stopwords`, and the ``Sentimento`` column is encoded
    as feliz=0, nojo=1, triste=2, medo=3, raiva=4.

    Emotion names are matched ignoring case and surrounding whitespace. Rows
    whose emotion is not in the table are dropped with a warning: ``Series.map``
    would otherwise turn them into NaN labels, which a classifier cannot be
    fitted on.

    Args:
        data_path: Path or file-like object accepted by ``pandas.read_csv``.
            The file must provide ``Texto`` and ``Sentimento`` columns.

    Returns:
        tuple[pandas.Series, pandas.Series]: ``(text_processed, label)``; both
        share the index of the rows that were kept, and ``label`` is int64.
    """
    import warnings

    label_codes = {'feliz': 0, 'nojo': 1, 'triste': 2, 'medo': 3, 'raiva': 4}

    # Build a fresh frame rather than mutating in place, so re-running is safe.
    df = pd.read_csv(data_path).dropna().copy()

    # Normalise before mapping so 'Feliz' or ' nojo ' are not lost as NaN.
    normalized = df['Sentimento'].astype(str).str.strip().str.lower()
    df['label'] = normalized.map(label_codes)

    unknown = df['label'].isna()
    if unknown.any():
        valores = sorted(normalized[unknown].unique())
        warnings.warn(
            f"{int(unknown.sum())} linha(s) descartada(s) por sentimento "
            f"desconhecido: {valores}"
        )
        df = df.loc[~unknown].copy()

    # With the NaN rows gone the codes can be stored as real integers.
    df['label'] = df['label'].astype('int64')
    # Clean the text last so no NLP work is spent on rows that were dropped.
    df['text_processed'] = df['Texto'].apply(remove_stopwords)
    return df['text_processed'], df['label']

In [None]:
def train(X, y, test_size=0.2, random_state=42):
    """Fit a bag-of-words Multinomial Naive Bayes classifier and evaluate it.

    The data is split into train and test parts, a ``CountVectorizer`` is
    fitted on the training texts only, and a ``MultinomialNB`` model is
    trained on the resulting counts. The held-out part is then used to show
    a confusion matrix (via `plot_confusion_matrix`) and to print the accuracy.

    Args:
        X: Texts to classify.
        y: Labels aligned with `X`.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed for the split. A fixed default keeps the split, and
            therefore the reported accuracy, reproducible across runs; pass
            ``None`` for a different random split on every call.

    Returns:
        tuple: ``(clf, vectorizer, accuracy)`` - the fitted classifier, the
        fitted vectorizer and the accuracy on the held-out part.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # The vocabulary comes from the training texts only; the test texts are
    # merely transformed with it.
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    clf = MultinomialNB()
    clf.fit(X_train_vec, y_train)

    y_pred = clf.predict(X_test_vec)
    plot_confusion_matrix(y_test, y_pred)  # Show the confusion matrix for the held-out part
    accuracy = accuracy_score(y_test, y_pred)
    print("Acurácia do modelo:", accuracy)
    return clf, vectorizer, accuracy


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_test, y_pred, class_names=('feliz', 'nojo', 'triste', 'medo', 'raiva')):
    """Plot the confusion matrix of integer-encoded emotion predictions.

    `y_test` and `y_pred` hold the integer codes produced by
    `preprocess_data` (feliz=0, nojo=1, triste=2, medo=3, raiva=4), so the
    matrix must be computed over those integer codes. The emotion names are
    used only as tick labels; passing them as ``labels`` makes
    ``confusion_matrix`` fail because none of them occurs in `y_test`.

    Args:
        y_test: True integer labels.
        y_pred: Predicted integer labels.
        class_names: Display name of each class, indexed by its integer code.
            Defaults to the five emotions encoded by `preprocess_data`.
    """
    class_names = list(class_names)
    class_ids = list(range(len(class_names)))

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap=plt.cm.Blues)
    plt.title('Matriz de Confusão')
    plt.show()


In [None]:
def plot_keyword_scores(keywords):
    """Draw a bar chart of keyword scores, ordered by ascending score.

    Args:
        keywords: Iterable of ``(keyword, score)`` pairs. It is not modified;
            a sorted copy is used for plotting. (If the pairs come from YAKE,
            lower scores presumably mean more relevant keywords - confirm.)

    Returns:
        None. When `keywords` is empty nothing is plotted and a short notice
        is printed instead.
    """
    # zip(*[]) cannot be unpacked into two names, so handle "nothing to plot"
    # explicitly instead of crashing.
    if not keywords:
        print("Nenhuma palavra-chave para plotar.")
        return

    # sorted() leaves the caller's list in its original order.
    ranked = sorted(keywords, key=lambda pair: pair[1])
    keywords_list, scores_list = zip(*ranked)

    plt.figure(figsize=(20, 10))
    plt.bar(keywords_list, scores_list)
    plt.xlabel("Palavras-chave")
    plt.ylabel("Scores")
    plt.title("Scores de Palavras-chave")
    plt.xticks(rotation=45)
    plt.show()

In [None]:
def extract_keywords(X, y, label, language='pt', n=1, k=20):
    """Extract YAKE keywords from all texts that carry a given label.

    The entries of `X` whose position in `y` equals `label` are converted to
    strings and joined with single spaces into one document, which is handed
    to a ``yake.KeywordExtractor``.

    Args:
        X: Texts (indexable with a boolean mask, e.g. a pandas Series).
        y: Labels aligned with `X`.
        label: Label value selecting the texts to analyse.
        language: Passed to the extractor as ``lan``.
        n: Passed to the extractor as ``n``.
        k: Passed to the extractor as ``top``.

    Returns:
        list: The extractor's results in ascending sort order.
    """
    texts_for_label = X[y == label].astype(str)
    corpus = ' '.join(texts_for_label)
    extractor = yake.KeywordExtractor(lan=language, n=n, top=k)
    return sorted(extractor.extract_keywords(corpus))

In [None]:
# Build the cleaned texts (X) and integer emotion labels (y) from the tweet CSV,
# then fit the classifier. `train` also shows the confusion matrix and prints
# the accuracy on its held-out split.
X, y = preprocess_data('tweets_ekman.csv')
model, vectorizer, accuracy = train(X, y)

In [None]:
# Extract the keywords for labels 0 and 1 ('feliz' and 'nojo' in the encoding
# used by preprocess_data), then chart each set in the same order.
keywords0 = extract_keywords(X, y, label=0)
keywords1 = extract_keywords(X, y, label=1)

plot_keyword_scores(keywords0)
plot_keyword_scores(keywords1)