!pip install yake
!pip install matplotlib
!pip install spacy
!pip install nltk
!python -m spacy download pt_core_news_sm

In [1]:
#pip install yake
#pip install matplotlib
#pip install spacy
#python -m spacy download en_core_web_sm
#pip install spacy textblob autocorrect


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import yake
import re
import tensorflow as tf
import matplotlib.pyplot as plt
import spacy
from collections import Counter
from textblob import TextBlob
from autocorrect import Speller
# Shared spaCy pipeline, loaded once and reused by the text helpers below.
nlp = spacy.load('en_core_web_sm')
# Shared autocorrect Speller built with its default arguments.
spell = Speller()

ModuleNotFoundError: No module named 'pandas'

In [None]:
def remove_stopwords(text):
    """Return `text` reduced to its lower-cased content tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is discarded when spaCy flags it as a stop word or as punctuation, or when
    it is made up solely of characters from the noise set in the pattern
    below (presumably encoding residue -- verify against the dataset).

    Returns the remaining tokens, lower-cased and joined by single spaces.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        if re.match(r'^[@¦\x9f\x92â\x80]+$', tok.text):
            continue
        kept.append(tok.text.lower())
    return ' '.join(kept)

In [None]:
def tokenizar(text):
    """Tokenize `text` with the module-level spaCy pipeline ``nlp``.

    Returns a list holding the ``text`` of every token, in document order.
    """
    pieces = []
    for piece in nlp(text):
        pieces.append(piece.text)
    return pieces

In [None]:
def lemmatizar_tokens(tokens):
    """Return the lemma of each entry in `tokens`.

    A spaCy ``Doc`` is built directly from the given words, using the
    vocabulary of the module-level pipeline ``nlp``, and that ``Doc`` is then
    passed through ``nlp``. The ``lemma_`` of every resulting token is
    collected into a list.
    """
    pretokenized = spacy.tokens.Doc(nlp.vocab, words=tokens)
    processed = nlp(pretokenized)
    return [tok.lemma_ for tok in processed]

In [None]:
def correct_spelling(text):
    """Return `text` after passing it through the module-level ``spell`` object."""
    return spell(text)

In [None]:
def correct_grammar(text):
    """Return the string form of ``TextBlob(text).correct()``.

    NOTE(review): TextBlob's ``correct()`` appears to be a spelling corrector
    rather than a grammar checker -- confirm the function name matches the
    intent.
    """
    return str(TextBlob(text).correct())

In [None]:
def correct_text(text):
    """Apply ``correct_spelling`` and then ``correct_grammar`` to `text`.

    Returns the result of the second correction step.
    """
    spell_checked = correct_spelling(text)
    return correct_grammar(spell_checked)

In [None]:
# Check GPU availability.
# tf.test.is_gpu_available() is deprecated; TensorFlow's documentation points
# to tf.config.list_physical_devices('GPU') as the replacement. A non-empty
# list means at least one GPU is visible to TensorFlow.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU Available:", bool(gpu_devices))

# Check TPU availability: true when any logical device reports type 'TPU'.
devices = tf.config.list_logical_devices()
tpu_available = any(device.device_type == 'TPU' for device in devices)

print("TPU Available:", tpu_available)

In [None]:
def preprocess_data(data_path, n_rows=10):
    """Load a CSV of labelled tweets and return cleaned text plus labels.

    Args:
        data_path: Path of the CSV file; it must provide the columns
            ``'tweet'`` and ``'label'``.
        n_rows: How many leading rows of the file to keep before cleaning.
            Defaults to 10, the sample size previously hard-coded here. Pass
            ``None`` to process the whole file.

    Returns:
        A ``(text_final, label)`` pair of pandas Series. ``text_final`` holds,
        for every kept row, the tweet after ``correct_text``,
        ``remove_stopwords``, ``tokenizar`` and ``lemmatizar_tokens`` have
        been applied in that order and the lemmas re-joined with spaces.
    """
    df = pd.read_csv(data_path)
    if n_rows is not None:
        df = df.head(n_rows)
    # Work on an independent frame with incomplete rows removed, so the column
    # assignments below never write into a slice of the frame that was read
    # (which can trigger pandas' SettingWithCopyWarning).
    df = df.dropna().copy()
    df['text_correct'] = df['tweet'].apply(correct_text)
    df['text_processed'] = df['text_correct'].apply(remove_stopwords)
    df['text_tokens'] = df['text_processed'].apply(tokenizar)
    df['text_lemmatized'] = df['text_tokens'].apply(lemmatizar_tokens)
    df['text_final'] = df['text_lemmatized'].apply(' '.join)
    return df['text_final'], df['label']

In [None]:
def train(X, y, test_size=0.2, random_state=None):
    """Fit a bag-of-words Multinomial Naive Bayes classifier and evaluate it.

    Args:
        X: Collection of text documents.
        y: Labels aligned with `X`.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2, the
            value previously hard-coded here.
        random_state: Forwarded to ``train_test_split``. The default ``None``
            keeps the original unseeded split; pass an integer to make the
            split (and therefore the reported accuracy) reproducible.

    Returns:
        A ``(clf, vectorizer, accuracy)`` tuple: the fitted ``MultinomialNB``,
        the fitted ``CountVectorizer`` and the accuracy on the held-out split.

    Side effects:
        Calls ``plot`` with the held-out labels and predictions, and prints
        the accuracy.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # The vocabulary is learned from the training split only; the held-out
    # split is merely transformed with it.
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    clf = MultinomialNB()
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    plot(y_test, y_pred)
    print("Acurácia do modelo:", accuracy)
    return clf, vectorizer, accuracy

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import metrics
def plot(y_test, y_pred):
    """Display the confusion matrix of `y_pred` against `y_test`.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with `y_test`.

    The matrix is computed with ``confusion_matrix`` and drawn through
    ``ConfusionMatrixDisplay``, then shown with ``plt.show()``.
    """
    cm = confusion_matrix(y_test, y_pred)
    # The False/True tick labels only fit a 2x2 matrix. When the inputs hold a
    # different number of classes (e.g. a tiny test split containing a single
    # class) fall back to the display's default labels, because a label count
    # that differs from the matrix size makes the plot call fail.
    display_labels = [False, True] if cm.shape == (2, 2) else None
    cm_display = metrics.ConfusionMatrixDisplay(
        confusion_matrix=cm, display_labels=display_labels
    )
    cm_display.plot()
    plt.show()

In [None]:
# Build the cleaned text and label series from the CSV file, then fit and
# evaluate the classifier on them.
X, y = preprocess_data('Twitter Sentiments.csv')
train(X, y)

In [None]:
def plot_keyword_scores(keywords):
    """Draw a bar chart of keyword scores, ordered by ascending score.

    Args:
        keywords: List of entries whose first item is the keyword and whose
            second item is its score. The list is not modified.

    Returns:
        None. Nothing is drawn when `keywords` is empty.
    """
    if not keywords:
        # An empty list cannot be unpacked into keywords and scores, and
        # there is nothing to draw anyway.
        return
    # sorted() builds a new list, so the caller's ordering is left untouched.
    ranked = sorted(keywords, key=lambda entry: entry[1])
    keywords_list, scores_list = zip(*ranked)
    plt.figure(figsize=(20, 10))
    plt.bar(keywords_list, scores_list)
    plt.xlabel("Keywords")
    plt.ylabel("Scores")
    plt.title("Keyword Scores")
    plt.show()

In [None]:
def extract_keywords(X, y, label, language='pt', n=1, k=20):
    """Extract YAKE keywords from the documents in `X` whose label is `label`.

    Args:
        X: Documents, indexable with the boolean mask ``y == label``.
        y: Labels aligned with `X`.
        label: Label value selecting which documents are used.
        language: Passed to ``yake.KeywordExtractor`` as ``lan``.
        n: Passed to ``yake.KeywordExtractor`` as ``n``.
        k: Passed to ``yake.KeywordExtractor`` as ``top``.

    Returns:
        The extractor's results for the selected documents (joined into one
        space-separated string), sorted with the default ordering of the
        entries.
    """
    selected = X[y == label].astype(str)
    corpus = ' '.join(selected)
    extractor = yake.KeywordExtractor(lan=language, n=n, top=k)
    return sorted(extractor.extract_keywords(corpus))

In [None]:
# Keywords extracted from the documents whose label equals 0.
keywords0 = extract_keywords(X, y, 0)


In [None]:
# Bar chart of the label-0 keyword scores.
plot_keyword_scores(keywords0)


In [None]:
# Keywords extracted from the documents whose label equals 1.
keywords1 = extract_keywords(X, y, 1)


In [None]:
# Bar chart of the label-1 keyword scores.
plot_keyword_scores(keywords1)