<a href="https://colab.research.google.com/github/thavyne-KDR/Detec-o_de_Ironia_em_Textos/blob/main/Irony_Classification_TF-IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q scikit-learn pandas numpy

from google.colab import files
uploaded = files.upload()

import zipfile, os

# The Colab upload dialog returns a mapping keyed by file name; the first key
# is taken to be the dataset archive.
zip_path = next(iter(uploaded))
extract_path = "dados_ironia"

# Unpack every member of the archive under `extract_path`.
archive = zipfile.ZipFile(zip_path, mode='r')
with archive:
    archive.extractall(path=extract_path)

os.listdir(extract_path)

import pandas as pd

# Read the three files from the extracted folder: labelled training data
# (JSON lines), the test set, and the sample submission.
train = pd.read_json(f"{extract_path}/train.jsonl", lines=True)
test = pd.read_csv(f"{extract_path}/test.csv")
sample = pd.read_csv(f"{extract_path}/sample_submission.csv")

# Report shape and column names for the train and test frames.
for rotulo, tabela in (("Train:", train), ("Test :", test)):
    print(rotulo, tabela.shape, tabela.columns.tolist())
print("Sample submission:", sample.columns.tolist())

# Preview the first three rows of each frame.
for tabela in (train, test, sample):
    display(tabela.head(3))

from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation.
# The split is stratified on the label so both partitions keep the same class
# proportions; every model below is scored with balanced accuracy, which is
# sensitive to how many examples of each class land in the validation set.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    train['text'],
    train['label'],
    test_size=0.2,
    random_state=42,
    stratify=train['label'],
)

print("Train:", X_train_text.shape, y_train.shape)
print("Validation:", X_val_text.shape, y_val.shape)

from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words TF-IDF features capped at 5000 terms. The vocabulary is learned
# on the training split only and then reused to project the validation split.
tfidf = TfidfVectorizer(max_features=5000)
X_train, X_val = tfidf.fit_transform(X_train_text), tfidf.transform(X_val_text)

for rotulo, matriz in (("X_train shape:", X_train), ("X_val shape:", X_val)):
    print(rotulo, matriz.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import balanced_accuracy_score

def _avaliar_modelos(candidatos, X_tr, y_tr, X_va, y_va):
    """Fit each candidate and rank them by validation balanced accuracy.

    Args:
        candidatos: mapping of display name -> unfitted scikit-learn estimator.
            The estimators are fitted in place.
        X_tr, y_tr: training features and labels.
        X_va, y_va: validation features and labels.

    Returns:
        A list of ``(name, balanced_accuracy)`` tuples sorted best-first.
    """
    ranking = []
    for nome, mdl in candidatos.items():
        mdl.fit(X_tr, y_tr)
        bal_acc = balanced_accuracy_score(y_va, mdl.predict(X_va))
        ranking.append((nome, bal_acc))
        print(f"{nome}: Balanced Accuracy = {bal_acc:.4f}")
    ranking.sort(key=lambda par: par[1], reverse=True)
    return ranking


# Linear / probabilistic baselines on the TF-IDF features.
modelos = {
    "LogisticRegression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "LinearSVC": LinearSVC(class_weight="balanced"),
    "MultinomialNB": MultinomialNB()
}

resultados = _avaliar_modelos(modelos, X_train, y_train, X_val, y_val)
melhor_nome, melhor_score = resultados[0]
print("\nMelhor modelo:", melhor_nome, "| Balanced Accuracy:", round(melhor_score, 4))

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# Add tree ensembles to the same dictionary and rank all models together
# (the baselines are refitted so the second report lists every model).
modelos["RandomForestClassifier"] = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
modelos["GradientBoostingClassifier"] = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
modelos["AdaBoostClassifier"] = AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)

resultados = _avaliar_modelos(modelos, X_train, y_train, X_val, y_val)
melhor_nome, melhor_score = resultados[0]
print("\nMelhor modelo:", melhor_nome, "| Balanced Accuracy:", round(melhor_score, 4))


from sklearn.model_selection import GridSearchCV

# Search spaces for the two linear models.
param_grid_svc = dict(
    C=[0.1, 1, 10, 100],
    loss=['hinge', 'squared_hinge'],
    penalty=['l2'],
)

param_grid_lr = dict(
    C=[0.1, 1, 10, 100],
    penalty=['l2'],
    solver=['liblinear', 'saga'],
)

# --- LinearSVC: 3-fold grid search scored by balanced accuracy ---
grid_search_svc = GridSearchCV(
    estimator=LinearSVC(class_weight="balanced"),
    param_grid=param_grid_svc,
    scoring='balanced_accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_svc.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out validation split.
best_svc_model = grid_search_svc.best_estimator_
svc_tuned_pred = best_svc_model.predict(X_val)
svc_tuned_bal_acc = balanced_accuracy_score(y_val, svc_tuned_pred)

print(f"Tuned LinearSVC: Best Parameters = {grid_search_svc.best_params_}")
print(f"Tuned LinearSVC: Balanced Accuracy = {svc_tuned_bal_acc:.4f}")

# --- LogisticRegression: same protocol ---
grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, class_weight="balanced"),
    param_grid=param_grid_lr,
    scoring='balanced_accuracy',
    cv=3,
    n_jobs=-1,
)
grid_search_lr.fit(X_train, y_train)

best_lr_model = grid_search_lr.best_estimator_
lr_tuned_pred = best_lr_model.predict(X_val)
lr_tuned_bal_acc = balanced_accuracy_score(y_val, lr_tuned_pred)

print(f"Tuned LogisticRegression: Best Parameters = {grid_search_lr.best_params_}")
print(f"Tuned LogisticRegression: Balanced Accuracy = {lr_tuned_bal_acc:.4f}")

!pip install -q transformers sentence-transformers

from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to encode each text into a dense vector.
model_name = 'paraphrase-MiniLM-L3-v2'
sbert_model = SentenceTransformer(model_name)

# The encoder is given plain Python lists, so the Series are converted first.
print("Gerando embeddings para os dados de treinamento...")
textos_treino = X_train_text.tolist()
X_train_embeddings = sbert_model.encode(textos_treino, show_progress_bar=True)

print("Gerando embeddings para os dados de validação...")
textos_validacao = X_val_text.tolist()
X_val_embeddings = sbert_model.encode(textos_validacao, show_progress_bar=True)

print("\nDimensões dos embeddings de treinamento:", X_train_embeddings.shape)
print("Dimensões dos embeddings de validação:", X_val_embeddings.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import balanced_accuracy_score

# Same model families as before, now trained on the dense sentence embeddings.
modelos_embeddings = {
    "LogisticRegression_Embeddings": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "LinearSVC_Embeddings": LinearSVC(class_weight="balanced"),
    "RandomForestClassifier_Embeddings": RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced"),
    "GradientBoostingClassifier_Embeddings": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42),
    "AdaBoostClassifier_Embeddings": AdaBoostClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
}

# Fit each model, score it on the validation embeddings, and collect the scores.
resultados_embeddings = []
for nome, mdl in modelos_embeddings.items():
    print(f"Treinando {nome}...")
    pred = mdl.fit(X_train_embeddings, y_train).predict(X_val_embeddings)
    bal_acc = balanced_accuracy_score(y_val, pred)
    resultados_embeddings.append((nome, bal_acc))
    print(f"{nome}: Balanced Accuracy = {bal_acc:.4f}\n")

# Rank best-first and report the winner.
resultados_embeddings = sorted(resultados_embeddings, key=lambda item: item[1], reverse=True)
melhor_nome_embeddings, melhor_score_embeddings = resultados_embeddings[0]
print("\nMelhor modelo com Embeddings:", melhor_nome_embeddings, "| Balanced Accuracy:", round(melhor_score_embeddings, 4))

import string

def count_punctuation(text):
    """Return how many characters of *text* are ASCII punctuation.

    Punctuation is defined by ``string.punctuation``. Values that are not
    strings (for example ``None`` or a NaN cell coming from pandas) count as
    having no punctuation instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return 0
    # Generator expression: no throwaway list is built just to be summed.
    return sum(1 for char in text if char in string.punctuation)

# Add two hand-crafted numeric features to both frames: the character length
# of the text and its punctuation count.
for tabela in (train, test):
    tabela['text_length'] = tabela['text'].map(len)
    tabela['punctuation_count'] = tabela['text'].map(count_punctuation)

for tabela in (train, test):
    display(tabela.head())

from scipy.sparse import hstack

# Select the numeric features for the rows of each split; the index of the
# text Series identifies which rows of `train` went to each side.
colunas_numericas = ['text_length', 'punctuation_count']
X_train_numerical = train.loc[X_train_text.index, colunas_numericas]
X_val_numerical = train.loc[X_val_text.index, colunas_numericas]
X_test_numerical = test[colunas_numericas]

# `X_test` was never built before being stacked below (NameError). Project the
# test texts with the same fitted vectorizer that produced X_train and X_val.
X_test = tfidf.transform(test['text'])

# Append the two numeric columns to the right of the sparse TF-IDF matrices.
X_train_combined = hstack([X_train, X_train_numerical.values])
X_val_combined = hstack([X_val, X_val_numerical.values])
X_test_combined = hstack([X_test, X_test_numerical.values])

print("Dimensões do conjunto de treinamento combinado:", X_train_combined.shape)
print("Dimensões do conjunto de validação combinado:", X_val_combined.shape)
print("Dimensões do conjunto de teste combinado:", X_test_combined.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import balanced_accuracy_score

# Linear models trained on TF-IDF plus the two numeric features.
modelos_combinados = {
    "LogisticRegression_Combined": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "LinearSVC_Combined": LinearSVC(class_weight="balanced")
}

# Fit, predict on the validation split, and record the balanced accuracy.
resultados_combinados = []
for nome, mdl in modelos_combinados.items():
    print(f"Treinando {nome} com características combinadas...")
    mdl.fit(X_train_combined, y_train)
    pred = mdl.predict(X_val_combined)
    bal_acc = balanced_accuracy_score(y_val, pred)
    resultados_combinados += [(nome, bal_acc)]
    print(f"{nome}: Balanced Accuracy = {bal_acc:.4f}\n")

# Highest score first.
resultados_combinados = sorted(resultados_combinados, key=lambda item: item[1], reverse=True)
melhor_nome_combinado, melhor_score_combinado = resultados_combinados[0]
print("\nMelhor modelo com características combinadas:", melhor_nome_combinado, "| Balanced Accuracy:", round(melhor_score_combinado, 4))

from sklearn.model_selection import StratifiedKFold, cross_val_score

from sklearn.pipeline import make_pipeline

# Model whose generalisation is estimated with cross-validation.
best_model_for_cv = best_lr_model
y_full = train["label"]

# Full-data TF-IDF matrix, kept under its original name for later cells.
# A dedicated vectorizer is used so that `tfidf` stays the one fitted on the
# training split, i.e. the one that produced X_train / X_val.
X_full_tfidf = TfidfVectorizer(max_features=5000).fit_transform(train["text"])

# Cross-validate vectorizer + classifier together on the raw text, so the
# vocabulary and IDF weights are learned only from each fold's training part
# instead of from the whole dataset (which would leak held-out text).
cv_pipeline = make_pipeline(TfidfVectorizer(max_features=5000), best_model_for_cv)

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

print(f"Realizando validação cruzada ({n_splits} folds) para {type(best_model_for_cv).__name__}...")
cv_scores = cross_val_score(cv_pipeline, train["text"], y_full, cv=skf, scoring='balanced_accuracy', n_jobs=-1)

print(f"\nAcurácia Balanceada para cada fold: {cv_scores}")
print(f"Média da Acurácia Balanceada na validação cruzada: {cv_scores.mean():.4f}")
print(f"Desvio padrão da Acurácia Balanceada na validação cruzada: {cv_scores.std():.4f}")

from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble of the two tuned linear models.
# NOTE(review): with two equally weighted voters a disagreement is a tie; check
# the VotingClassifier documentation for how ties are resolved.
voting_clf = VotingClassifier(
    estimators=[('lr', best_lr_model), ('svc', best_svc_model)],
    voting='hard',
    weights=[1, 1],
    n_jobs=-1
)

print("Treinando o Voting Classifier...")
# Fit on the training split only. Fitting on the full-data matrix would
# (a) include the validation rows that are scored right below, and
# (b) use features from a different vectorizer fit than the one behind X_val,
#     so the columns would not line up.
voting_clf.fit(X_train, y_train)

print("Avaliando o Voting Classifier no conjunto de validação...")
voting_pred = voting_clf.predict(X_val)
voting_bal_acc = balanced_accuracy_score(y_val, voting_pred)

print(f"\nVoting Classifier (LR + SVC): Balanced Accuracy = {voting_bal_acc:.4f}")

!pip install -q tensorflow

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Word-level tokenizer limited to `num_words` entries, with an explicit
# out-of-vocabulary token.
num_words = 10000
tokenizer = Tokenizer(num_words=num_words, oov_token="<OOV>")

print("Ajustando o tokenizador aos dados de treinamento...")
tokenizer.fit_on_texts(X_train_text)

print("Convertendo textos em sequências...")
train_sequences = tokenizer.texts_to_sequences(X_train_text)
val_sequences = tokenizer.texts_to_sequences(X_val_text)

# Sequence length = mean + 2 standard deviations of the whitespace-token
# counts of the training texts, truncated to an integer.
train_text_lengths = [len(frase.split()) for frase in X_train_text]
media_palavras = np.mean(train_text_lengths)
desvio_palavras = np.std(train_text_lengths)
max_len = int(media_palavras + 2 * desvio_palavras)
print(f"Comprimento máximo definido para as sequências: {max_len}")

print("Preenchendo as sequências...")
# Pad and truncate at the end of each sequence.
opcoes_padding = dict(maxlen=max_len, padding='post', truncating='post')
X_train_padded = pad_sequences(train_sequences, **opcoes_padding)
X_val_padded = pad_sequences(val_sequences, **opcoes_padding)

print("\nDimensões das sequências de treinamento preenchidas:", X_train_padded.shape)
print("Dimensões das sequências de validação preenchidas:", X_val_padded.shape)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Embedding table size: one row per word known to the tokenizer, plus one.
vocab_size = 1 + len(tokenizer.word_index)
print(f"Tamanho do vocabulário: {vocab_size}")

embedding_dim = 100

print(f"Dimensão dos embeddings: {embedding_dim}")
print(f"Comprimento máximo da sequência: {max_len}")

# 1-D CNN text classifier: embedding -> two convolutions -> global max pooling
# -> two dense layers with dropout -> single sigmoid output.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.summary()

def balanced_accuracy(y_true, y_pred):
    """Balanced accuracy of thresholded predictions, for use as a Keras metric.

    Predictions are binarised at 0.5; the result is the mean of sensitivity
    (recall of the positive class) and specificity (recall of the negative
    class). A small epsilon in each denominator avoids division by zero when
    a class is absent from the tensors passed in.

    Args:
        y_true: tensor of 0/1 labels.
        y_pred: tensor of predicted scores, compared against 0.5.

    Returns:
        A scalar float32 tensor.
    """
    eps = tf.keras.backend.epsilon()

    actual_pos = tf.cast(y_true, tf.float32)
    predicted_pos = tf.cast(y_pred > 0.5, tf.float32)
    actual_neg = 1 - actual_pos
    predicted_neg = 1 - predicted_pos

    # Confusion-matrix cells.
    tp = tf.reduce_sum(actual_pos * predicted_pos)
    tn = tf.reduce_sum(actual_neg * predicted_neg)
    fp = tf.reduce_sum(actual_neg * predicted_pos)
    fn = tf.reduce_sum(actual_pos * predicted_neg)

    recall_pos = tp / (tp + fn + eps)
    recall_neg = tn / (tn + fp + eps)

    return (recall_pos + recall_neg) / 2

# Binary cross-entropy loss, Adam optimizer, and the custom balanced-accuracy
# function as the tracked metric.
otimizador = Adam(learning_rate=0.0005)
model.compile(
    optimizer=otimizador,
    loss='binary_crossentropy',
    metrics=[balanced_accuracy],
)

print("Modelo compilado com sucesso.")

epochs, batch_size = 20, 32

# Stop when validation balanced accuracy has not improved for 5 epochs and
# roll back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_balanced_accuracy',
    mode='max',
    patience=5,
    restore_best_weights=True,
)

print(f"Iniciando o treinamento por até {epochs} épocas com batch size {batch_size} e Early Stopping...")

opcoes_treino = dict(
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_val_padded, y_val),
    callbacks=[early_stopping],
    verbose=1,
)
history = model.fit(X_train_padded, y_train, **opcoes_treino)

print("\nTreinamento concluído.")

print("Avaliando o modelo de Deep Learning no conjunto de validação...")
# `evaluate` still provides the validation loss. Its metric value is the
# compiled `balanced_accuracy` function evaluated batch by batch and averaged,
# which is not the balanced accuracy of the whole validation set.
loss, metrica_por_lote = model.evaluate(X_val_padded, y_val, verbose=0)

# Score the full set of validation predictions with the same scikit-learn
# function used for every other model, so the numbers are comparable.
val_proba_dl = model.predict(X_val_padded, verbose=0)
val_pred_dl = (val_proba_dl.ravel() > 0.5).astype(int)
bal_acc_dl = balanced_accuracy_score(y_val, val_pred_dl)

print(f"\nDeep Learning Model: Balanced Accuracy = {bal_acc_dl:.4f}")

from google.colab import files

print("Preparando os dados combinando headline e text...")

# Fill missing values BEFORE concatenating: NaN + str is NaN, so filling only
# afterwards would turn a row with a missing headline into an empty input and
# throw its text away.
train['input_text'] = train['headline'].fillna('') + ' [SEP] ' + train['text'].fillna('')

# Build the test input the same way when the test set has a headline column;
# otherwise fall back to the text alone.
# NOTE(review): in the fallback case the model is trained on
# "headline [SEP] text" but predicts on bare text — confirm this is intended.
if 'headline' in test.columns:
    test['input_text'] = test['headline'].fillna('') + ' [SEP] ' + test['text'].fillna('')
else:
    test['input_text'] = test['text'].fillna('')

y_full = train['label']

print("Dados preparados com sucesso!")
print("\nExemplo do input de treino combinado:")
print(train['input_text'].iloc[0])

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("Treinando o Modelo 1: Regressão Logística...")

# Vectorizer fitted on the combined training inputs and reused for the test set.
tfidf = TfidfVectorizer(max_features=5000)
X_full_tfidf = tfidf.fit_transform(train['input_text'])
X_test_tfidf = tfidf.transform(test['input_text'])

# Hyper-parameters are hard-coded here (presumably the values found by the
# earlier grid search — verify against its printed best parameters).
best_lr_model = LogisticRegression(
    C=100,
    penalty='l2',
    solver='liblinear',
    class_weight="balanced",
    max_iter=1000,
)
best_lr_model.fit(X_full_tfidf, y_full)

# Keep the second probability column (presumably the positive class).
probabilidades = best_lr_model.predict_proba(X_test_tfidf)
preds_lr_proba = probabilidades[:, 1]

print("Modelo 1 treinado e previsões de probabilidade geradas!")
print("Shape das previsões:", preds_lr_proba.shape)

# Threshold at 0.5, write the submission file, and trigger the Colab download.
preds_lr_class = (preds_lr_proba > 0.5).astype(int)
arquivo_saida = 'submission_logistic_regression_only.csv'
submission_lr = pd.DataFrame({'id': test['id'], 'label': preds_lr_class})
submission_lr.to_csv(arquivo_saida, index=False)
files.download(arquivo_saida)


# Reload the raw files so this cell does not depend on columns added earlier.
extract_path = "dados_ironia"
train = pd.read_json(f"{extract_path}/train.jsonl", lines=True)
test = pd.read_csv(f"{extract_path}/test.csv")

# TF-IDF over the complete training text, paired with the training labels.
tfidf_full = TfidfVectorizer(max_features=5000)
X_full_tfidf = tfidf_full.fit_transform(train['text'])
y_full = train['label']

best_lr_model_full_train = LogisticRegression(
    C=100,
    penalty='l2',
    solver='liblinear',
    class_weight="balanced",
    max_iter=1000,
)

print("Treinando o melhor modelo (Logistic Regression com TF-IDF sintonizado) em todo o conjunto de dados de treinamento...")
best_lr_model_full_train.fit(X_full_tfidf, y_full)

print("Treinamento concluído.")

# Missing test texts are treated as empty strings before vectorizing.
textos_teste = test['text'].fillna('')
X_test_tfidf = tfidf_full.transform(textos_teste)

preds = best_lr_model_full_train.predict(X_test_tfidf)

print("Previsões geradas para o conjunto de teste.")

# Two-column submission: test id and predicted integer label.
submission = pd.DataFrame({'id': test['id'], 'label': preds.astype(int)})

submission.to_csv("submission.csv", index=False)

display(submission.head())