In [1]:
import numpy as np 
import pandas as pd 
import re
import os
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

2024-03-31 07:07:44.562640: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-31 07:07:44.562737: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-31 07:07:44.695900: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
!pip install -U transformers

In [2]:
from shutil import copyfile

# Copy the seeding helper into the working directory under the module name
# `seed_setter` so that it can be imported below.
copyfile(src = "/kaggle/input/nitro-nlp-satire/random_seed_setter.py", dst = "/kaggle/working/seed_setter.py")

# Import only the helper this notebook calls; a wildcard import would silently
# pull every public name of the module into the notebook namespace.
from seed_setter import set_random_seeds

In [3]:
# Seed the random number generators before any stochastic step; the helper's
# own output reports seeding NumPy, TensorFlow and PyTorch.
set_random_seeds()

NumPy random seed set with value: 42
TensorFlow random seed set with value: 42
PyTorch random seed set with value: 42


In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/nitro-nlp-satire/train.csv",
        "/kaggle/input/nitro-nlp-satire/test.csv",
    )
)


In [None]:
# Matches HTML tags and HTML entities (named, decimal and hexadecimal).
# Case-insensitive so that upper-case forms such as "&AMP;" or "&#xA0;" are
# stripped too instead of leaving junk tokens behind.
CLEANER = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)
# Matches every character that is neither a word character nor whitespace,
# plus the underscore. Raw string: "\w" and "\s" are invalid escape sequences
# in a plain string literal.
PUNCT_REMOVE = re.compile(r'[^\w\s]|_')
def clean_text(text):
    """Normalise a raw text field before it is fed to the tokenizer.

    Steps, in order: strip HTML tags/entities, drop punctuation and
    underscores, lower-case, trim, collapse runs of spaces, and replace the
    cedilla forms of s/t with the comma-below forms.

    Args:
        text: the value to clean; anything that is not a ``str`` (for example
            a missing value) is treated as empty.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = CLEANER.sub('', text)
    text = PUNCT_REMOVE.sub('', text)
    text = text.lower()
    # Remove leading and trailing whitespaces
    text = text.strip()
    # Remove extra spaces
    text = re.sub(' +', ' ', text)
    # correct the ș/ț diacritics before feeding it to the model
    text = text.replace("ţ", "ț").replace("ş", "ș").replace("Ţ", "Ț").replace("Ş", "Ș")
    return text

In [None]:
#clean text
# Run clean_text over the title and content columns of both frames.
for frame in (train_df, test_df):
    for column in ('title', 'content'):
        frame[column] = frame[column].apply(clean_text)


In [None]:
#concatenate title & content columns
# The model input is the title followed by the content, separated by two spaces.
for frame in (train_df, test_df):
    frame['input'] = frame['title'].astype(str) + '  ' + frame['content'].astype(str)


In [None]:
#get features & labels
# X / y come from the training frame; X_final_test holds the inputs of the test frame.
X, y = train_df['input'], train_df['class']
X_final_test = test_df['input']

In [None]:
#create train, validation & test split
# An explicit random_state makes this cell reproduce the same partition every
# time it is run, instead of depending on how far the global NumPy random
# state has advanced since it was seeded.
SPLIT_SEED = 42
# 20% of the data is held out for validation ...
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED)
# ... and 15% of the remainder becomes the internal test set.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.15, stratify=y_train, random_state=SPLIT_SEED)


In [None]:
# initialize the tokenizer and model
# Both are loaded from the same pretrained checkpoint name.
MODEL_NAME = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)


In [None]:
#function to generate the embeddings for the sentences
# Run the encoder on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
def get_embeddings(sentences, batch_size=64):
    """Encode sentences into fixed-size vectors by mean-pooling the encoder output.

    Padding positions are excluded from the mean, so the embedding of a
    sentence does not depend on which other sentences share its batch.

    Args:
        sentences: iterable of strings (for example a list or a pandas Series).
        batch_size: number of sentences tokenized and encoded per forward pass.

    Returns:
        A CPU tensor with one row per input sentence. For empty input an
        empty tensor with zero rows is returned.
    """
    sentences = list(sentences)
    if not sentences:
        # torch.cat would raise on an empty list; return a zero-row tensor instead.
        return torch.empty((0, getattr(model.config, "hidden_size", 0)))

    # list to store embeddings
    all_embeddings = []
    for start in range(0, len(sentences), batch_size):
        # tokenize the batch of sentences
        inputs = tokenizer(sentences[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        # move inputs to the selected device
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        hidden_states = outputs.last_hidden_state
        # attention_mask is 1 for real tokens and 0 for padding; broadcast it
        # over the hidden dimension so padded positions contribute nothing.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1)
        all_embeddings.append((summed / token_counts).cpu())

    return torch.cat(all_embeddings, dim=0)


In [None]:
#get embeddings for train & save them
# Labels as an integer array, aligned with the rows of embeddings_train.
y_train_numeric = np.array(y_train, dtype=int)
sentences_train = X_train.tolist()
# Encode the training sentences and keep the result as a NumPy array.
embeddings_train = get_embeddings(sentences_train).numpy()
np.save("embeddings_train.npy", embeddings_train)

In [None]:
#train random forest classifier
# 100 trees with a fixed random_state, fitted on the training embeddings.
train_random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
train_random_forest.fit(X=embeddings_train, y=y_train_numeric)

In [None]:
#get embeddings for validation & test set & save them
# Internal test split: labels, embeddings, then persist the embeddings.
sentences_test = X_test.tolist()
y_test_numeric = np.array(y_test, dtype=int)
embeddings_test = get_embeddings(sentences_test).numpy()
np.save("embeddings_test.npy", embeddings_test)

# Validation split: same steps.
sentences_val = X_val.tolist()
y_val_numeric = np.array(y_val, dtype=int)
embeddings_val = get_embeddings(sentences_val).numpy()
np.save("embeddings_val.npy", embeddings_val)

In [None]:
#concatenate validation & test
# Validation rows come first, followed by the test rows, for labels and embeddings alike.
y_concat_test = np.concatenate([y_val_numeric, y_test_numeric], axis=0)
embeddings_concat_test = np.concatenate([embeddings_val, embeddings_test], axis=0)


In [None]:
#save labels
# Persist each label array under its own file name.
for filename, labels in (
    ("y_test_numeric.npy", y_test_numeric),
    ("y_train_numeric.npy", y_train_numeric),
    ("y_val_numeric.npy", y_val_numeric),
    ("y_concat_test.npy", y_concat_test),
):
    np.save(filename, labels)

In [None]:
#make predictions on the validation&test concatenated
y_pred = train_random_forest.predict(embeddings_concat_test)
accuracy = accuracy_score(y_concat_test, y_pred)
precision = precision_score(y_concat_test, y_pred)
recall = recall_score(y_concat_test, y_pred)
# Store the result under its own name: assigning it to
# `balanced_accuracy_score` would replace the imported metric function with a
# float and make every later call to it fail.
balanced_accuracy = balanced_accuracy_score(y_concat_test, y_pred)
print("balanced acc:", balanced_accuracy)
print("acc:", accuracy)
print("precision:", precision)
print("recall", recall)

In [None]:
#save model
from joblib import dump
# Serialize the fitted random forest into the working directory.
model_save_path = '/kaggle/working/random_forest_model.joblib'
dump(value=train_random_forest, filename=model_save_path)

In [None]:
#get the embeddings for the test examples
# Encode the inputs of the test frame with the same embedding function.
sentences_final_test = X_final_test.tolist()
embeddings_final_test = get_embeddings(sentences=sentences_final_test)

In [None]:
#convert embeddings for test examples to np array & save them
embeddings_final_test = embeddings_final_test.numpy()
np.save(file="embeddings_final_test.npy", arr=embeddings_final_test)

In [None]:
#make predictions for test examples
# Random forest predictions for the embedded test examples.
y_pred_final = train_random_forest.predict(X=embeddings_final_test)

In [None]:
#save predictions to csv
# Row ids are the positions 0..n-1 of the predictions.
ids = list(range(len(y_pred_final)))

df_final_predictions = pd.DataFrame({'id': ids, 'class': y_pred_final})
df_final_predictions.to_csv('/kaggle/working/test_predictions.csv', index=False)


In [None]:
#train a logistic regression classifier
classifier = LogisticRegression()
classifier.fit(embeddings_train,y_train_numeric)
predictions = classifier.predict(embeddings_concat_test)
# Score the logistic-regression output (`predictions`); `y_pred` holds the
# random forest predictions from an earlier cell.
accuracy = accuracy_score(y_concat_test, predictions)
precision = precision_score(y_concat_test, predictions)
recall = recall_score(y_concat_test, predictions)
# Keep the result in its own variable so the imported
# `balanced_accuracy_score` function is not overwritten with a float.
balanced_accuracy = balanced_accuracy_score(y_concat_test, predictions)
print("balanced acc:", balanced_accuracy)
print("acc:", accuracy)
print("precision:", precision)
print("recall", recall)

In [None]:
#save model
from joblib import dump
# Serialize the fitted logistic regression into the working directory.
model_save_path = '/kaggle/working/logistic_reg_model.joblib'
dump(value=classifier, filename=model_save_path)

In [None]:
# Logistic-regression predictions for the embedded test examples.
y_pred_final_logistic_reg = classifier.predict(X=embeddings_final_test)

In [None]:
# Write the logistic-regression predictions next to the ids computed earlier.
df_final_predictions_logistic_reg = pd.DataFrame(
    {'id': ids, 'class': y_pred_final_logistic_reg}
)
df_final_predictions_logistic_reg.to_csv(
    '/kaggle/working/test_predictions_logistic_reg.csv',
    index=False,
)

In [None]:
# Metrics for the current `predictions` on the combined validation + test set.
accuracy = accuracy_score(y_concat_test, predictions)
precision = precision_score(y_concat_test, predictions)
recall = recall_score(y_concat_test, predictions)
# Use a separate variable so the imported `balanced_accuracy_score` function
# is not overwritten with a float.
balanced_accuracy = balanced_accuracy_score(y_concat_test, predictions)
print("balanced acc:", balanced_accuracy)
print("acc:", accuracy)
print("precision:", precision)
print("recall", recall)

# The import must start on its own line; fused onto the print statement above
# it is a syntax error.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Standardize the embeddings, then fit a linear-kernel SVM on them.
svm_classifier = make_pipeline(StandardScaler(), SVC(kernel='linear', probability=True))

svm_classifier.fit(embeddings_train,y_train_numeric)

predictions = svm_classifier.predict(embeddings_concat_test)
accuracy = accuracy_score(y_concat_test, predictions)
precision = precision_score(y_concat_test, predictions)
recall = recall_score(y_concat_test, predictions)
balanced_accuracy = balanced_accuracy_score(y_concat_test, predictions)
print("balanced acc:", balanced_accuracy)
print("acc:", accuracy)
print("precision:", precision)
print("recall", recall)


In [None]:
#save model
from joblib import dump
# Serialize the fitted scaler + SVM pipeline into the working directory.
model_save_path = '/kaggle/working/svm_classifier_model.joblib'
dump(value=svm_classifier, filename=model_save_path)

In [None]:
# SVM predictions for the embedded test examples, written next to the ids
# computed earlier.
y_pred_final_svm = svm_classifier.predict(X=embeddings_final_test)
df_final_predictions_svm = pd.DataFrame({'id': ids, 'class': y_pred_final_svm})

df_final_predictions_svm.to_csv(
    '/kaggle/working/test_predictions_svm.csv',
    index=False,
)


In [4]:
# Reload previously computed embeddings from the attached dataset.
# NOTE(review): the training embeddings are read from "embeddings.npy" while
# the cell that saves them writes "embeddings_train.npy" - confirm the file
# in the dataset is the same array.
embeddings_train, embeddings_val, embeddings_test, embeddings_final_test = (
    np.load(array_path)
    for array_path in (
        "/kaggle/input/embeddings-1/embeddings.npy",
        "/kaggle/input/embeddings-1/embeddings_val.npy",
        "/kaggle/input/embeddings-1/embeddings_test.npy",
        "/kaggle/input/embeddings-1/embeddings_final_test.npy",
    )
)
# Matching label arrays for the train / validation / test splits.
y_train_numeric, y_val_numeric, y_test_numeric = (
    np.load(array_path)
    for array_path in (
        "/kaggle/input/embeddings-1/y_train_numeric.npy",
        "/kaggle/input/embeddings-1/y_val_numeric.npy",
        "/kaggle/input/embeddings-1/y_test_numeric.npy",
    )
)

In [5]:
# Stack train, validation and test (in that order) into one training set.
y_concat_train = np.concatenate([y_train_numeric, y_val_numeric, y_test_numeric], axis=0)
embeddings_concat_train = np.concatenate([embeddings_train, embeddings_val, embeddings_test], axis=0)

In [14]:

from tensorflow.keras.layers import Input

# Width of one embedding vector (number of input features).
input_shape = embeddings_train.shape[1]
# Small feed-forward binary classifier on top of the embeddings. The input
# size is declared with an explicit Input layer rather than an `input_shape`
# argument on the first Dense layer, which Keras warns about.
model_nn = Sequential([
    Input(shape=(input_shape,)),
    Dense(256, activation='relu'),
    Dropout(0.25),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train for 10 epochs, reporting loss/accuracy on the validation split after each epoch.
history = model_nn.fit(embeddings_train, y_train_numeric, epochs=10, validation_data=(embeddings_val, y_val_numeric))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m  92/1500[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 2ms/step - accuracy: 0.7807 - loss: 0.4537

I0000 00:00:1711855775.217087      86 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.9037 - loss: 0.2388 - val_accuracy: 0.9332 - val_loss: 0.1667
Epoch 2/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9390 - loss: 0.1652 - val_accuracy: 0.9386 - val_loss: 0.1546
Epoch 3/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9447 - loss: 0.1532 - val_accuracy: 0.9440 - val_loss: 0.1478
Epoch 4/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9484 - loss: 0.1462 - val_accuracy: 0.9506 - val_loss: 0.1276
Epoch 5/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9508 - loss: 0.1365 - val_accuracy: 0.9545 - val_loss: 0.1249
Epoch 6/10
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9526 - loss: 0.1348 - val_accuracy: 0.9533 - val_loss: 0.1245
Epoch 7/10
[1m1500/1500[

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 47991
'y' sizes: 8469


In [16]:
# Loss and accuracy of model_nn on the internal test split.
loss, accuracy = model_nn.evaluate(x=embeddings_test, y=y_test_numeric)
print(f"Test Accuracy: {accuracy:.4f}")


[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9563 - loss: 0.1185
Test Accuracy: 0.9574


In [17]:
# Convert probabilities to class labels
threshold = 0.5
predicted_probabilities = model_nn.predict(embeddings_final_test)
# Probabilities strictly above the threshold become 1, everything else 0.
predicted_labels = np.greater(predicted_probabilities, threshold).astype(int)

[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


In [27]:
# Flatten the labels to one dimension before building the frame.
predicted_labels = predicted_labels.flatten()

# Row ids are the positions 0..n-1 of the predictions.
ids = list(range(len(predicted_labels)))
df_final_predictions_nn = pd.DataFrame({'id': ids, 'class': predicted_labels})

df_final_predictions_nn.to_csv(
    '/kaggle/working/test_predictions_nn.csv',
    index=False,
)

In [6]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input

# Width of one embedding vector (number of input features).
input_shape = embeddings_train.shape[1]

# Widths of the hidden Dense layers, in order. Each one is followed by batch
# normalization and dropout, so the stack is generated rather than spelled out.
nn2_hidden_units = [1024, 1024, 512, 512, 256, 256, 128, 128, 64, 32]
# Declare the input size with an explicit Input layer rather than an
# `input_shape` argument on the first Dense layer, which Keras warns about.
nn2_layers = [Input(shape=(input_shape,))]
for units in nn2_hidden_units:
    nn2_layers.extend([
        Dense(units, activation='relu'),
        BatchNormalization(),
        Dropout(0.4),
    ])
nn2_layers.append(Dense(1, activation='sigmoid'))

model_nn_2 = Sequential(nn2_layers)
model_nn_2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
checkpoint_path = '/kaggle/working/nn_model_13.keras'
# NOTE: no validation data is passed to fit(), so the monitored 'accuracy' is
# the training accuracy; the checkpoint keeps the epoch that fits the training
# data best.
checkpoint = ModelCheckpoint(checkpoint_path, monitor='accuracy', verbose=1, save_best_only=True, mode='max')
history = model_nn_2.fit(embeddings_concat_train, y_concat_train, epochs=45, callbacks=[checkpoint])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m  35/2206[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m10s[0m 5ms/step - accuracy: 0.4820 - loss: 1.2680   

I0000 00:00:1711868950.659405      81 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5716 - loss: 0.8204
Epoch 1: accuracy improved from -inf to 0.61006, saving model to /kaggle/working/nn_model_14.keras
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 12ms/step - accuracy: 0.5716 - loss: 0.8204
Epoch 2/10
[1m2202/2206[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.7058 - loss: 0.5749
Epoch 2: accuracy improved from 0.61006 to 0.79036, saving model to /kaggle/working/nn_model_14.keras
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.7060 - loss: 0.5746
Epoch 3/10
[1m2197/2206[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.8959 - loss: 0.2894
Epoch 3: accuracy improved from 0.79036 to 0.89940, saving model to /kaggle/working/nn_model_14.keras
[1m2206/2206[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.8959 - loss: 0.2893
Epoc

In [7]:
from tensorflow.keras.models import load_model

# Load the file written by the ModelCheckpoint callback. Reusing
# `checkpoint_path` keeps this cell in sync with the training cell; a separate
# hard-coded file name ('/kaggle/working/nn_model_14.keras') points at a file
# that is never written when the callback saves under a different name.
destination_path = checkpoint_path
best_model = load_model(destination_path)


In [7]:
# NOTE(review): embeddings_test is one of the arrays concatenated into
# embeddings_concat_train; if best_model was fitted on that array, this score
# is measured on data seen during training and is not a held-out estimate.
loss, accuracy = best_model.evaluate(x=embeddings_test, y=y_test_numeric)
print(f"Test Accuracy: {accuracy:.4f}")


[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9718 - loss: 0.1121
Test Accuracy: 0.9730


In [8]:
# Threshold the predicted probabilities into 0/1 labels and flatten them.
threshold = 0.5
predicted_probabilities = best_model.predict(embeddings_final_test)
predicted_labels = np.greater(predicted_probabilities, threshold).astype(int).flatten()

# Row ids are the positions 0..n-1 of the predictions.
ids = list(range(len(predicted_labels)))
df_final_predictions_nn = pd.DataFrame({'id': ids, 'class': predicted_labels})

df_final_predictions_nn.to_csv(
    '/kaggle/working/test_predictions_nn_14.csv',
    index=False,
)

[1m1146/1146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


In [3]:
# Read the three prediction files that will be combined by majority vote.
pred1, pred2, pred3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/predictions/prediction.csv",
        "/kaggle/input/predictions/test_predictions_logistic_reg.csv",
        "/kaggle/input/predictions/test_predictions_nn_7.csv",
    )
)

In [4]:
# Pull the 'class' column of each prediction frame out as a NumPy array.
predictions1, predictions2, predictions3 = (
    np.array(frame['class']) for frame in (pred1, pred2, pred3)
)

In [6]:
# One row per example, one column per model.
combined_predictions = np.stack([predictions1, predictions2, predictions3], axis=1)
# A row is labelled 1 when the mean of its three votes exceeds 0.5.
majority_vote = (combined_predictions.mean(axis=1) > 0.5).astype(int)

0


In [7]:
# Row ids are the positions 0..n-1 of the voted labels.
ids = list(range(len(majority_vote)))
df_final_predictions_majority = pd.DataFrame({'id': ids, 'class': majority_vote})
df_final_predictions_majority.to_csv(
    '/kaggle/working/majority.csv',
    index=False,
)