In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/glove6b100dtxt/glove.6B.100d.txt


In [2]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

In [3]:
train = train.drop(["id", "keyword", "location"], axis=1)
test = test.drop(["keyword", "location"], axis=1)

In [4]:
balance_counts = train.groupby("target").agg("count")
balance_counts

Unnamed: 0_level_0,text
target,Unnamed: 1_level_1
0,4342
1,3271


In [5]:
train.head(), test.head()

(                                                text  target
 0  Our Deeds are the Reason of this #earthquake M...       1
 1             Forest fire near La Ronge Sask. Canada       1
 2  All residents asked to 'shelter in place' are ...       1
 3  13,000 people receive #wildfires evacuation or...       1
 4  Just got sent this photo from Ruby #Alaska as ...       1,
    id                                               text
 0   0                 Just happened a terrible car crash
 1   2  Heard about #earthquake is different cities, s...
 2   3  there is a forest fire at spot pond, geese are...
 3   9           Apocalypse lighting. #Spokane #wildfires
 4  11      Typhoon Soudelor kills 28 in China and Taiwan)

### Cleaning

In [6]:
from nltk.corpus import stopwords
import re, string

stop_words = stopwords.words('english')

def remove_stopwords(text):
    return ' '.join(word for word in text.split(' ') if word not in stop_words)

def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

train["text"] = train["text"].apply(remove_stopwords)
train["text"] = train["text"].apply(clean_text)

test["text"] = test["text"].apply(remove_stopwords)
test["text"] = test["text"].apply(clean_text)

train.head(), test.head()

(                                                text  target
 0   our deeds reason earthquake may allah forgive us       1
 1              forest fire near la ronge sask canada       1
 2  all residents asked shelter place notified off...       1
 3   people receive wildfires evacuation orders ca...       1
 4  just got sent photo ruby alaska smoke wildfire...       1,
    id                                               text
 0   0                   just happened terrible car crash
 1   2  heard earthquake different cities stay safe ev...
 2   3  forest fire spot pond geese fleeing across str...
 3   9              apocalypse lighting spokane wildfires
 4  11               typhoon soudelor kills  china taiwan)

### Stemming

In [7]:
from nltk import SnowballStemmer

stemmer = SnowballStemmer('english')

def stem_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text

train["text"] = train["text"].apply(stem_text)
test["text"] = test["text"].apply(stem_text)

train.head(), test.head()

(                                                text  target
 0      our deed reason earthquak may allah forgiv us       1
 1               forest fire near la rong sask canada       1
 2  all resid ask shelter place notifi offic no ev...       1
 3       peopl receiv wildfir evacu order california        1
 4  just got sent photo rubi alaska smoke wildfir ...       1,
    id                                               text
 0   0                      just happen terribl car crash
 1   2      heard earthquak differ citi stay safe everyon
 2   3  forest fire spot pond gees flee across street ...
 3   9                     apocalyps light spokan wildfir
 4  11                typhoon soudelor kill  china taiwan)

### Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(train["target"])

train["target"] = le.transform(train["target"])

# Naive Bayes - Tfidf Solution

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline

X, y = train["text"].to_numpy(), train["target"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y)

pipe = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("naive_bayes", ComplementNB()),
])

pipe.fit(X_train, y_train)

In [10]:
idx = 2

print(f"Text: {X_test[idx]}\nPredicted: {pipe.predict([X_test[idx]])[0]}")

Text: the latest more home raze northern california wildfir  abc news 
Predicted: 1


In [11]:
from sklearn.metrics import roc_auc_score

pred = pipe.predict(X_test)

print(f"ROC_AUC_SCORE: {roc_auc_score(y_test, pred)}")

ROC_AUC_SCORE: 0.7775856105153925


In [12]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [13]:
X_eval = test["text"]
submission_pred = pipe.predict(X_eval)

submission = test.copy()
submission["target"] = submission_pred
submission = submission.drop(["text"], axis=1)
submission.to_csv("submission.csv", index=False)

# XGBoost - Tfidf Solution

In [14]:
import xgboost as xgb

X, y = train["text"].to_numpy(), train["target"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y)

pipe = Pipeline([ 
    ('tfid', TfidfVectorizer()),  
    ('model', xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc',
        # colsample_bytree=0.8,
        # subsample=0.7,
        # min_child_weight=5,
    ))
])

pipe.fit(X_train, y_train)

In [15]:
idx = 27

print(f"Text: {X_test[idx]}\nPredicted: {pipe.predict([X_test[idx]])[0]}")

Text: ushiocom i may panick littl i fast submit form i usual
Predicted: 0


In [16]:
X_eval = test["text"]
submission_pred = pipe.predict(X_eval)

submission = test.copy()
submission["target"] = submission_pred
submission = submission.drop(["text"], axis=1)
submission.to_csv("submission.csv", index=False)

# Glove Super Fancy LSTM Model Training

In [17]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, GlobalMaxPool1D, Dropout, Dense, BatchNormalization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

texts = train["text"]
target = train["target"]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

def embed(corpus):
    return tokenizer.texts_to_sequences(corpus)

train_padded_sentences = pad_sequences(
    embed(texts),
    padding="post"
)

vocab_length = len(tokenizer.word_index) + 1

embeddings_dictionary = dict()
embedding_dim = 100

# Load GloVe 100D embeddings
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt') as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions


embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        

X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences, 
    target, 
    test_size=0.25
)

def glove_lstm():
    model = Sequential()
    
    model.add(Embedding(
        input_dim=embedding_matrix.shape[0], 
        output_dim=embedding_matrix.shape[1], 
        weights = [embedding_matrix], 
    ))
    
    model.add(Bidirectional(LSTM(
        26, 
        return_sequences = True, 
        recurrent_dropout=0.2
    )))
    
    model.add(GlobalMaxPool1D())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(26, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(26, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation = 'sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

model = glove_lstm()
model.summary()

2025-05-03 10:25:43.943049: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746267944.161496      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746267944.231750      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
I0000 00:00:1746267973.148030      18 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [18]:
model.fit(X_train, y_train)

[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 60ms/step - accuracy: 0.5610 - loss: 0.7698


<keras.src.callbacks.history.History at 0x7d477d0b9cd0>

In [19]:
texts = test["text"]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

def embed(corpus):
    return tokenizer.texts_to_sequences(corpus)

test_padded_sentences = pad_sequences(
    embed(texts),
    padding="post"
)


preds = model.predict(test_padded_sentences)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step


In [20]:
preds = (preds > 0.5).astype("int32")

submission = test.copy()
submission["target"] = preds
submission = submission.drop(["text"], axis=1)
submission.to_csv("submission.csv", index=False)

# Normy

In [21]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset, DatasetDict

model_name="distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_ds = train.copy()
train_ds["input"] = train_ds["text"]
train_ds["labels"] = train_ds["target"]
train_ds = train_ds.drop(["text", "target"], axis=1)

test_ds = test.copy()
test_ds["input"] = test_ds["text"]
test_ds = test_ds.drop(["text"], axis=1)

train_ds = Dataset.from_pandas(train_ds)
eval_ds = Dataset.from_pandas(test_ds)

def tknize(text): return tokenizer(text["input"])

train_ds = train_ds.map(tknize, batched=True)
eval_ds = eval_ds.map(tknize, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [22]:
from sklearn.metrics import roc_auc_score
from scipy.special import softmax

train_ds = train_ds.train_test_split(0.25, seed=42)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = np.argmax(logits, axis=-1)
    auc = roc_auc_score(labels, probs)
    
    return {
        "auc": auc
    }

In [23]:
batch_size=32
epochs=3

model = AutoModelForSequenceClassification.from_pretrained(model_name)

args = TrainingArguments(
    "outputs", 
    learning_rate=8e-5, 
    warmup_ratio=0.1, 
    lr_scheduler_type="cosine", 
    fp16=True,
    eval_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none'
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_ds["train"],
    eval_dataset=train_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Auc
1,No log,0.456087,0.780312
2,No log,0.464532,0.78769
3,0.366100,0.559091,0.790325


TrainOutput(global_step=537, training_loss=0.3549268862832636, metrics={'train_runtime': 42.2779, 'train_samples_per_second': 405.106, 'train_steps_per_second': 12.702, 'total_flos': 131148675353364.0, 'train_loss': 0.3549268862832636, 'epoch': 3.0})

In [25]:
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [26]:
eval_ds

Dataset({
    features: ['id', 'input', 'input_ids', 'attention_mask'],
    num_rows: 3263
})

In [27]:
predictions = trainer.predict(eval_ds).predictions.astype(float)
predictions = np.clip(predictions, 0, 1)[:, 1]

predictions = (predictions > 0.5).astype('int32')

submission = Dataset.from_dict({
    "id": eval_ds["id"],
    "target": predictions
})

submission.to_csv("submission.csv", index=False)

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

22746

In [28]:
df = pd.read_csv("submission.csv")
df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
