### **HuggingFace Login**

In [1]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### **Import Libairies**

In [2]:
from IPython.display import display

import numpy as np
import pandas as pd
from tqdm import tqdm
import torch

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from transformers import Trainer, TrainingArguments
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import sentencepiece
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from datasets import Dataset




### **Read Datasets**

In [4]:
"""
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv", index_col=0)
df_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv", index_col=0)

df_train = pd.read_csv("/media/yanncauchepin/ExternalDisk/Datasets/NaturalLanguageProcessing/kaggle_disastertweets/train.csv", index_col=0)
df_test = pd.read_csv("/media/yanncauchepin/ExternalDisk/Datasets/NaturalLanguageProcessing/kaggle_disastertweets/test.csv", index_col=0)
"""

df_train = pd.read_csv(
    "C:/Users/cauchepy/Datasets/NaturalLanguageProcessing/kaggle_disastertweets/train.csv",
    index_col=0,
)
df_test = pd.read_csv(
    "C:/Users/cauchepy/Datasets/NaturalLanguageProcessing/kaggle_disastertweets/test.csv",
    index_col=0,
)

### **Short Analysis**

In [5]:
print(f"Length - train {len(df_train)} - test {len(df_test)}")

Length - train 7613 - test 3263


### **Preprocess Datasets**

##### _Merge columns (full)_

In [6]:
keywords = pd.concat(
    [
        pd.DataFrame(df_train["keyword"].value_counts()).rename(
            columns={"count": "train"}
        ),
        pd.DataFrame(df_test["keyword"].value_counts()).rename(
            columns={"count": "test"}
        ),
    ],
    axis=1,
)
keywords.head()

Unnamed: 0_level_0,train,test
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
fatalities,45,5
deluge,42,8
armageddon,42,8
sinking,41,9
damage,41,9


In [7]:
locations = pd.concat(
    [
        pd.DataFrame(df_train["location"].value_counts()).rename(
            columns={"count": "train"}
        ),
        pd.DataFrame(df_test["location"].value_counts()).rename(
            columns={"count": "test"}
        ),
    ],
    axis=1,
)
locations.head()

Unnamed: 0_level_0,train,test
location,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,104.0,37.0
New York,71.0,38.0
United States,50.0,15.0
London,45.0,13.0
Canada,29.0,13.0


In [8]:
df_train_full = df_train.copy()
df_train_full["text"] = df_train_full.apply(
    lambda row: f"{row['location'] or ''} {row['keyword'] or ''} {row['text']}".strip(),
    axis=1,
)
df_test_full = df_test.copy()
df_test_full["text"] = df_test_full.apply(
    lambda row: f"{row['location'] or ''} {row['keyword'] or ''} {row['text']}".strip(),
    axis=1,
)

##### _NLP encoding + split validation (standard)_

In [9]:
vectorizer = CountVectorizer(stop_words="english")
train_texts_vec = vectorizer.fit_transform(df_train["text"])
train_texts_vec.todense()

tfidf = TfidfTransformer()
train_texts_tfidf = tfidf.fit_transform(train_texts_vec)
train_texts_tfidf = train_texts_tfidf.todense()

from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    train_texts_tfidf,
    df_train["target"],
    test_size=0.2,
    stratify=df_train["target"],
    random_state=0,
)

X_train = np.asarray(X_train)
y_train = np.asarray(y_train)
X_valid = np.asarray(X_valid)
y_valid = np.asarray(y_valid)

test_texts_vec = vectorizer.transform(df_test["text"])
test_texts_vec.todense()
test_texts_tfdif = tfidf.transform(test_texts_vec)
test_texts_tfdif = test_texts_tfdif.todense()
X_test = np.asarray(test_texts_tfdif)

##### _NLP encoding (full)_

In [10]:
vectorizer_full = CountVectorizer(stop_words="english")
train_full_texts_vec = vectorizer_full.fit_transform(df_train_full["text"])
train_full_texts_vec.todense()

tfidf_full = TfidfTransformer()
train_full_texts_tfidf = tfidf_full.fit_transform(train_full_texts_vec)
train_full_texts_tfidf = train_full_texts_tfidf.todense()

X_full_train = np.asarray(train_full_texts_tfidf)
y_full_train = np.asarray(df_train_full["target"])

test_full_texts_vec = vectorizer_full.transform(df_test_full["text"])
test_full_texts_vec.todense()
test_full_texts_tfdif = tfidf_full.transform(test_full_texts_vec)
test_full_texts_tfdif = test_full_texts_tfdif.todense()
X_full_test = np.asarray(test_full_texts_tfdif)

### **Model Assessment**

In [11]:
def evaluate_classifier(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average="weighted")
    precision = precision_score(y_true, y_pred, average="weighted")
    recall = recall_score(y_true, y_pred, average="weighted")
    cm = confusion_matrix(y_true, y_pred)

    metrics_df = pd.DataFrame(
        {"Value": [f1, precision, recall]}, index=["F1 Score", "Precision", "Recall"]
    )

    cm_df = pd.DataFrame(
        cm,
        columns=["Predicted Negative", "Predicted Positive"],
        index=["Actual Negative", "Actual Positive"],
    )

    return metrics_df, cm_df

### **Modeling**

### Naives Bayes

##### _(standard) Naive Bayes + assessment_

In [12]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)
y_pred = naive_bayes_classifier.predict(X_valid)

naive_bayes_classifier_assessement = evaluate_classifier(y_valid, y_pred)
display(naive_bayes_classifier_assessement[0])
display(naive_bayes_classifier_assessement[1])

Unnamed: 0,Value
F1 Score,0.796308
Precision,0.807306
Recall,0.801051


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,790,79
Actual Positive,224,430


##### _(full) Naive Bayes + df submission_

In [13]:
naive_bayes_classifier_full = MultinomialNB()
naive_bayes_classifier_full.fit(X_full_train, y_full_train)
y_full_pred = naive_bayes_classifier_full.predict(X_full_test)

naive_bayes_classifier_full_submission = pd.DataFrame(
    {"id": df_test_full.index, "target": y_full_pred}
)

### Logistic Regression

##### _(standard) Logistic Regression + assessment_

In [14]:
logistic_regression_classifier = LogisticRegression()
logistic_regression_classifier.fit(X_train, y_train)
y_pred = logistic_regression_classifier.predict(X_valid)

logistic_regression_classifier_assessement = evaluate_classifier(y_valid, y_pred)
display(logistic_regression_classifier_assessement[0])
display(logistic_regression_classifier_assessement[1])

Unnamed: 0,Value
F1 Score,0.794483
Precision,0.810064
Recall,0.800394


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,801,68
Actual Positive,236,418


##### _(full) Logistic Regression + df submission_

In [15]:
logistic_regression_classifier_full = LogisticRegression()
logistic_regression_classifier_full.fit(X_full_train, y_full_train)
y_full_pred = logistic_regression_classifier_full.predict(X_full_test)

logistic_regression_classifier_full_submission = pd.DataFrame(
    {"id": df_test_full.index, "target": y_full_pred}
)

### Random Forest

##### _(standard) Random Forest + assessment_

In [16]:
random_forest_classifier = RandomForestClassifier()
random_forest_classifier.fit(X_train, y_train)
y_pred = random_forest_classifier.predict(X_valid)

random_forest_classifier_assessement = evaluate_classifier(y_valid, y_pred)
display(random_forest_classifier_assessement[0])
display(random_forest_classifier_assessement[1])

KeyboardInterrupt: 

##### _(full) Random Forest + df submission_

In [42]:
random_forest_classifier_full = RandomForestClassifier()
random_forest_classifier_full.fit(X_full_train, y_full_train)
y_full_pred = random_forest_classifier_full.predict(X_full_test)

random_forest_classifier_full_submission = pd.DataFrame(
    {"id": df_test_full.index, "target": y_full_pred}
)

### Neural Network

##### _(standard) Neural Network + assessment_

In [54]:
neural_network_classifier = Sequential(
    [
        Dense(64, activation="relu", input_shape=(X_train.shape[1],)),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

neural_network_classifier.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)
early_stopping = EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)
history = neural_network_classifier.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=8,
    validation_data=(X_valid, y_valid),
    callbacks=[early_stopping],
)

y_pred = neural_network_classifier.predict(X_valid)

neural_network_classifier_assessment = evaluate_classifier(y_valid, np.round(y_pred))
display(neural_network_classifier_assessment[0])
display(neural_network_classifier_assessment[1])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m762/762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6702 - loss: 0.5907 - val_accuracy: 0.8089 - val_loss: 0.4415
Epoch 2/10
[1m762/762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9277 - loss: 0.1965 - val_accuracy: 0.7610 - val_loss: 0.5566
Epoch 3/10
[1m762/762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9794 - loss: 0.0646 - val_accuracy: 0.7623 - val_loss: 0.6705
Epoch 4/10
[1m762/762[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9890 - loss: 0.0311 - val_accuracy: 0.7656 - val_loss: 0.8040
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


Unnamed: 0,Value
F1 Score,0.80557
Precision,0.812378
Recall,0.80893


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,782,87
Actual Positive,204,450


##### _(full) Neural Network + df submission_

In [16]:
X_full_train_, X_full_valid, y_full_train_, y_full_valid = train_test_split(
    X_full_train, y_full_train, test_size=0.15, stratify=y_full_train, random_state=0
)

neural_network_classifier_full = Sequential(
    [
        Dense(64, activation="relu", input_shape=(X_full_train_.shape[1],)),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

neural_network_classifier_full.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)
early_stopping_full = EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True
)
history_full = neural_network_classifier_full.fit(
    X_full_train_,
    y_full_train_,
    epochs=20,
    batch_size=8,
    validation_data=(X_full_valid, y_full_valid),
    callbacks=[early_stopping],
)

y_full_pred = neural_network_classifier_full.predict(X_full_test)

neural_network_classifier_full_submission = pd.DataFrame(
    {"id": df_test_full.index, "target": np.round(y_full_pred).flatten()}
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6909 - loss: 0.5828 - val_accuracy: 0.8082 - val_loss: 0.4470
Epoch 2/20
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9209 - loss: 0.2122 - val_accuracy: 0.7872 - val_loss: 0.5542
Epoch 3/20
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9762 - loss: 0.0664 - val_accuracy: 0.7618 - val_loss: 0.7054
Epoch 4/20
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9915 - loss: 0.0285 - val_accuracy: 0.7688 - val_loss: 0.8807
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


### XGBoost

##### _(standard) XGBoost + assessment_

In [17]:
xgb_classifier = XGBClassifier(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric="logloss",
)
xgb_classifier.fit(X_train, y_train, verbose=1)

y_pred = xgb_classifier.predict(X_valid)

xgb_classifier_assessement = evaluate_classifier(y_valid, y_pred)
display(xgb_classifier_assessement[0])
display(xgb_classifier_assessement[1])

Parameters: { "use_label_encoder" } are not used.



(      Metric     Value
0   F1 Score  0.785225
1  Precision  0.794514
2     Recall  0.789888,                  Predicted Negative  Predicted Positive
Actual Negative                 778                  91
Actual Positive                 229                 425)


##### _(full) XGBoost + df submission_

In [None]:
xgb_classifier_full = XGBClassifier(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.1,
    use_label_encoder=False,
    eval_metric="logloss",
)
xgb_classifier_full.fit(X_full_train, y_full_train, verbose=1)

y_full_pred = xgb_classifier_full.predict(X_full_test)

xgb_classifier_full_submission = pd.DataFrame(
    {"id": df_test_full.index, "target": y_full_pred}
)

Parameters: { "use_label_encoder" } are not used.



### CatBoost

##### _(standard) CatBoost + assessment_

In [55]:
catboost_classifier = CatBoostClassifier(
    iterations=1000, depth=4, learning_rate=0.1, verbose=2
)
catboost_classifier.fit(X_train, y_train)

y_pred = catboost_classifier.predict(X_valid)

catboost_classifier_assessement = evaluate_classifier(y_valid, y_pred)
display(catboost_classifier_assessement[0])
display(catboost_classifier_assessement[1])

0:	learn: 0.6797045	total: 150ms	remaining: 2m 30s
2:	learn: 0.6640753	total: 185ms	remaining: 1m 1s
4:	learn: 0.6527238	total: 218ms	remaining: 43.5s
6:	learn: 0.6418404	total: 262ms	remaining: 37.2s
8:	learn: 0.6339999	total: 297ms	remaining: 32.7s
10:	learn: 0.6280628	total: 331ms	remaining: 29.8s
12:	learn: 0.6209378	total: 365ms	remaining: 27.7s
14:	learn: 0.6162400	total: 402ms	remaining: 26.4s
16:	learn: 0.6115203	total: 437ms	remaining: 25.3s
18:	learn: 0.6076645	total: 473ms	remaining: 24.4s
20:	learn: 0.6030091	total: 508ms	remaining: 23.7s
22:	learn: 0.5995534	total: 542ms	remaining: 23s
24:	learn: 0.5964546	total: 577ms	remaining: 22.5s
26:	learn: 0.5930054	total: 612ms	remaining: 22.1s
28:	learn: 0.5902966	total: 647ms	remaining: 21.6s
30:	learn: 0.5873470	total: 681ms	remaining: 21.3s
32:	learn: 0.5849357	total: 715ms	remaining: 20.9s
34:	learn: 0.5827788	total: 750ms	remaining: 20.7s
36:	learn: 0.5798956	total: 788ms	remaining: 20.5s
38:	learn: 0.5778346	total: 823ms	rem

Unnamed: 0,Value
F1 Score,0.78015
Precision,0.788278
Recall,0.784636


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,771,98
Actual Positive,230,424


##### _(full) CatBoost + df submission_

In [None]:
catboost_classifier_full = CatBoostClassifier(
    iterations=1000, depth=4, learning_rate=0.1, verbose=2
)
catboost_classifier_full.fit(X_full_train, y_full_train)

y_full_pred = catboost_classifier_full.predict(X_full_test)

catboost_classifier_full_submission = pd.DataFrame(
    {"id": df_test_full.index, "target": y_full_pred}
)

0:	learn: 0.6830062	total: 28.3ms	remaining: 28.3s
2:	learn: 0.6635779	total: 83.1ms	remaining: 27.6s
4:	learn: 0.6506161	total: 136ms	remaining: 27s
6:	learn: 0.6415673	total: 188ms	remaining: 26.7s
8:	learn: 0.6337147	total: 240ms	remaining: 26.5s
10:	learn: 0.6258401	total: 295ms	remaining: 26.6s
12:	learn: 0.6205170	total: 347ms	remaining: 26.3s
14:	learn: 0.6152465	total: 400ms	remaining: 26.3s
16:	learn: 0.6112210	total: 453ms	remaining: 26.2s
18:	learn: 0.6066602	total: 507ms	remaining: 26.2s
20:	learn: 0.6036622	total: 559ms	remaining: 26.1s
22:	learn: 0.6004812	total: 612ms	remaining: 26s
24:	learn: 0.5975389	total: 665ms	remaining: 25.9s
26:	learn: 0.5946601	total: 718ms	remaining: 25.9s
28:	learn: 0.5914057	total: 771ms	remaining: 25.8s
30:	learn: 0.5874459	total: 824ms	remaining: 25.8s
32:	learn: 0.5840149	total: 880ms	remaining: 25.8s
34:	learn: 0.5819688	total: 936ms	remaining: 25.8s
36:	learn: 0.5796179	total: 989ms	remaining: 25.7s
38:	learn: 0.5774566	total: 1.04s	rema