#### Import libraries

In [21]:
import sys
import pandas as pd
import os
import re
import urllib
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### Load Data

In [2]:
train_df = pd.read_csv('../data/disaster_train.csv')
test_df = pd.read_csv('../data/disaster_test.csv')

  train_df = pd.read_csv('../data/disaster_train.csv')
  test_df = pd.read_csv('../data/disaster_test.csv')


In [3]:
X_train = train_df['message']
y_train = train_df.iloc[:,4:]
X_test = test_df['message']
y_test = test_df.iloc[:,4:] 

### Building model

In [13]:
def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for token in tokens:
        clean_token = lemmatizer.lemmatize(token).lower().strip()
        clean_tokens.append(clean_token)
    return clean_tokens

In [17]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize, stop_words='english')),
            ('tfidf', TfidfTransformer())
        ]))
    ])),
    ('clf', MultiOutputClassifier(AdaBoostClassifier())) 
])

pipeline

In [18]:
parameters = {
    'clf__estimator__n_estimators': [50, 100, 200],      # Số lượng mô hình con (cây quyết định) trong AdaBoost
    'clf__estimator__learning_rate': [0.5, 1.0, 2.0]     # Tỷ lệ học của AdaBoost
}

# Tạo GridSearchCV 
cv = GridSearchCV(pipeline, param_grid=parameters, cv=5, n_jobs=-1, verbose=3)

# Huấn luyện mô hình với GridSearchCV
cv.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




In [19]:
y_pred = cv.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89     20094
           1       0.81      0.55      0.66      4474
           2       0.71      0.23      0.35       118
           3       0.79      0.63      0.70     10860
           4       0.68      0.22      0.33      2084
           5       0.79      0.30      0.44      1313
           6       0.76      0.18      0.29       724
           7       0.73      0.06      0.12       471
           8       0.73      0.29      0.42       860
           9       0.79      0.64      0.71      1672
          10       0.82      0.74      0.78      2923
          11       0.82      0.55      0.66      2314
          12       0.84      0.49      0.62       405
          13       0.73      0.28      0.41       604
          14       0.81      0.19      0.31       298
          15       0.71      0.23      0.35       875
          16       0.83      0.41      0.55      1194
          17       0.65    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall accuracy: {accuracy:.2f}")

accuracies = {}
for i, column in enumerate(y_test.columns):
    accuracies[column] = accuracy_score(y_test[column], y_pred[:, i])
    print(f"Accuracy for {column}: {accuracies[column]:.2f}")


Overall accuracy: 0.30
Accuracy for related: 0.82
Accuracy for request: 0.90
Accuracy for offer: 1.00
Accuracy for aid_related: 0.78
Accuracy for medical_help: 0.93
Accuracy for medical_products: 0.96
Accuracy for search_and_rescue: 0.98
Accuracy for security: 0.98
Accuracy for military: 0.97
Accuracy for water: 0.97
Accuracy for food: 0.95
Accuracy for shelter: 0.95
Accuracy for clothing: 0.99
Accuracy for money: 0.98
Accuracy for missing_people: 0.99
Accuracy for refugees: 0.97
Accuracy for death: 0.97
Accuracy for other_aid: 0.88
Accuracy for infrastructure_related: 0.94
Accuracy for transport: 0.96
Accuracy for buildings: 0.96
Accuracy for electricity: 0.98
Accuracy for tools: 1.00
Accuracy for hospitals: 0.99
Accuracy for shops: 1.00
Accuracy for aid_centers: 0.99
Accuracy for other_infrastructure: 0.96
Accuracy for weather_related: 0.88
Accuracy for floods: 0.96
Accuracy for storm: 0.94
Accuracy for fire: 0.99
Accuracy for earthquake: 0.97
Accuracy for cold: 0.99
Accuracy for oth

In [20]:
with open('classifier.pkl', 'wb') as file:
    pickle.dump(cv, file)