#### Import libraries

In [8]:
import sys
import pandas as pd
import os
import re
import urllib
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sqlalchemy import create_engine


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### Load Data

In [3]:
train_df = pd.read_csv('../data/disaster_train.csv')
test_df = pd.read_csv('../data/disaster_test.csv')

  train_df = pd.read_csv('../data/disaster_train.csv')
  test_df = pd.read_csv('../data/disaster_test.csv')


In [4]:
X_train = train_df['message']
y_train = train_df.iloc[:,4:]
X_test = test_df['message']
y_test = test_df.iloc[:,4:] 

### Building model

In [6]:
def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for token in tokens:
        clean_token = lemmatizer.lemmatize(token).lower().strip()
        clean_tokens.append(clean_token)
    return clean_tokens

In [12]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('text_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ]))
    ])),
    ('clf', MultiOutputClassifier(RandomForestClassifier())) 
])

pipeline

In [14]:
# Tham số cần tinh chỉnh cho RandomForestClassifier
parameters = {
    'clf__estimator__n_estimators': [50, 100, 200],          # Số lượng cây quyết định
    'clf__estimator__max_depth': [None, 10, 20, 30],          # Độ sâu tối đa của các cây quyết định
    'clf__estimator__min_samples_split': [2, 5, 10],          # Số mẫu tối thiểu để phân chia một nút
    'clf__estimator__min_samples_leaf': [1, 2, 4]             # Số mẫu tối thiểu trên mỗi lá cây
}

# Tinh chỉnh tham số bằng GridSearchCV
cv = GridSearchCV(pipeline, param_grid=parameters, cv=5, n_jobs=-1, verbose=3)

# Huấn luyện mô hình
cv.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [None]:
# Đánh giá mô hình
y_pred = cv.predict(X_test)

# Đánh giá hiệu suất
print(classification_report(y_test, y_pred))