https://www.kaggle.com/c/nlp-getting-started/overview

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory the CSV files are read from, relative to the working directory.
DATA_DIR = Path('..') / 'data'

In [None]:
# Load the training CSV, report its (rows, columns) shape and preview
# the first rows as the cell output.
df = pd.read_csv(DATA_DIR.joinpath('train.csv'))
print(df.shape)
df.head()

In [None]:
# Number of rows per value of the `target` column (class balance).
df['target'].value_counts()

In [None]:
# Fraction of missing values in each column: the mean of the boolean
# NaN mask equals (number of NaNs) / (number of rows).
df.isna().mean()

In [None]:
# Remove the columns that are not read afterwards; only `text` and
# `target` are used by the following cells.
df = df.drop(labels=['id', 'keyword', 'location'], axis='columns')
df.head()

In [None]:
# Model input (`text` column) and label (`target` column).
X = df['text']
y = df['target']

**Split the dataset into train and test sets**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Show the shapes of the four pieces as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features. The token pattern keeps runs of word
# characters that are not digits, so numbers are excluded from the
# vocabulary; English stop words are removed as well.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    strip_accents='ascii',
    token_pattern=r'\b[^\d\W]+\b',
    stop_words='english',
)

In [None]:
# Fit the vectorizer on the training texts only, so that nothing from
# the held-out split enters the fitted state.
tfidf_vectorizer.fit(raw_documents=X_train)

In [None]:
# Turn the training texts into TF-IDF features with the fitted vectorizer
# and display the result as the cell output.
X_train_tfidf_features = tfidf_vectorizer.transform(raw_documents=X_train)
X_train_tfidf_features

## Model

In [None]:
# Generic name for the training feature matrix; the model-fitting cell
# reads `X_train_features`, which here is the TF-IDF output.
X_train_features = X_train_tfidf_features

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 depth-limited trees (max_depth=3); the seed is
# fixed for reproducible results.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=42,
)

In [None]:
# Train the classifier on the training features and labels.
clf.fit(X=X_train_features, y=y_train)

## Evaluation

### Preprocess test set

In [None]:
# Transform the held-out texts with the vectorizer fitted on the training
# split (no re-fitting), then display the result as the cell output.
X_test_tfidf_features = tfidf_vectorizer.transform(raw_documents=X_test)
X_test_tfidf_features

In [None]:
# Generic name for the test feature matrix; the prediction and evaluation
# cells read `X_test_features`, which here is the TF-IDF output.
X_test_features = X_test_tfidf_features

### Make predictions on test set

In [None]:
# Predicted labels for the held-out split.
y_pred = clf.predict(X=X_test_features)

### Compute metric

In [None]:
# `sklearn.metrics.plot_confusion_matrix` was deprecated in scikit-learn 1.0
# and removed in 1.2; `ConfusionMatrixDisplay.from_estimator` is its
# replacement and accepts the same arguments.
from sklearn.metrics import ConfusionMatrixDisplay

np.set_printoptions(precision=2)
fig, ax = plt.subplots(figsize=(10, 10))
# normalize='true' scales each row (true class) to sum to 1, so the
# diagonal shows per-class recall.
disp = ConfusionMatrixDisplay.from_estimator(
    clf,
    X_test_features,
    y_test,
    display_labels=['no disaster', 'is disaster'],
    cmap=plt.cm.Blues,
    normalize='true',
    ax=ax,
)
disp.ax_.set_title('Normalized confusion matrix')
plt.show()