# Natural Language Processing with Disaster Tweets

https://www.kaggle.com/c/nlp-getting-started/data

In [186]:
import os
import pandas as pd


data_fp = os.path.join(os.getcwd(), "data")
train_fp = os.path.join(data_fp, "train.csv")
train = pd.read_csv(train_fp, encoding="utf-8")
test_fp = os.path.join(data_fp, "test.csv")
test = pd.read_csv(test_fp, encoding="utf-8")

print(f"Train: {train.shape}")
print(f"Test:  {test.shape}")

np.random.seed(1234)
dev_index = np.random.choice(len(train), size=500, replace=False)


dev = train.loc[dev_index]
train = train.loc[~train.index.isin(dev_index)]

Train: (7613, 5)
Test:  (3263, 4)


In [206]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

import textblob
import nltk
nltk.download('averaged_perceptron_tagger')

import numpy as np
from sklearn.metrics import classification_report, f1_score

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/icexelloss/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [211]:
# CountVectorizer + Bernoulli

vectorizer = CountVectorizer()
train_arr = vectorizer.fit_transform(train["text"])
train_labels = train["target"]
dev_arr = vectorizer.transform(dev['text'])
dev_labels = dev['target']
print(f"Bernoulli distribution, p = {train_labels.mean():.2f}")
model = BernoulliNB()
cv = cross_validate(model, train_arr, train_labels, cv=5, return_estimator=True)
mean_score = cv["test_score"].mean()
print(f"Mean accuracy on 5-fold cross validation: {mean_score:.2f}")
best_estimator = cv["estimator"][cv["test_score"].argmax()]

pred_labels = best_estimator.predict(dev_arr)

print("f1_score", f1_score(dev_labels, pred_labels, average="weighted"))

Bernoulli distribution, p = 0.43
Mean accuracy on 5-fold cross validation: 0.74
f1_score 0.8127917574796909


In [210]:
# CountVectorizer + MultinomialNB

vectorizer = CountVectorizer()
train_arr = vectorizer.fit_transform(train["text"])
train_labels = train["target"]
dev_arr = vectorizer.transform(dev['text'])
dev_labels = dev['target']
print(f"Bernoulli distribution, p = {train_labels.mean():.2f}")
model = MultinomialNB()
cv = cross_validate(model, train_arr, train_labels, cv=5, return_estimator=True)
mean_score = cv["test_score"].mean()
print(f"Mean accuracy on 5-fold cross validation: {mean_score:.2f}")
best_estimator = cv["estimator"][cv["test_score"].argmax()]

pred_labels = best_estimator.predict(dev_arr)

print("f1_score", f1_score(dev_labels, pred_labels, average="weighted"))

Bernoulli distribution, p = 0.43
Mean accuracy on 5-fold cross validation: 0.72
f1_score 0.7970814419983857


In [209]:
# TfidfVectorizer + Bernoulli

vectorizer = TfidfVectorizer()
train_arr = vectorizer.fit_transform(train["text"])
train_labels = train["target"]
dev_arr = vectorizer.transform(dev['text'])
dev_labels = dev['target']
print(f"Bernoulli distribution, p = {train_labels.mean():.2f}")
model = BernoulliNB()
cv = cross_validate(model, train_arr, train_labels, cv=5, return_estimator=True)
mean_score = cv["test_score"].mean()
print(f"Mean accuracy on 5-fold cross validation: {mean_score:.2f}")
best_estimator = cv["estimator"][cv["test_score"].argmax()]

pred_labels = best_estimator.predict(dev_arr)

print("f1_score", f1_score(dev_labels, pred_labels, average="weighted"))

Bernoulli distribution, p = 0.43
Mean accuracy on 5-fold cross validation: 0.74
f1_score 0.8127917574796909


In [207]:
# TfidfVectorizer + MultinomialNB

vectorizer = TfidfVectorizer()
train_arr = vectorizer.fit_transform(train["text"])
train_labels = train["target"]
dev_arr = vectorizer.transform(dev['text'])
dev_labels = dev['target']
print(f"Bernoulli distribution, p = {train_labels.mean():.2f}")
model = MultinomialNB()
cv = cross_validate(model, train_arr, train_labels, cv=5, return_estimator=True)
mean_score = cv["test_score"].mean()
print(f"Mean accuracy on 5-fold cross validation: {mean_score:.2f}")
best_estimator = cv["estimator"][cv["test_score"].argmax()]
pred_labels = best_estimator.predict(dev_arr)

print("f1_score", f1_score(dev_labels, pred_labels, average="weighted"))

Bernoulli distribution, p = 0.43
Mean accuracy on 5-fold cross validation: 0.74
f1_score 0.8034602905206409


In [216]:
keyword_weight = 10
hashtag_weight = 10

def extract_hashtags(text):
    words = text.split()
    words = [
        word[1:]
        for word in words
        if word.startswith('#')
    ]
    return words if words else np.nan

def fit_transform_text(texts, keywords):
    cv = TfidfVectorizer()
    arr = cv.fit_transform(texts)

    def tags_to_weight(tags, vocabulary=cv.vocabulary_, weight=hashtag_weight):
        weight = np.zeros(len(vocabulary))

        if tags is np.nan:
            return weight

        for tag in tags:
            if tag in vocabulary:
                print(tag)
                weight[vocabulary[tag]] = hashtag_weight

        return weight
    
    def keywords_to_weight(keyword, vocabulary=cv.vocabulary_, weight=hashtag_weight):
        weight = np.zeros(len(vocabulary))
        
        if tags is np.nan:
            return weight
        
        if keyword in vocabulary:
            weight[vocabulary[keyword]] = keyword_weight
        
        return weight
    
    hashtags = texts.apply(extract_hashtags)
    arr = arr + np.vstack(hashtags.apply(tags_to_weight).values)
    arr = arr + np.vstack(keywords.apply(tags_to_weight).values)
    return arr, cv
    
def transform_text(texts, keywords, cv):
    arr = cv.transform(texts)
    
    def tags_to_weight(tags, vocabulary=cv.vocabulary_, weight=hashtag_weight):
        weight = np.zeros(len(vocabulary))

        if tags is np.nan:
            return weight

        for tag in tags:
            if tag in vocabulary:
                weight[vocabulary[tag]] = hashtag_weight

        return weight
    
    def keywords_to_weight(keyword, vocabulary=cv.vocabulary_, weight=hashtag_weight):
        weight = np.zeros(len(vocabulary))
        
        if tags is np.nan:
            return weight
        
        if keyword in vocabulary:
            weight[vocabulary[keyword]] = keyword_weight
        
        return weight
    
    hashtags = texts.apply(extract_hashtags)
    arr = arr + np.vstack(hashtags.apply(tags_to_weight).values)
    arr = arr + np.vstack(keywords.apply(tags_to_weight).values)
    
    return arr
    
    
train_arr, cv = fit_transform_text(train['text'], train['keyword'])
dev_arr = transform_text(dev['text'], dev['keyword'], cv)

train_labels = train["target"]
dev_labels = dev["target"]

model = MultinomialNB()
cv = cross_validate(model, train_arr, train_labels, cv=5, return_estimator=True)
mean_score = cv["test_score"].mean()
print(f"Mean accuracy on 5-fold cross validation: {mean_score:.2f}")
best_estimator = cv["estimator"][cv["test_score"].argmax()]

pred_labels = best_estimator.predict(dev_arr)

print("f1_score", f1_score(dev_labels, pred_labels, average="weighted"))

earthquake
wildfires
wildfires
wildfires
flooding
raining
flooding
metal
mufc
nsfw
nsfw
climate
energy
personalinjury
solicitor
stlouis
caraccidentlawyer
traffic
traffic
lifehacks
silverwood
aftershock
now
wdyouth
biblestudy
news
yugvani
justsaying
randomthought
news
til_now
ems
paramedics
ambulance
reuters
worldnews
gilbert23
gilbert23
gilbert23
sciencefiction
internetradio
warmbodies
romance
seduction
apocalypse
eonlinechat
romance
seduction
apocalypse
startrek
tos
preseasonworkouts
brics
roberts
russia
cbcto
nativehuman
myreligion
tbt
palestine
volleyball
ableg
cdnpoli
nwo
pugprobs
robotcoingame
171
weddinghour
entretenimento
nowplay
listen
radio
throwback
world
news
phone
apple
mobile
world
orchardalley
nyc
gardens
trucking
anthrax
bioterrorism
virus
infectiousdiseases
bioterrorism
dvd
digitalhealth
hcsm
bioterrorism
glanders
raisinfingers
breaking
thisispublichealth
socialmedia
nowplaying
listenlive
stoponesounds
airwaves
escorts
gfe
arizona
realestate
oomf
2fast2furious
elxn42
st

Mean accuracy on 5-fold cross validation: 0.72
f1_score 0.7928964699683878
