In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
np.random.seed(180)
random.seed(180)

#### [Natural Language Processing with Disaster Tweets Dataset](https://www.kaggle.com/competitions/nlp-getting-started/overview)

In [3]:
df_train = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

In [4]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df_train["target"].value_counts(normalize=True)

target
0    0.57034
1    0.42966
Name: proportion, dtype: float64

In [6]:
text_id = df_test["id"].tolist()
test_x = df_test["text"].tolist()

In [7]:
df_0 = df_train[df_train["target"] == 0]["text"].tolist()
df_1 = df_train[df_train["target"] == 1]["text"].tolist()

train_x = df_0 + df_1
train_y = [0]*len(df_0) + [1]*len(df_1)

##### **Pre-processing text methods**

1. `process_tweet()`: cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.

2. `count_tweets()`: takes a list of tweets as input, `process_tweet()` all of them, and returns a dictionary.
  * The key in the dictionary is a tuple containing the semmed word and its class label, e.g. ("happi",1).
  * The value the number of times this word appears in the given collection of tweets (an integer).

In [8]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yveem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words("english")
    # remove stock market tickers like $GE
    tweet = re.sub(r"\$\w*", "", tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r"^RT[\s]+", "", tweet)
    # remove hyperlinks
    tweet = re.sub(r"https?:\/\/.*[\r\n]*", "", tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r"#", "", tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)

    return tweets_clean

def lookup(freqs, word, label):
    n = 0

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

def count_tweets(tweets, ys):
    result = {}
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1

    return result

In [10]:
process_tweet("""
For those who have been waiting for this scene...

Now, a mass exodus of settlers from northern occupied Palestine have left their homes burning and are fleeing.
""")

['wait',
 'scene',
 '...',
 'mass',
 'exodu',
 'settler',
 'northern',
 'occupi',
 'palestin',
 'left',
 'home',
 'burn',
 'flee']

In [11]:
tweets = ["i am happy", "i am tricked", "i am sad", "i am tired", "i am tired"]
ys = [1, 0, 0, 0, 0]
count_tweets(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [12]:
freqs = count_tweets(train_x, train_y)

#### **Features Extractions**

##### Given a list of tweets, let's extract the features and store them into a matrix. I'm gonna extract two features.
  
  * [0] the number of positive for disaster words in a tweet.
  * [1] the number of negative for disaster words in a tweet.

In [13]:
def extract_features(tweet, freqs):
    words = process_tweet(tweet)

    x = np.zeros((1, 3))
    x[0, 0] = 1 # bias

    for word in words:
        x[0, 1] += freqs.get((word, 1.0), 0)
        x[0, 2] += freqs.get((word, 0.0), 0)

    assert(x.shape == (1, 3)), "Invalid feature shape."
    return x

In [14]:
X = np.zeros((len(train_x), 3))
n = len(train_y)
Y = np.array(train_y).reshape((n, 1))

for i in range(len(train_x)):
    X[i, :] = extract_features(train_x[i], freqs)

In [15]:
import xgboost as xgb

In [16]:
dtrain = xgb.DMatrix(X, label=Y)

In [17]:
params = {
    "objective": "binary:logistic",
    "max_depth": 3,
    "eta": 0.1,
    "eval_metric": "logloss"
}

num_boost_round = 100
bst = xgb.train(params, dtrain, num_boost_round)

In [18]:
def predict_tweet(tweet, model, freqs):
    features = extract_features(tweet, freqs)
    pred_prob = model.predict(xgb.DMatrix(features))

    return pred_prob[0]

In [19]:
my_tweet = """
For those who have been waiting for this scene...

Now, a mass exodus of settlers from northern occupied Palestine have left their homes burning and are fleeing.
"""
p = predict_tweet(my_tweet, bst, freqs)
print("The expected output is", p)

The expected output is 0.74803406


In [20]:
my_tweet = """
My life is good now that I found you! I'm happy with u.
"""
p = predict_tweet(my_tweet, bst, freqs)
print("The expected output is", p)

The expected output is 0.22223593


In [21]:
preds = []
ids = []
for _, row in df_test.iterrows():
  tweet = row["text"]
  ids.append(row["id"])
  if predict_tweet(tweet, bst, freqs) > 0.5:
    preds.append(1)
  else:
    preds.append(0)

In [22]:
df_submit = pd.DataFrame({"id": ids, "target": preds})

In [23]:
df_submit.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [24]:
df_submit.to_csv("./results/xgboost.csv", index=False)