In [1]:
import pandas as pd
import numpy as np
import random

##### It's good academic practice to set a reproducible solution, so let's set seeds!

In [2]:
np.random.seed(42)
random.seed(42)

#### **Getting the Data**

##### The data used on this project was collected in the [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/competitions/nlp-getting-started/overview) competition.

In [3]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [4]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


##### **Balancing data**

In [5]:
df_train['target'].value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [6]:
df_majority = df_train[df_train['target'] == 0]
df_minority = df_train[df_train['target'] == 1]

df_majority_downsampled = df_majority.sample(len(df_minority))
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled = df_downsampled.sample(frac=1).reset_index(drop=True)

In [7]:
df_train = df_downsampled.copy()
df_train['target'].value_counts(normalize=True)

0    0.5
1    0.5
Name: target, dtype: float64

##### **Reshaping data**

In [8]:
text_id = df_test['id'].tolist()
test_x = df_test['text'].tolist()

In [9]:
df_0 = df_train[df_train['target'] == 0]['text'].tolist()
df_1 = df_train[df_train['target'] == 1]['text'].tolist()

train_x = df_0 + df_1
train_y = [0]*len(df_0) + [1]*len(df_1)

##### **Pre-processing text**

1. `process_tweet()`: cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.

2. `count_tweets()`: this takes a list of tweets as input, cleans all of them, and returns a dictionary.
  * The key in the dictionary is a tuple containing the semmed word and its class label, e.g. ("happi",1).
  * The value the number of times this word appears in the given collection of tweets (an integer).

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)

    return tweets_clean

def lookup(freqs, word, label):
    n = 0

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

def count_tweets(result, tweets, ys):
    for y, tweet in zip(ys, tweets):
        for word in process_tweet(tweet):
            pair = (word,y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1

    return result

In [12]:
process_tweet('This is my pre-processed text for exibition!')

['pre-process', 'text', 'exibit']

In [13]:
result = {}
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

#### **Naive Bayes**

##### Naive bayes is an algorithm that could be used for disaster analysis. It takes a short time to train and also has a short prediction time.

1. **How to classify documents?**

  * $P(D_{pos})=\frac{D_{pos}}{D}$: Probability the document describes positive for disaster.

  * $P(D_{neg})=\frac{D_{neg}}{D}$: Probability the document describes negative for disaster.

2. **Positive and Negative Probability of a Word**

  To properly compute the probability of a word, the Laplace smoothing is used to overcome the problem of zero probability: ***If query point contains a new observation, which is not yet seen in training data while calculating probabilities.*** [Reference for Additive Smoothing!](https://towardsdatascience.com/laplace-smoothing-in-na%C3%AFve-bayes-algorithm-9c237a8bdece)

  $$
  P(W_{pos})=\frac{freq_{pos} + 1}{N_{pos}+V}
  $$

  $$
  P(W_{neg})=\frac{freq_{neg} + 1}{N_{neg}+V}
  $$

4. **Log Likelihood**

  To compute the loglikelihood of that very same word, we can implement the following equations:

  $$
  loglikelihood = log(\frac{P(W_{pos})}{P(W_{neg})})
  $$

  The likelihood function is the joint probability (or probability density) of observed data viewed as a function of the parameters of a statistical model.

In [14]:
def train_naive_bayes(freqs, train_x, train_y):
    loglikelihood = {}
    logprior = 0

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)
    N_pos = N_neg = 0
    for pair in freqs.keys():
        if pair[1] > 0:
            N_pos += freqs[pair]
        else:
            N_neg += freqs[pair]

    D = len(train_y)
    D_pos = (len(list(filter(lambda x: x > 0, train_y))))
    D_neg = (len(list(filter(lambda x: x <= 0, train_y))))

    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = lookup(freqs,word,1)
        freq_neg = lookup(freqs,word,0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood

In [15]:
freqs = count_tweets({}, train_x, train_y)
freqs

{("i'll", 0): 22,
 ('drown', 0): 54,
 ('river', 0): 15,
 ('walk', 0): 18,
 ('got', 0): 65,
 ('cute', 0): 6,
 ('lil', 0): 7,
 ('cliff', 0): 19,
 ('fall', 0): 33,
 ('reddit', 0): 28,
 ('quarantin', 0): 41,
 ('offens', 0): 18,
 ('content', 0): 35,
 ('co-found', 0): 3,
 ('ceo', 0): 6,
 ('steve', 0): 4,
 ('huffman', 0): 3,
 ('unveil', 0): 3,
 ('specif', 0): 3,
 ('...', 0): 308,
 ('haha', 0): 13,
 ('love', 0): 87,
 ("cruise'", 0): 1,
 ('5', 0): 35,
 ('emerg', 0): 63,
 ('plan', 0): 39,
 ('awesom', 0): 9,
 ('fail', 0): 2,
 ('govern', 0): 12,
 ('concern', 0): 8,
 ('popul', 0): 5,
 ('explos', 0): 9,
 ('joe', 0): 7,
 ('moor', 0): 1,
 ('last', 0): 47,
 ('month', 0): 8,
 ('went', 0): 13,
 ('ee', 0): 1,
 ('shop', 0): 5,
 ('glad', 0): 8,
 ('blown', 0): 19,
 ('yet', 0): 18,
 ('may', 0): 25,
 ('ask', 0): 13,
 ('cyclone-sama', 0): 1,
 ('read', 0): 44,
 ('jail', 0): 2,
 ('fate', 0): 3,
 ('rindou', 0): 1,
 ('new', 0): 126,
 ('post', 0): 19,
 ('blog', 0): 5,
 ('thesensualey', 0): 1,
 ('model', 0): 1,
 ('ca

In [16]:
freqs = count_tweets({}, train_x, train_y)

logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
10948


#### **Predictions**

##### Given a tweet, what is the probability of being a disaster description?

In [17]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = process_tweet(tweet)
    p = 0
    p += logprior

    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]

    return p

In [18]:
my_tweet = 'A misery has taken place at Palestine today. I feel bad for all the lost people over there.'
p = naive_bayes_predict(my_tweet, logprior, loglikelihood)
print('The expected output is', p)

The expected output is -0.05840378067758978


In [19]:
preds = []
ids = []
for _, row in df_test.iterrows():
  tweet = row['text']
  ids.append(row['id'])
  if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
    preds.append(1)
  else:
    preds.append(0)

In [20]:
df_submit = pd.DataFrame({'id': ids, 'target': preds})

In [21]:
df_submit.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [22]:
df_submit.to_csv('./data/result.csv', index=False)