In [1]:
import pandas as pd
import numpy as np
import random

##### It's good academic practice to set a reproducible solution, so let's set seeds!

In [2]:
np.random.seed(42)
random.seed(42)

#### **Getting the Data**

##### The data used on this project was collected in the [Natural Language Processing with Disaster Tweets](https://www.kaggle.com/competitions/nlp-getting-started/overview) competition.

In [3]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [4]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


##### **Balancing data**

In [5]:
df_train['target'].value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [6]:
df_majority = df_train[df_train['target'] == 0]
df_minority = df_train[df_train['target'] == 1]

df_majority_downsampled = df_majority.sample(len(df_minority))
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled = df_downsampled.sample(frac=1).reset_index(drop=True)

In [7]:
df_train = df_downsampled.copy()
df_train['target'].value_counts(normalize=True)

0    0.5
1    0.5
Name: target, dtype: float64

##### **Reshaping data**

In [8]:
text_id = df_test['id'].tolist()
test_x = df_test['text'].tolist()

In [9]:
df_0 = df_train[df_train['target'] == 0]['text'].tolist()
df_1 = df_train[df_train['target'] == 1]['text'].tolist()

train_x = df_0 + df_1
train_y = np.append(np.zeros((len(df_0), 1)), np.ones((len(df_1), 1)), axis=0)

##### **Pre-processing text**

1. `process_tweet()`: cleans the text, tokenizes it into separate words, removes stopwords, and converts words to stems.

2. `build_freqs()`: this counts how often a word in the 'corpus' (the entire set of tweets) was associated with a positive for disaster label or a negative for disaster label, then builds the freqs dictionary, where each key is a `(word, label)` tuple, and the value is the count of its frequency within the corpus of tweets.

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet(tweet):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and word not in string.punctuation):
            stem_word = stemmer.stem(word)
            tweets_clean.append(stem_word)

    return tweets_clean

def build_freqs(tweets, ys):
    yslist = np.squeeze(ys).tolist()
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

In [12]:
process_tweet('This is my pre-processed text for exibition!')

['pre-process', 'text', 'exibit']

In [13]:
freqs = build_freqs(df_train['text'].tolist(), df_train['target'].tolist())

#### **Logistic Regression**

##### To process the modeling the probability of a discrete outcome given an input variable, I'm gonna use the logistic regression model and cover it its main topics here.

1. **Sigmoid function**: It maps the input `z` to a value that ranges between 0 and 1, and so it can be treated as a probability.

$$
\sigma(z) = \frac{1}{1 + e^{-z}}
$$

2. **Cost function and Gradient**: the cost function used for logistic regression is the average of the log loss across all training examples.


$$
J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)}\log(h(z(\theta)^{(i)})) + (1 - y^{(i)})\log(1 - h(z(\theta)^{(i)})) \right]
$$

> $m$ is the number of training examples.

> $y^{(i)}$ is the actual label of the i-th training example.

> $h(z(\theta)^{(i)})$ is the model's prediction for the i-th training example.

3. **Loss function for a single training example**

$$
\text{Loss}(\theta; x, y) = -\left[y^{(i)}\log(h(z(\theta)^{(i)})) + (1 - y^{(i)})\log(1 - h(z(\theta)^{(i)})) \right]
$$

4. **Weight Update**: to update the weight vector, I will apply gradient descent to iteratively improve the model's predictions.

$$
\theta_j = \theta_j - \alpha \frac{1}{m} \sum_{i=1}^{m} \left( h(z(\theta)^{(i)}) - y^{(i)} \right)x_j
$$

In [15]:
def sigmoid(z):
    h = 1 / (1 + np.exp(-z))
    return h

In [16]:
def gradient_descent(x, y, theta, alpha, num_iters):
    m, _ = x.shape

    for i in range(num_iters):
        z = np.dot(x, theta)
        h = sigmoid(z)

        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))
        theta = theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))

    J = float(J)
    return J, theta

In [17]:
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

tmp_J, tmp_theta = gradient_descent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 100)
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is 0.59916828.
The resulting vector of weights is [2.4e-07, 0.00023025, 0.00019856]


#### **Features Extractions**

##### Given a list of tweets, let's extract the features and store them into a matrix. I'm gonna extract two features.
  
  * [0] the number of positive for disaster words in a tweet.
  * [1] the number of negative for disaster words in a tweet.

In [18]:
def extract_features(tweet, freqs):
    words = process_tweet(tweet)

    x = np.zeros((1, 3))
    x[0, 0] = 1 # bias

    for word in words:
        x[0, 1] += freqs.get((word, 1.0), 0)
        x[0, 2] += freqs.get((word, 0.0), 0)

    assert(x.shape == (1, 3)), "Invalid feature shape."
    return x

In [19]:
X = np.zeros((len(train_x), 3))
Y = train_y

for i in range(len(train_x)):
    X[i, :]= extract_features(train_x[i], freqs)

J, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1000000)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

The cost after training is 0.52954405.
The resulting vector of weights is [-2.382e-05, 0.00638387, -0.0082585]


#### **Predictions**

##### Given a tweet, what is the probability of being a disaster description?

In [20]:
def predict_tweet(tweet, freqs, theta):
    x = extract_features(tweet,freqs)
    y_pred = sigmoid(np.dot(x,theta))
    return y_pred

In [21]:
for tweet in ['This is a misery!', 'I am bad', 'What a disaster!', 'Lovely day today! Nothing bad at all.', 'bomb just exploded at Vitnam!']:
    print('%s -> %f' % (tweet, predict_tweet(tweet, freqs, theta)))

This is a misery! -> 0.499525
I am bad -> 0.454609
What a disaster! -> 0.619429
Lovely day today! Nothing bad at all. -> 0.234993
bomb just exploded at Vitnam! -> 0.642627


In [22]:
preds = []
ids = []
for _, row in df_test.iterrows():
  tweet = row['text']
  ids.append(row['id'])
  if predict_tweet(tweet, freqs, theta) > 0.5:
    preds.append(1)
  else:
    preds.append(0)

In [23]:
df_submit = pd.DataFrame({'id': ids, 'target': preds})

In [24]:
df_submit.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [25]:
df_submit.to_csv('./data/result.csv', index=False)