# 1. Binary Classification on Text Data
## (a) Download the data

In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Get the training and testing data
df_train = pd.read_csv('./hw2-data/train.csv') # df = data frame
df_test  = pd.read_csv('./hw2-data/test.csv')

df_train.head() # limit by 5 rows 

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [117]:
# Check the number of training data
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [118]:
# Check the number of test data
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [119]:
# Check the percentage of the training tweets that are real disasters
x = df_train.target
print(x.value_counts(1))

0    0.57034
1    0.42966
Name: target, dtype: float64


## (b) Split the training data

In [120]:
from sklearn.model_selection import train_test_split

train, develop = train_test_split(df_train, train_size=0.7, test_size=0.3, random_state=100)

In [121]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5329 entries, 2345 to 5640
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5329 non-null   int64 
 1   keyword   5290 non-null   object
 2   location  3583 non-null   object
 3   text      5329 non-null   object
 4   target    5329 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 249.8+ KB


In [122]:
develop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2284 entries, 3999 to 3347
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2284 non-null   int64 
 1   keyword   2262 non-null   object
 2   location  1497 non-null   object
 3   text      2284 non-null   object
 4   target    2284 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 107.1+ KB


In [123]:
train.head()

Unnamed: 0,id,keyword,location,text,target
2345,3373,demolition,,General News Û¢åÊ'Demolition of houses on wat...,0
6112,8726,sinking,HOMRA.,In your eyes I see the hope\nI once knew.\nI'm...,0
5764,8226,riot,United Kingdom,'Without an ally near you can't use this skill...,0
3591,5130,fatal,Thane,11-Year-Old Boy Charged With Manslaughter of T...,1
1175,1692,bridge%20collapse,,2 Injured 1 missing in bridge collapse in cent...,1


## (c) Preprocess the data

In [124]:
import string

# Convert all the words to lowercase
train['text'] = train['text'].str.lower()
develop['text'] =develop['text'].str.lower()

# Strip URLs since URLs are irrelevant to disaster information
train['text'] = train['text'].str.replace(r'http\S+|www.\S+', ' ', regex=True)
develop['text'] = develop['text'].str.replace(r'http\S+|www.\S+', ' ', regex=True)

# Strip punctuation since punctuations are irrelevant to disaster information
train['text'] = train['text'].str.replace(f'[{string.punctuation}]', ' ', regex=True)
develop['text'] = develop['text'].str.replace(f'[{string.punctuation}]', ' ', regex=True)

# Strip \n since \n are irrelevant to disaster information
train['text'] = train['text'].str.replace(f'\n', ' ', regex=True)
develop['text'] = develop['text'].str.replace(f'\n', ' ', regex=True)

# Remove Non-English & numbers characters since we can't distinguish the severity of disaster based on numbers and non english words
non_english = str.maketrans("", "", "0123456789!@#$%^&*()_+-=[]{}\\|;:'\",./<>?`~¡¢£¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ")
train['text'] = train['text'].str.translate(non_english)
develop['text'] = develop['text'].str.translate(non_english)

In [125]:
train.head()

Unnamed: 0,id,keyword,location,text,target
2345,3373,demolition,,general news  demolition of houses on waterwa...,0
6112,8726,sinking,HOMRA.,in your eyes i see the hope i once knew i m s...,0
5764,8226,riot,United Kingdom,without an ally near you can t use this skill...,0
3591,5130,fatal,Thane,year old boy charged with manslaughter of tod...,1
1175,1692,bridge%20collapse,,injured missing in bridge collapse in centra...,1


In [126]:
# Implement Lemmatization
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

# For training dset
num_train = train.shape[0]
for i in range(0, num_train):
    # Find the text at each row
    sentence = train.iloc[i, 3]

    # Split the sentence and make the word list
    tokens = sentence.split()
    for j in range(0, len(tokens)):
        tokens[j] = lemmatizer.lemmatize(tokens[j], pos='v')

    # Make the sentence again and put it in the train data
    train.iloc[i, 3] = ' '.join(tokens)

# For development set
num_develop = develop.shape[0]
for i in range(0, num_develop):
    # Find the text at each row
    sentence = develop.iloc[i, 3]

    # Split the sentence and make the word list
    tokens = sentence.split()
    for j in range(0, len(tokens)):
        tokens[j] = lemmatizer.lemmatize(tokens[j], pos='v')

    # Make the sentence again and put it in the train data
    develop.iloc[i, 3] = ' '.join(tokens)

In [127]:
train.head()

Unnamed: 0,id,keyword,location,text,target
2345,3373,demolition,,general news  demolition of house on waterway...,0
6112,8726,sinking,HOMRA.,in your eye i see the hope i once know i m sin...,0
5764,8226,riot,United Kingdom,without an ally near you can t use this skill ...,0
3591,5130,fatal,Thane,year old boy charge with manslaughter of toddl...,1
1175,1692,bridge%20collapse,,injure miss in bridge collapse in central mexi...,1


## (d) Bag of words model

In [128]:
# Make the binary "bag of words" model
from sklearn.feature_extraction.text import CountVectorizer

# Implement the text list
train_text = []
for text in train['text']:
    train_text.append(text)

M = 10 # Include in the vocabulary words that appear in at least M different tweets
vectorizer = CountVectorizer(binary=True, min_df=M)
X = vectorizer.fit_transform(train_text)
bow = vectorizer.get_feature_names_out()
len_bow = len(bow)

print("BOW Size:", len(bow)) # Size of the "bag of words"

BOW Size: 1105


In [129]:
X_bow_train = X.toarray()

# print(bow)
print(X_bow_train.shape)

(5329, 1105)


In [130]:
# Find the array for the development set
X_bow_develop = np.zeros((num_develop, len_bow))

for i in range(0, num_develop):
    sentence = develop.iloc[i, 3]
    X_bow_develop[i, :] = [(word in sentence.split()) for word in bow]
  
print(X_bow_develop.shape)

(2284, 1105)


## (e) Logistic regression

### i-iii. Train a logistic regression model

In [131]:
# Implement logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Instantiate the model
log_no_reg = LogisticRegression(penalty=None, random_state=16)
lr_l1 = LogisticRegression(penalty='l1', solver='saga', max_iter=16)
lr_l2 = LogisticRegression(penalty='l2', random_state=16)

y_train = train['target']
y_develop = develop['target']
# print(y_train.shape)

# Fit the model with data
log_no_reg.fit(X_bow_train, y_train)
lr_l1.fit(X_bow_train, y_train)
lr_l2.fit(X_bow_train, y_train)

# Predict and calculate F1 score for training and development sets
f1_train_no_reg = f1_score(y_train, log_no_reg.predict(X_bow_train))
f1_dev_no_reg = f1_score(y_develop, log_no_reg.predict(X_bow_develop))

f1_train_l1 = f1_score(y_train, lr_l1.predict(X_bow_train))
f1_dev_l1 = f1_score(y_develop, lr_l1.predict(X_bow_develop))

f1_train_l2 = f1_score(y_train, lr_l2.predict(X_bow_train))
f1_dev_l2 = f1_score(y_develop, lr_l2.predict(X_bow_develop))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [132]:
print(f1_train_no_reg)
print(f1_train_l1)
print(f1_train_l2)

print(f1_dev_no_reg)
print(f1_dev_l1)
print(f1_dev_l2)
#The best classifier for the training set is no regulation and for the test(development) set is l2 with the highest f1 score in each sector. 


0.8677225072496098
0.825271927794492
0.8393186003683242
0.720682302771855
0.7412353923205341
0.7409040793825801


### v. Inspect the weight vector of the classifier with L1 regularization

In [133]:
#print(lr_l1.coef_.shape)

argmax = lr_l1.coef_.argmax()
argmin = lr_l1.coef_.argmin()

print("Word that has the maximum positive coefficient:", bow[argmax])
print("Its coefficient:", lr_l1.coef_[0, argmax])
print("Word that has the maximum negative coefficient:", bow[argmin])
print("Its coefficient:", lr_l1.coef_[0, argmin])

Word that has the maximum positive coefficient: wildfire
Its coefficient: 3.2754598432875275
Word that has the maximum negative coefficient: such
Its coefficient: -2.1602140271068238


In [134]:
print(argmax)
print(lr_l1.coef_)

1065
[[ 0.          0.14569677  0.21953019 ... -0.49492731  0.
  -0.38761053]]


## (f) Bernoulli Naive Bayes

In [135]:
# Calculate the probability P(y=0), P(y=1) at training data
n_y0 = sum(y_train == 0)
n_y1 = sum(y_train == 1)
p_y0 = sum(y_train == 0) / num_train
p_y1 = sum(y_train == 1) / num_train
 
# Calcaulte the P(x_j=1 | y=k) with Laplace smoothing
prob_ber = np.zeros((2, len(bow)))

for j in range(0, len(bow)):
    n_j0 = 0
    n_j1 = 0

    # Calculate the n_jk
    for i in range(0, num_train):
        if X_bow_train[i, j] == 1:
            if y_train.iloc[i] == 1:
                n_j1 = n_j1 + 1
            else:
                n_j0 = n_j0 + 1
        
    prob_ber[0, j] = (n_j0+1) / (n_y0+2)
    prob_ber[1, j] = (n_j1+1) / (n_y1+2)
                
print(prob_ber)

[[0.00198544 0.00066181 0.00463269 ... 0.0198544  0.00099272 0.0049636 ]
 [0.0034617  0.00908698 0.00519256 ... 0.00562527 0.00908698 0.00259628]]


In [136]:
# Calculate the conditional probability at the development set
# P(y|x_1, x_2, ..., x_n) ~ P(y) * P(x_1|y) * P(x_2|y) * .... * P(x_n|y)
y_develop_ber = np.array([])

for i in range(0, num_develop):
    p_y0_dev = p_y0
    p_y1_dev = p_y1

    for j in range(0, len(bow)):
        if X_bow_develop[i, j] == 1:
            p_y0_dev = p_y0_dev * prob_ber[0, j]
            p_y1_dev = p_y1_dev * prob_ber[1, j]
        
        if X_bow_develop[i, j] == 0:
            p_y0_dev = p_y0_dev * (1 - prob_ber[0, j])
            p_y1_dev = p_y1_dev * (1 - prob_ber[1, j])

    if p_y0_dev > p_y1_dev:
        y_develop_ber = np.append(y_develop_ber, [0])
    else:
        y_develop_ber = np.append(y_develop_ber, [1])

In [137]:
# Calculate the F1-score
f1_develop_ber = f1_score(y_develop, y_develop_ber)
print(f1_develop_ber)

0.743502824858757


## (h) N-gram model

In [138]:
# Get 2-grams
M = 10 # Include in the vocabulary words that appear in at least M different tweets
vectorizer_2 = CountVectorizer(binary=True, min_df=M, ngram_range=(2,2))
X = vectorizer_2.fit_transform(train_text)
vec_2gram = vectorizer_2.get_feature_names_out()

In [139]:
print(len(vec_2gram))
print(vec_2gram[0:10])

532
['abc news' 'about the' 'about to' 'about trap' 'affect by' 'after the'
 'after wave' 'air ambulance' 'airplane accident' 'airport get']


In [140]:
# Achieve 2-grams again with large min_df
M = 5 # Include in the vocabulary words that appear in at least M different tweets
vectorizer_2 = CountVectorizer(binary=True, min_df=M, ngram_range=(2,2))
X2 = vectorizer_2.fit_transform(train_text)
vec_2gram = vectorizer_2.get_feature_names_out()
# print(len(vec_2gram))

In [141]:
# Merge 2-grams and 1-grams
X_only_2gram_train = X2.toarray()
X_2gram_train = np.append(X_bow_train, X_only_2gram_train, axis=1)
# print(X_2gram_train.shape)

In [142]:
# Find the 2-gram array for the development set
X_only_2gram_develop = np.zeros((num_develop, len(vec_2gram)))

for i in range(0, num_develop):
    sentence = develop.iloc[i, 3]
    X_only_2gram_develop[i, :] = [(word in sentence) for word in vec_2gram]
    
X_2gram_develop = np.append(X_bow_develop, X_only_2gram_develop, axis=1)

In [143]:
# Implement logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Instantiate the model
lr_l2_2gram = LogisticRegression(penalty='l2', random_state=16)

# Fit the model with data
lr_l2_2gram.fit(X_2gram_train, y_train)

# Predict and calculate F1 score for training and development sets
f1_train_l2_2gram = f1_score(y_train, lr_l2_2gram.predict(X_2gram_train))
f1_dev_l2_2gram = f1_score(y_develop, lr_l2_2gram.predict(X_2gram_develop))

In [144]:
print(f1_train_l2_2gram)
print(f1_dev_l2_2gram)

0.8843258042436687
0.7441601779755284


In [145]:
# Implement a Bernoulli Naive Bayes classifier
# Calculate the probability P(y=0), P(y=1) at training data
n_y0 = sum(y_train == 0)
n_y1 = sum(y_train == 1)
p_y0 = sum(y_train == 0) / num_train
p_y1 = sum(y_train == 1) / num_train
 
# Calcaulte the P(x_j=1 | y=k) with Laplace smoothing
len_2gram = len(bow) + len(vec_2gram)
prob_ber_2gram = np.zeros((2, len_2gram))

for j in range(0, len_2gram):
    n_j0 = 0
    n_j1 = 0

    # Calculate the n_jk
    for i in range(0, num_train):
        if X_2gram_train[i, j] == 1:
            if y_train.iloc[i] == 1:
                n_j1 = n_j1 + 1
            else:
                n_j0 = n_j0 + 1
        
    prob_ber_2gram[0, j] = (n_j0+1) / (n_y0+2)
    prob_ber_2gram[1, j] = (n_j1+1) / (n_y1+2)
                
print(prob_ber_2gram)

[[0.00198544 0.00066181 0.00463269 ... 0.00264725 0.00959629 0.00033091]
 [0.0034617  0.00908698 0.00519256 ... 0.00086543 0.00259628 0.00865426]]


In [146]:
# Calculate the conditional probability at the development set
# P(y|x_1, x_2, ..., x_n) ~ P(y) * P(x_1|y) * P(x_2|y) * .... * P(x_n|y)
y_develop_ber_2gram = np.array([])

for i in range(0, num_develop):
    p_y0_dev = p_y0
    p_y1_dev = p_y1

    for j in range(0, len_2gram):
        if X_2gram_develop[i, j] == 1:
            p_y0_dev = p_y0_dev * prob_ber_2gram[0, j]
            p_y1_dev = p_y1_dev * prob_ber_2gram[1, j]
        
        if X_2gram_develop[i, j] == 0:
            p_y0_dev = p_y0_dev * (1 - prob_ber_2gram[0, j])
            p_y1_dev = p_y1_dev * (1 - prob_ber_2gram[1, j])

    if p_y0_dev > p_y1_dev:
        y_develop_ber_2gram = np.append(y_develop_ber_2gram, [0])
    else:
        y_develop_ber_2gram = np.append(y_develop_ber_2gram, [1])

In [147]:
# Calculate the F1-score
f1_develop_ber = f1_score(y_develop, y_develop_ber_2gram)
print(f1_develop_ber)

0.7284848484848484


## (i) Determine performance with the test set

In [148]:

# Convert all the words to lowercase
df_test['text'] = df_test['text'].str.lower()

# Strip URLs
df_test['text'] = df_test['text'].str.replace(r'http\S+|www.\S+', ' ', regex=True)

# Strip punctuation
df_test['text'] = df_test['text'].str.replace(f'[{string.punctuation}]', ' ', regex=True)

# Strip \n
df_test['text'] = df_test['text'].str.replace(f'\n', ' ', regex=True)

# Remove Non-English characters
non_english = str.maketrans("", "", "0123456789!@#$%^&*()_+-=[]{}\\|;:'\",./<>?`~¡¢£¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ")
df_test['text'] = df_test['text'].str.translate(non_english)


In [149]:
# Find the array for the development set
num_test = df_test.shape[0]
X_bow_test = np.zeros((num_test, len_bow))

for i in range(0, num_test):
    sentence = df_test.iloc[i, 3]
    X_bow_test[i, :] = [(word in sentence.split()) for word in bow]
  
print(X_bow_test.shape)

(3263, 1105)


In [150]:
y_test = lr_l1.predict(X_bow_test)

df_predict['target'] = pd.DataFrame(y_test).rename(columns={0:'target'})
df_predict['id'] = df_test['id']

print(y_test)
print(df_predict)
df_predict.to_csv('./hw2-data/test_predict.csv', index=False)

[1 1 1 ... 1 1 0]
      target     id
0          1      0
1          1      2
2          1      3
3          0      9
4          1     11
...      ...    ...
3258       1  10861
3259       1  10865
3260       1  10868
3261       1  10874
3262       0  10875

[3263 rows x 2 columns]
