In [27]:
import re

import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [21]:
# Load the labelled training tweets from disk and preview the first rows
df_dataset = pd.read_csv(filepath_or_buffer='train.csv')

df_dataset.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [22]:
# How many rows fall into each target class?
df_dataset['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [17]:
# Clean text
pattern_url = re.compile(r'https?://\S+|www\.\S+')
# NOTE(review): compiled here but not applied by clean_text below.
pattern_html = re.compile(r'<.*?>')

def clean_text(text):
    """Remove URLs and '#' characters from a text and trim its ends.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The text with every match of ``pattern_url`` deleted, every '#'
        character deleted, and leading/trailing whitespace removed.
        Whitespace inside the text is left as-is, so a URL removed from
        the middle of a sentence leaves the spaces that surrounded it.
    """
    cleaned_text = pattern_url.sub(r'', text)
    cleaned_text = cleaned_text.replace('#', '')

    # Strip last: deleting a trailing URL or a leading '#' exposes
    # whitespace that an up-front strip() would not have removed.
    return cleaned_text.strip()

# Replace the raw tweet text with its cleaned version, element by element
df_dataset['text'] = df_dataset['text'].map(clean_text)

In [26]:
# Count how many tweets contain at least one character matched by the
# emoji character class below.
# The regex is bound to its own name only, so the HTML pattern name is not
# overwritten by an emoji pattern.
# NOTE(review): the last range (U+24C2 to U+1F251) is very wide and also
# covers many non-emoji code points (e.g. CJK ideographs) - confirm intended.
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)

# Iterate the text column directly instead of materialising every row
# with iterrows(); the result is a plain Python int.
counter = sum(1 for text in df_dataset['text'] if emoji_pattern.search(text))

counter

0

In [8]:
# Keep just the model input (text) and the label (target)
df_train = df_dataset.loc[:, ['text', 'target']]

In [9]:
# Split dataset
# 80% train / 20% test, stratified on the target so both parts keep the
# class proportions. The seed is fixed so that the split - and every score
# computed from it - is the same on each run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    df_train.text,
    df_train.target,
    train_size=0.80,
    stratify=df_train.target,
    random_state=RANDOM_STATE,
)

In [10]:
# Declare model
# Step 1: TF-IDF features over 1- to 3-grams with English stop words removed.
tfidf_step = TfidfVectorizer(stop_words='english', ngram_range=(1,3), min_df=5, max_df=0.99)
# Step 2: multinomial Naive Bayes classifier on those features.
clf_step = MultinomialNB()

# Disabled alternatives, kept for reference:
#   ('svd', TruncatedSVD(n_components=400))      - extra step after 'tfidf'
#   ('clf', LinearSVC(class_weight='balanced'))  - in place of MultinomialNB
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('clf', clf_step),
])

In [11]:
%time scores = cross_val_score(pipeline, df_train['text'], df_train["target"], cv=3, scoring="f1")
scores

CPU times: user 617 ms, sys: 20.3 ms, total: 637 ms
Wall time: 636 ms


array([0.62266227, 0.55911718, 0.68630339])

In [12]:
# Train the model
%time pipeline.fit(X=X_train, y=y_train)

CPU times: user 215 ms, sys: 9.46 ms, total: 225 ms
Wall time: 226 ms


Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.99, min_df=5, ngram_range=(1, 3),
                                 stop_words='english')),
                ('clf', MultinomialNB())])

In [13]:
# Ground-truth labels of the held-out split as a NumPy array ...
y_true = y_test.to_numpy()
# ... and the fitted pipeline's predictions for the same rows
y_pred = pipeline.predict(X=X_test)

In [14]:
# Compute the evaluation metrics on the held-out split, then report them.
# Scores depend on the particular train/test split; the cell output shows
# the values of the latest run.
conf_mat = confusion_matrix(y_true=y_true, y_pred=y_pred)
acc = accuracy_score(y_true=y_true, y_pred=y_pred)
f1 = f1_score(y_true=y_true, y_pred=y_pred)

print(f'Confusion matrix:\n{conf_mat}')
print()
print(f'accuracy = {100 * acc:.2f}%')
print(f'f1_score = {100 * f1:.2f}%')

Confusion matrix:
[[791  78]
 [253 401]]

accuracy = 78.27%
f1_score = 70.79%


In [12]:
# Submit result
# Load the unlabelled competition file
df_submission_in = pd.read_csv(filepath_or_buffer='test.csv')

# Clean it with the same function used on the training text
x_sub = df_submission_in['text'].map(clean_text)

# Predict a target for every cleaned text
y_pred_sub = pipeline.predict(X=x_sub)

In [13]:
# Take the sample submission as a template, overwrite its 'target' column
# with the predictions, and write the result without the index column.
df_submission_out = pd.read_csv('sample_submission.csv').assign(target=y_pred_sub)

df_submission_out.to_csv('submission.csv', index=False)