# Imports

In [1]:
import pandas as pd
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from lazypredict.Supervised import LazyClassifier


%load_ext autoreload
%autoreload 2

#  Read Data

In [2]:
train = pd.read_csv("raw_data/train.csv")
test = pd.read_csv("raw_data/test.csv")


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train[pd.isna(train.text)]

Unnamed: 0,id,keyword,location,text,target


In [5]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Preprocess data

In [6]:
def preprocessing(sentence):
    # remove whitespace
    [text.strip() for text in sentence]
    
    # lowercase characters
    sentence = sentence.lower()
    
    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # remove punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') 
    
    #tokenize
    sentence_tokens = word_tokenize(sentence)
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    sentence_tokens = [w for w in sentence_tokens if not w in stop_words] 
    
    # Lemmatizing the verbs
    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in sentence_tokens]

    # 2 - Lemmatizing the nouns
    sentence_preprocessed = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
              for word in verb_lemmatized]
    
    return ' '.join(word for word in sentence_preprocessed)

In [7]:
train["clean_text"] = train.text.apply(preprocessing)

In [8]:
train.clean_text.head()

0           deed reason earthquake may allah forgive u
1                forest fire near la ronge sask canada
2    resident ask shelter place notify officer evac...
3    people receive wildfire evacuation order calif...
4    get send photo ruby alaska smoke wildfire pour...
Name: clean_text, dtype: object

In [9]:
# Feature/Target
X = train["clean_text"]
y = train["target"]

# Baseline Model

In [17]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=80)

In [18]:
# Pipeline vectorizer + Naive Bayes
pipeline_nb = make_pipeline(
    TfidfVectorizer(), 
    MultinomialNB()
)

# Cross-validation
cv_results = cross_validate(pipeline_nb, X_train, y_train, cv = 5, scoring = ["recall"])
average_recall = cv_results["test_recall"].mean()
np.round(average_recall,2)

0.64

The result of the baseline model - a simple multinomial naiva bayes algorithm based on tfidf vectorizer - returns a result of 0.64

In [19]:
pipeline_nb.fit(X_train, y_train)

In [20]:
pipeline_nb.score(X_test,y_test)

0.8021015761821366

In [21]:
pipeline_nb.get_params()

{'memory': None,
 'steps': [('tfidfvectorizer', TfidfVectorizer()),
  ('multinomialnb', MultinomialNB())],
 'verbose': False,
 'tfidfvectorizer': TfidfVectorizer(),
 'multinomialnb': MultinomialNB(),
 'tfidfvectorizer__analyzer': 'word',
 'tfidfvectorizer__binary': False,
 'tfidfvectorizer__decode_error': 'strict',
 'tfidfvectorizer__dtype': numpy.float64,
 'tfidfvectorizer__encoding': 'utf-8',
 'tfidfvectorizer__input': 'content',
 'tfidfvectorizer__lowercase': True,
 'tfidfvectorizer__max_df': 1.0,
 'tfidfvectorizer__max_features': None,
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__ngram_range': (1, 1),
 'tfidfvectorizer__norm': 'l2',
 'tfidfvectorizer__preprocessor': None,
 'tfidfvectorizer__smooth_idf': True,
 'tfidfvectorizer__stop_words': None,
 'tfidfvectorizer__strip_accents': None,
 'tfidfvectorizer__sublinear_tf': False,
 'tfidfvectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidfvectorizer__tokenizer': None,
 'tfidfvectorizer__use_idf': True,
 'tfidfvectorizer__vocab

# Hyperparameter Tuning

In [33]:
# Set parameters to search
parameters = {
    'tfidfvectorizer__min_df':(0, 0.5, 1.0, 1.5, 2.0, 2.5),
    'tfidfvectorizer__max_df':(0, 0.5, 1.0, 1.5, 2.0, 2.5),
    'tfidfvectorizer__ngram_range': ((1,1), (2,2), (1,2)),
    'multinomialnb__alpha': (0.1,0.5,1,1.5, 2, 5)}

# Perform grid search on pipeline
grid_search = GridSearchCV(pipeline_nb, parameters,
                           cv = 5, n_jobs=-1, verbose=1, scoring = "f1")

grid_search.fit(X_train, y_train)

# Best score
print(f"Best Score = {grid_search.best_score_}")

# Best params
print(f"Best params = {grid_search.best_params_}")

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best Score = 0.7332854288270495
Best params = {'multinomialnb__alpha': 0.1, 'tfidfvectorizer__max_df': 0.5, 'tfidfvectorizer__min_df': 0, 'tfidfvectorizer__ngram_range': (1, 2)}


In [35]:
grid_search.score(X_test, y_test)

0.7347391786903441

In [38]:
test["clean_text"] = test.text.apply(preprocessing)

In [39]:
test["target"] = grid_search.predict(test["clean_text"])

In [58]:
res = pd.DataFrame(test[["id", "target"]])
res = res.set_index("id", drop=True)

In [59]:
print(res)

       target
id           
0           1
2           0
3           1
9           1
11          1
...       ...
10861       1
10865       1
10868       1
10874       1
10875       1

[3263 rows x 1 columns]


In [60]:
res.to_csv("result.csv")

# lazypredict

In [15]:
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(min_df = 0.005)

# Training it on the texts
weighted_words = pd.DataFrame(tf_idf_vectorizer.fit_transform(train.clean_text).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

weighted_words

Unnamed: 0,accident,affect,air,also,always,ambulance,amp,another,area,army,...,wound,wreck,wreckage,year,youre,youtube,yr,ûª,ûªs,ûò
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7609,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7610,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7611,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [16]:
X_train, X_test, y_train, y_test = train_test_split(weighted_words, y, test_size=.5,random_state =123)

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)

models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

100%|███████████████████████████████████████████| 29/29 [00:37<00:00,  1.31s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
NuSVC                              0.77               0.75     0.75      0.76   
SVC                                0.77               0.75     0.75      0.76   
BernoulliNB                        0.76               0.74     0.74      0.75   
ExtraTreesClassifier               0.75               0.74     0.74      0.75   
LogisticRegression                 0.75               0.74     0.74      0.75   
LinearDiscriminantAnalysis         0.75               0.74     0.74      0.75   
RidgeClassifierCV                  0.75               0.74     0.74      0.75   
RidgeClassifier                    0.75               0.74     0.74      0.75   
CalibratedClassifierCV             0.76               0.74     0.74      0.75   
NearestCentroid                    0.75               0.74     0.74      0.75   
XGBClassifier               




Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


# Word Embedding