In [63]:
import pandas as pd
import preprocessor as prep_t
import numpy as np
import spacy
import string
import contractions
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Goal
The aim of this notebook is to model a classifier using basinc machine learning methods and algorithms to compare its performance with more sophisticated methods/ model such as BERT

# Data set

**Load**

In [26]:
df = pd.read_csv('../data/train.csv')

In [27]:
df_mislabeled = df.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']

**Filter mislabeled data**

Relabel manually mislabeled data

In [28]:
df['target_relabeled'] = df['target'].copy() 
df.loc[df['text'] == 'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 'target_relabeled'] = 0
df.loc[df['text'] == 'Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife', 'target_relabeled'] = 0
df.loc[df['text'] == 'To fight bioterrorism sir.', 'target_relabeled'] = 0
df.loc[df['text'] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 'target_relabeled'] = 1
df.loc[df['text'] == 'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 'target_relabeled'] = 1
df.loc[df['text'] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 'target_relabeled'] = 0
df.loc[df['text'] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 'target_relabeled'] = 0
df.loc[df['text'] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 'target_relabeled'] = 1
df.loc[df['text'] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 'target_relabeled'] = 1
df.loc[df['text'] == "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 'target_relabeled'] = 0
df.loc[df['text'] == "wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 'target_relabeled'] = 0
df.loc[df['text'] == "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam", 'target_relabeled'] = 0
df.loc[df['text'] == "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!", 'target_relabeled'] = 0
df.loc[df['text'] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 'target_relabeled'] = 0
df.loc[df['text'] == "Caution: breathing may be hazardous to your health.", 'target_relabeled'] = 1
df.loc[df['text'] == "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????", 'target_relabeled'] = 0
df.loc[df['text'] == "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect", 'target_relabeled'] = 0
df.loc[df['text'] == "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time", 'target_relabeled'] = 0

In [29]:
df = df.drop(columns=['target'])
df = df.rename(columns={'target_relabeled':'target'})

# Preprocessing

## Clean text

Remove \x89

In [30]:
df['cleaned_text'] = df['text'].apply(lambda x:re.sub(r"(.*[a-zA-Z]?)\x89[^\W]*([a-zA-Z]?.*)", r"\1, \2", x))

Clean tweets using tweet preprocessing package: remove url, mention and emoji

In [31]:
prep_t.set_options(prep_t.OPT.URL, prep_t.OPT.EMOJI, prep_t.OPT.MENTION)

df['cleaned_text'] = df['cleaned_text'].map(prep_t.clean)

Remov consecutive punctuation

In [32]:
consequitivedots = re.compile(r'\.{2,}')
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: consequitivedots.sub(' ... ', x))

consequitivemarks = re.compile(r'\?{2,}')
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: consequitivedots.sub('???', x))

consequitivemarks = re.compile(r'\!{2,}')
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: consequitivedots.sub('!!!', x))

Remove non ascii string

In [33]:
def remove_non_ascii(text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r'', text)

In [34]:
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: remove_non_ascii(x))

Remove some special charcters and lower strings

In [35]:
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.replace('#', ' ').replace('@', " "))

df['cleaned_text'] = df['cleaned_text'].apply(lambda x: re.sub('[^\wÀ-ÿ\%\'\.\,\-]', ' ', x))

df['cleaned_text'] = df['cleaned_text'].str.lower()

Apply decontraction

In [36]:
df["cleaned_text"] = df["cleaned_text"].apply(lambda x: contractions.fix(x))

Remove spaces

In [37]:
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join(x.split()))
df['cleaned_text'] = df['cleaned_text'].str.strip()

Lemmatize the cleaned text

In [38]:
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

def lemmatization(text, nlp):
    doc = nlp(text)
    return " ".join([d.lemma_ for d in doc])

df['cleaned_text'] = df['cleaned_text'].apply(lambda x: lemmatization(x, nlp))



## Missing values

Fill missing values by an empty string

In [39]:
df = df.fillna('')

# Feature engineering

## Keywords

221 unique keywords

In [40]:
df.keyword.nunique()

222

Create a set of keywords that are frequently present in real disaster tweets

In [41]:
tmp = df.groupby('keyword')['target'].mean().reset_index()
tmp = tmp.sort_values('target', ascending = False)
true_keywords = tmp[tmp.target > 0.5].keyword.values

Create a binary column if the keyword blongs to the set or not

In [42]:
df['is_true_keyword'] =  df['keyword'].isin(true_keywords)

## Location

3341 unique locations

In [43]:
df.location.nunique()

3342

Clean some location values

In [44]:
df.loc[df.location =='M!$$!$$!PP!', 'location'] = "MISSISIPPI"

In [45]:
nlp = spacy.load("en_core_web_sm")



In [46]:
def extract_gpe(text):
    doc = nlp(text)
    
    for ent in doc.ents:
        if ent.label_ == 'GPE':
            return ent.text
    

In [47]:
df.loc[df.location.notnull(), 'gpe_extracted'] = df.loc[df.location.notnull(), 'location'].map(extract_gpe)

In [48]:
df.loc[df.location.isin(['USA', 'Wordwide']), 'gpe_extracted'] =  df.loc[df.location.isin(['USA', 'Wordwide']), 'location']

Create a set of locations that are frequently present in real disaster tweets

In [55]:
tmp = df.groupby('gpe_extracted')['target'].mean().reset_index()

tmp = tmp.sort_values('target', ascending = False)

true_gpe_extracted = tmp[tmp.target > 0.5].gpe_extracted.values

Create a binary column if the keyword blongs to the set or not

In [56]:
df['true_gpe_extracted'] = df['gpe_extracted'].isin(true_gpe_extracted)

# Meta data

Create meta data variables such as:
* Count of used words in a tweet
* Mean length of words
* Count of characters in a tweet
* Count of punctuation characters
* Count of hastags in a tweet
* Count of mentions in a tweet 
* Count of url links
* Length ouf url links

In [51]:
# word_count
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))

# mean_word_length
df['mean_word_length'] = df['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# characters counts 
df['char_count'] = df['text'].apply(lambda x: len(str(x)))

# punctuation count
df['punctuation_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# hastag count
df['hashtag_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

# mention count
df['mention_count'] = df['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

# url count
df['url_count'] = df['text'].apply(lambda x: len(prep_t.parse(x).urls) if np.all(pd.notnull(prep_t.parse(x).urls)) else 0)

In [52]:
def length_url(text):
    parsed_tweet = prep_t.parse(text)
    c= 0
    if np.any(pd.notnull(parsed_tweet.urls)):
        for i in range(len(parsed_tweet.urls)):
            c = c+ len(parsed_tweet.urls[i].match)
    else:
        pass
    
    return c

In [53]:
# url length
df['len_url'] = df['text'].apply(lambda x: length_url(x))

In [54]:
df['len_cleaned_text'] = df['cleaned_text'].map(len)

# Machine learning

Select columns

In [59]:
df_train = df[['target', 'is_true_keyword',
       'gpe_extracted', 'word_count', 'mean_word_length', 'char_count',
       'punctuation_count', 'hashtag_count', 'mention_count', 'url_count',
       'len_url', 'cleaned_text', 'len_cleaned_text', 'true_gpe_extracted']].copy()

In [60]:
df_train['is_true_keyword'] = df_train['is_true_keyword'].map(int) 
df_train['true_gpe_extracted'] =df_train['true_gpe_extracted'].map(int)

load stopwords

In [61]:
nltk.download("stopwords")
stop = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/sang-
[nltk_data]     hoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## TF-IDF

In [64]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.33, min_df = 0.005, stop_words=stop)

X = vectorizer.fit_transform(df_train.cleaned_text.values)
X = np.concatenate((X.toarray(), df_train.drop(columns=['target', 'cleaned_text', 'gpe_extracted']).to_numpy()), axis = 1)

y = df_train.target.to_numpy()

In [65]:
X.shape

(7613, 345)

**Cross validation**

Calculate f1-score using cross validation to detect if the model is overfitted or not (calculate mean and standard deviation f1-score over each fold )

In [66]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=7)
scores = cross_val_score(clf, X, y, cv=4, scoring='f1')
scores

array([0.6       , 0.67762688, 0.68139224, 0.69308357])

In [67]:
print("F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

F1: 0.66 (+/- 0.07)


## Fit

Here the parameters are arbitrary. It is possible to do the hyper parameter tuning using grid search, bayesian optimization, ...

In [68]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=7)

clf.fit(X,y)

RandomForestClassifier(max_depth=7, n_jobs=-1)

# Prediction

**Preparation**

Test set

In [69]:
df_test = pd.read_csv('../data/test.csv')

In [70]:
df_test['is_true_keyword'] =  df_test['keyword'].isin(true_keywords)

nlp = spacy.load("en_core_web_sm")

df_test.loc[df_test.location.notnull(), 'gpe_extracted'] = df_test.loc[df_test.location.notnull(), 'location'].map(extract_gpe)

df_test.loc[df_test.location.isin(['USA', 'Wordwide']), 'gpe_extracted'] =  df_test.loc[df_test.location.isin(['USA', 'Wordwide']), 'location']

df_test['word_count'] = df_test['text'].apply(lambda x: len(str(x).split()))

df_test['mean_word_length'] = df_test['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

df_test['char_count'] = df_test['text'].apply(lambda x: len(str(x)))

df_test['punctuation_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

df_test['hashtag_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

df_test['mention_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

df_test['url_count'] = df_test['text'].apply(lambda x: len(prep_t.parse(x).urls) if np.all(pd.notnull(prep_t.parse(x).urls)) else 0)

df_test['len_url'] = df_test['text'].apply(lambda x: length_url(x))

df_test = df_test.fillna('')

df_test['cleaned_text']=df_test['text'].apply(lambda x:re.sub(r"(.*[a-zA-Z]?)\x89[^\W]*([a-zA-Z]?.*)", r"\1, \2", x))

prep_t.set_options(prep_t.OPT.URL, prep_t.OPT.EMOJI, prep_t.OPT.MENTION)

df_test['cleaned_text'] = df_test['cleaned_text'].map(prep_t.clean)

consequitivedots = re.compile(r'\.{2,}')
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: consequitivedots.sub(' ... ', x))

consequitivemarks = re.compile(r'\?{2,}')
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: consequitivedots.sub('???', x))

consequitivemarks = re.compile(r'\!{2,}')
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: consequitivedots.sub('!!!', x))

df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: remove_non_ascii(x))

df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: x.replace('#', ' ').replace('@', " "))

df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: re.sub('[^\wÀ-ÿ\%\'\.\,\-]', ' ', x))

df_test['cleaned_text'] = df_test['cleaned_text'].str.lower()

df_test["cleaned_text"] = df_test["cleaned_text"].apply(lambda x: contractions.fix(x))

df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: ' '.join(x.split()))

df_test['len_cleaned_text'] = df_test['cleaned_text'].map(len)

df_test['true_gpe_extracted'] = df_test['gpe_extracted'].isin(true_gpe_extracted)

df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: lemmatization(x, nlp))



In [71]:
df_test = df_test[['is_true_keyword',
       'gpe_extracted', 'word_count', 'mean_word_length', 'char_count',
       'punctuation_count', 'hashtag_count', 'mention_count', 'url_count',
       'len_url', 'cleaned_text', 'len_cleaned_text', 'true_gpe_extracted']].copy()

In [72]:
df_test['is_true_keyword'] = df_test['is_true_keyword'].map(int) 

In [73]:
df_test['true_gpe_extracted'] =df_test['true_gpe_extracted'].map(int)

## Predict

In [74]:
X = vectorizer.transform(df_test.cleaned_text.values)

In [75]:
X = np.concatenate((X.toarray(), df_test.drop(columns=['cleaned_text', 'gpe_extracted']).to_numpy()), axis = 1)

In [76]:
predictions = clf.predict(X)

## Save predictions

In [477]:
#output = pd.read_csv('./../data/test.csv')
#
#output = output[['id']]
#
#output['target'] = predictions
#output[['id', 'target']].to_csv('submissions_rf.csv', sep=",", index = False)