In [1]:
import pandas as pd
import string
import re
from nltk.tokenize import word_tokenize



In [2]:
df = pd.read_csv('train.csv', index_col='id')

In [3]:
df['text'] = df['text'].apply(lambda x: x.lower())
def find_hashtags(text):
    return tuple(re.findall(r'#[\w]+', text))
def find_tagged(text):
    return tuple(re.findall(r'@[\w]+', text))
def find_urls(text):
    return tuple(re.findall(r'https?://(?:www\.)?\S+|www\.\S+', text))
df['hashtag'] = df['text'].apply(lambda x: find_hashtags(x))
df['tagged'] = df['text'].apply(lambda x: find_tagged(x))
df['url'] = df['text'].apply(lambda x: find_urls(x))
df['text_cleaned'] = df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
df['tokens'] = df['text_cleaned'].apply(lambda x: tuple(word_tokenize(x)))

In [4]:
def get_ngrams(tokens, n):
    return tuple([tokens[i: i + n] for i in range(len(tokens) - n)])

In [5]:
df['bigrams'] = df['tokens'].apply(lambda x: get_ngrams(x, 2))
df['trigrams'] = df['tokens'].apply(lambda x: get_ngrams(x, 3))

In [6]:
df_disaster = df[df['target'] == 1]
df_ordinary = df[df['target'] == 0]

In [7]:
def get_top_values(dataframe, column, num_values=50):
    return pd.value_counts(
        list(sum(dataframe[column].values, ()))
    )[:num_values]
top_hashtags_disaster = get_top_values(df_disaster, 'hashtag')
top_hashtags_ordinary = get_top_values(df_ordinary, 'hashtag')
top_tokens_disaster = get_top_values(df_disaster, 'tokens')
top_tokens_ordinary = get_top_values(df_ordinary, 'tokens')
top_tokens_disaster = [token for token in top_tokens_disaster.keys() if token not in top_tokens_ordinary.keys()]
top_tokens_ordinary = [token for token in top_tokens_ordinary.keys() if token not in top_tokens_disaster]

In [8]:
df['has_disaster_hashtag'] = df['hashtag'].apply(lambda x: bool(sum([elt in top_hashtags_disaster for elt in x])))
df['has_ordinary_hashtag'] = df['hashtag'].apply(lambda x: bool(sum([elt in top_hashtags_ordinary for elt in x])))
df['has_disaster_token'] = df['tokens'].apply(lambda x: bool(sum([elt in top_tokens_disaster for elt in x])))
df['has_ordinary_token'] = df['tokens'].apply(lambda x: bool(sum([elt in top_tokens_ordinary for elt in x])))

In [9]:
df.groupby(['has_ordinary_hashtag', 'has_disaster_hashtag']).mean()

  df.groupby(['has_ordinary_hashtag', 'has_disaster_hashtag']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,target,has_disaster_token,has_ordinary_token
has_ordinary_hashtag,has_disaster_hashtag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,False,0.420306,0.278928,0.914209
False,True,0.941441,0.423423,0.927928
True,False,0.098958,0.135417,0.854167
True,True,0.537415,0.530612,0.870748


In [10]:
df['has_only_disaster_hashtag'] = df['has_disaster_hashtag'] & (df['has_ordinary_hashtag'] == False)
df['has_only_ordinary_hashtag'] = df['has_ordinary_hashtag'] & (df['has_disaster_hashtag'] == False)

In [11]:
df

Unnamed: 0_level_0,keyword,location,text,target,hashtag,tagged,url,text_cleaned,tokens,bigrams,trigrams,has_disaster_hashtag,has_ordinary_hashtag,has_disaster_token,has_ordinary_token,has_only_disaster_hashtag,has_only_ordinary_hashtag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,,,our deeds are the reason of this #earthquake m...,1,"(#earthquake,)",(),(),our deeds are the reason of this earthquake ma...,"(our, deeds, are, the, reason, of, this, earth...","((our, deeds), (deeds, are), (are, the), (the,...","((our, deeds, are), (deeds, are, the), (are, t...",True,False,False,True,True,False
4,,,forest fire near la ronge sask. canada,1,(),(),(),forest fire near la ronge sask canada,"(forest, fire, near, la, ronge, sask, canada)","((forest, fire), (fire, near), (near, la), (la...","((forest, fire, near), (fire, near, la), (near...",False,False,True,False,False,False
5,,,all residents asked to 'shelter in place' are ...,1,(),(),(),all residents asked to shelter in place are be...,"(all, residents, asked, to, shelter, in, place...","((all, residents), (residents, asked), (asked,...","((all, residents, asked), (residents, asked, t...",False,False,False,True,False,False
6,,,"13,000 people receive #wildfires evacuation or...",1,"(#wildfires,)",(),(),13000 people receive wildfires evacuation orde...,"(13000, people, receive, wildfires, evacuation...","((13000, people), (people, receive), (receive,...","((13000, people, receive), (people, receive, w...",True,False,True,True,True,False
7,,,just got sent this photo from ruby #alaska as ...,1,"(#alaska, #wildfires)",(),(),just got sent this photo from ruby alaska as s...,"(just, got, sent, this, photo, from, ruby, ala...","((just, got), (got, sent), (sent, this), (this...","((just, got, sent), (got, sent, this), (sent, ...",True,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10869,,,two giant cranes holding a bridge collapse int...,1,(),(),"(http://t.co/stfmbbzfb5,)",two giant cranes holding a bridge collapse int...,"(two, giant, cranes, holding, a, bridge, colla...","((two, giant), (giant, cranes), (cranes, holdi...","((two, giant, cranes), (giant, cranes, holding...",False,False,False,True,False,False
10870,,,@aria_ahrary @thetawniest the out of control w...,1,(),"(@aria_ahrary, @thetawniest)",(),ariaahrary thetawniest the out of control wild...,"(ariaahrary, thetawniest, the, out, of, contro...","((ariaahrary, thetawniest), (thetawniest, the)...","((ariaahrary, thetawniest, the), (thetawniest,...",False,False,True,True,False,False
10871,,,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1,(),(),"(http://t.co/zdtoyd8ebj,)",m194 0104 utc5km s of volcano hawaii httptcozd...,"(m194, 0104, utc5km, s, of, volcano, hawaii, h...","((m194, 0104), (0104, utc5km), (utc5km, s), (s...","((m194, 0104, utc5km), (0104, utc5km, s), (utc...",False,False,False,True,False,False
10872,,,police investigating after an e-bike collided ...,1,(),(),(),police investigating after an ebike collided w...,"(police, investigating, after, an, ebike, coll...","((police, investigating), (investigating, afte...","((police, investigating, after), (investigatin...",False,False,True,True,False,False


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tf_idf(df, column):
    v = TfidfVectorizer()
    x = v.fit_transform(df[column])
    return pd.DataFrame(x.toarray(), columns=v.get_feature_names_out())

In [13]:
df = pd.concat((df.reset_index(drop=True), tf_idf(df, 'text_cleaned').reset_index(drop=True).add_suffix('_tfidf')), axis=1)

In [14]:
tf_idf_features = df[[word for word in top_tokens_ordinary + top_tokens_disaster if word in df.columns]].columns

In [15]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression(random_state=16)

y_train = df['target']
X_train = df[['has_only_disaster_hashtag', 'has_only_ordinary_hashtag'] + list(tf_idf_features)]

# fit the model with data
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_train)

In [16]:
df[['has_only_disaster_hashtag', 'has_only_ordinary_hashtag', 'text_cleaned'] + list(tf_idf_features)]

Unnamed: 0,has_only_disaster_hashtag,has_only_ordinary_hashtag,text_cleaned
0,True,False,our deeds are the reason of this earthquake ma...
1,False,False,forest fire near la ronge sask canada
2,False,False,all residents asked to shelter in place are be...
3,True,False,13000 people receive wildfires evacuation orde...
4,True,False,just got sent this photo from ruby alaska as s...
...,...,...,...
7608,False,False,two giant cranes holding a bridge collapse int...
7609,False,False,ariaahrary thetawniest the out of control wild...
7610,False,False,m194 0104 utc5km s of volcano hawaii httptcozd...
7611,False,False,police investigating after an ebike collided w...


In [17]:
df['target']

0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64

In [18]:
from sklearn.metrics import classification_report
target_names = ['ordinary', 'disaster']
print(classification_report(y_train, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    ordinary       0.59      1.00      0.74      4342
    disaster       0.94      0.06      0.12      3271

    accuracy                           0.60      7613
   macro avg       0.76      0.53      0.43      7613
weighted avg       0.74      0.60      0.47      7613

