# Real or Not? NLP with Disaster Tweets

In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

##### Data Exploration

In [2]:
train[train["target"] == 0]["text"].values[5]

'this is ridiculous....'

In [3]:
train[train["target"] == 1]["text"].values[4]

'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '

In [4]:
import seaborn as sns
sns.countplot(x= 'target',data = train)

<matplotlib.axes._subplots.AxesSubplot at 0x1a1b273750>

In [5]:
len(train)

7613

In [6]:
 train.isnull().sum(axis=0)

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
len(test)

3263

In [8]:
test.isnull().sum(axis=0)

id             0
keyword       26
location    1105
text           0
dtype: int64

After looking at the NAs we see that the "location" has around 33% of missing values in both datasets
so in our opinion it is better to drop this column
Moreover, the keyword column has less than 1% of missing values so we will keep it in case it will be useful in the future

In [9]:
del train['location']

In [10]:
del test['location']

In [11]:
# How many http words has this text?
train.loc[train['text'].str.contains('http')].target.value_counts()

1    2172
0    1799
Name: target, dtype: int64

Removing HTTP links

In [12]:
import re
    
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def remove_links(text):
    no_link= pattern.sub('',text)
    return no_link

print(train['text'].iloc[33])
print(remove_links(train['text'].iloc[33]))

#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi
#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. 


In [13]:
train['text'] = train['text'].apply(lambda x: remove_links(x))

In [20]:
test['text'] = test['text'].apply(lambda x: remove_links(x))

Removing usernames (@)

In [23]:
pattern = re.compile('@[^\s]+')

def remove_username(text):
    no_username= pattern.sub('',text)
    return no_username

print(train['text'].iloc[65])
print(remove_links(train['text'].iloc[65]))

@nxwestmidlands huge fire at Wholesale markets ablaze 
 huge fire at Wholesale markets ablaze 


In [24]:
train['text'] = train['text'].apply(lambda x: remove_username(x))

In [25]:
test['text'] = test['text'].apply(lambda x: remove_username(x))

### The dataset has a lot of unstructured tweets which should be "cleaned" in order to make an NLP model
Removing punctuations, stop words will save more computational power  and give us a higher accuracy since they are not related to sentiments.


1- Expanding shortened words (don't to do not)

In [14]:
## Tommy, the below improting takes time becuase we are loading twitter 25 which is a specific number, 
## I google but got 2B tweetss for twitter-25 and twitter-100, so no lo se
## DO NOT REMOVE STOP WORDS, DONT IS IMPORTANT FOR NEGATION, filter?
## WHAT TO DO WITH WEBSITES

create new feature for link yes or no, deletin links, dont use keyword

In [15]:
from pycontractions import Contractions
import gensim.downloader as api

model = api.load("glove-twitter-25")
cont = Contractions(kv_model=model)
cont.load_models()


def expand_contractions(text):
    text = list(cont.expand_texts([text], precise=True))[0]
    return text

print(train['text'].iloc[7])
print(expand_contractions(train['text'].iloc[7]))

I'm on top of the hill and I can see a fire in the woods...
I am on top of the hill and I can see a fire in the woods...


In [44]:
train['text'] = train['text'].apply(expand_contractions)

In [45]:
test['text'] = test['text'].apply(expand_contractions)

2- Removal of punctuations

In [16]:
# Our dataset is related to tweets so we will have a lot of @ and # 
from textblob import TextBlob

def punctuations(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

print(train['text'].iloc[3])
print(punctuations(expand_contractions(train['text'].iloc[3])))

13,000 people receive #wildfires evacuation orders in California 
13,000 people receive wildfires evacuation orders in California


In [46]:
train['text'] = train['text'].apply(punctuations)

In [47]:
test['text'] = test['text'].apply(punctuations)

3- Removal of accented characters (café to cafe)

In [26]:
# in this dataset we do not have accented characters, this function will be used in case we are analyzing tweets 
# from France or any country that has accented characters in their languages
import unidecode

def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text

print(train['text'].iloc[3])
print((remove_accented_chars(train['text'].iloc[3])))

13,000 people receive #wildfires evacuation orders in California 
13,000 people receive wildfires evacuation orders in California


In [48]:
train['text'] = train['text'].apply(remove_accented_chars)

In [49]:
test['text'] = test['text'].apply(remove_accented_chars)

4- Removal of commonly used words and stopwords

In [18]:
#Option1 

from nltk.corpus import stopwords
import re

def common_stopwords(tweet):
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)
    tweet = tweet.lower()
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    return clean_mess

print(train['text'].iloc[12])
print(common_stopwords(train['text'].iloc[12]))


#raining #flooding #Florida #TampaBay #Tampa 18 or 19 days. I've lost count 
['raining', 'flooding', 'florida', 'tampabay', 'tampa', 'days', 'lost', 'count']


In [42]:
#Option2
# We decided not to import the stopwords from nltk.corpus since we wanted to keep the words that negate like no,not,,

### list of stop words that need to be removed
stop_words = ['as', 'in', 'of', 'is', 'are', 'were', 'was', 'it', 'for', 'to', 'from', 'into', 'onto', 
              'this', 'that', 'being', 'the','those', 'these', 'such', 'a', 'an']

from nltk import word_tokenize
import re

def remove_stopwords(tweet):
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)
    tweet = tweet.lower()
    tokenized_words = word_tokenize(tweet)
    temp = [word for word in tokenized_words if word not in stop_words]
#     temp = ' '.join(temp)
    return temp

print(train['text'].iloc[12])
print(remove_stopwords(train['text'].iloc[12]))


#raining #flooding #Florida #TampaBay #Tampa 18 or 19 days. I have lost count 
['raining', 'flooding', 'florida', 'tampabay', 'tampa', 'or', 'days', 'i', 'have', 'lost', 'count']


Choose between option A and B

choosing option B

In [50]:
train['text'] = train['text'].apply(remove_stopwords)

In [51]:
test['text'] = test['text'].apply(remove_stopwords)

In [69]:
#Create our dictionary 
uniqueWordFrequents = {}
for tweet in train.text:
    for word in tweet.split():
        if(word in uniqueWordFrequents.keys()):
            uniqueWordFrequents[word] += 1
        else:
            uniqueWordFrequents[word] = 1
            
#Convert dictionary to dataFrame
uniqueWordFrequents = pd.DataFrame.from_dict(uniqueWordFrequents,orient='index',columns=['Word Frequent'])
uniqueWordFrequents.sort_values(by=['Word Frequent'], inplace=True, ascending=False)
uniqueWordFrequents.head(10)

Unnamed: 0,Word Frequent
the,2575
a,1845
to,1805
in,1757
of,1722
and,1302
I,1197
for,820
is,814
on,773


In [70]:
uniqueWordFrequents = uniqueWordFrequents[uniqueWordFrequents['Word Frequent'] >= 20]
print(uniqueWordFrequents.shape)
uniqueWordFrequents

(720, 1)


Unnamed: 0,Word Frequent
the,2575
a,1845
to,1805
in,1757
of,1722
...,...
care,20
side,20
crisis,20
Breaking,20


In [66]:
train

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,Forest fire near La Ronge Sask. Canada,1
2,5,,All residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
7608,10869,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,The out of control wild fires in California ...,1
7610,10871,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii.,1
7611,10872,,Police investigating after an e-bike collided ...,1


5- Word Normalization

In [43]:
from nltk.stem.wordnet import WordNetLemmatizer

def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return normalized_tweet
    
print((train['text'].iloc[12]))   
print(normalization(remove_stopwords((punctuations(expand_contractions(train['text'].iloc[12]))))))

#raining #flooding #Florida #TampaBay #Tampa 18 or 19 days. I have lost count 
['rain', 'flood', 'florida', 'tampabay', 'tampa', 'or', 'days', 'i', 'have', 'lose', 'count']


In [53]:
train['text'] = train['text'].apply(normalization)

In [54]:
test['text'] = test['text'].apply(normalization)

In [31]:
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train["text"][0:5])

In [31]:
# same as previous function, testing to see if better output
from nltk.corpus import stopwords
import re

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    # split to array (default delimiter is " ")
    text = text.split()
    text = [w for w in text if not w in set(stopwords.words('english'))]
    text = ' '.join(text)
    return text

print(train['text'].iloc[12])
print(clean_text((punctuations(expand_contractions(train['text'].iloc[12])))))

#raining #flooding #Florida #TampaBay #Tampa 18 or 19 days. I've lost count 
raining flooding florida tampabay tampa days lost count


In [32]:
## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [33]:
train_vectors = count_vectorizer.fit_transform(train["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test["text"])

In [34]:
## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

In [35]:
scores = model_selection.cross_val_score(clf, train_vectors, train["target"], cv=3, scoring="f1")
scores

array([0.58280255, 0.54608523, 0.62615459])

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# pipeline = Pipeline([
#     ('bow',CountVectorizer(analyzer=text_processing)),  # strings to token integer counts
#     ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
#     ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
# ])

depending on accuracy choose option 1 or 2 in stop words and try to  remove not repetetive words