# Real or Not? NLP with Disaster Tweets

In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

##### Data Exploration

In [2]:
train[train["target"] == 0]["text"].values[5]

'this is ridiculous....'

In [3]:
train[train["target"] == 1]["text"].values[4]

'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '

In [4]:
import seaborn as sns
sns.countplot(x= 'target',data = train)

<matplotlib.axes._subplots.AxesSubplot at 0x1a16ded690>

In [5]:
len(train)

7613

In [6]:
 train.isnull().sum(axis=0)

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
len(test)

3263

In [8]:
test.isnull().sum(axis=0)

id             0
keyword       26
location    1105
text           0
dtype: int64

After looking at the NAs we see that the "location" has around 33% of missing values in both datasets
so in our opinion it is better to drop this column
Moreover, the keyword column has less than 1% of missing values so we will keep it in case it will be useful in the future

In [9]:
del train['location']

In [10]:
del test['location']

### The dataset has a lot of unstructured tweets which should be "cleaned" in order to make an NLP model
Removing punctuations, stop words will save more computational power  and give us a higher accuracy since they are not related to sentiments.


1- Removing HTTP links

In [11]:
# How many http words has this text?
train.loc[train['text'].str.contains('http')].target.value_counts()

1    2172
0    1799
Name: target, dtype: int64

Creating a new column to show if the tweet has a link (maybe retweeted or a normal link)

In [12]:
train['link'] = 0 
train.link.loc[train['text'].str.contains('http')] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [13]:
import re
    
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def remove_links(text):
    no_link= pattern.sub('',text)
    return no_link

print(train['text'].iloc[33])
print(remove_links(train['text'].iloc[33]))

#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi
#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. 


In [14]:
train['text'] = train['text'].apply(lambda x: remove_links(x))

In [15]:
test['text'] = test['text'].apply(lambda x: remove_links(x))

2- We realized we have a lot of retweets, so after removing the https: which means a retweet, we will remove all duplicates

In [16]:
train.groupby(['text']).size().reset_index(name='CountofRetweets')

Unnamed: 0,text,CountofRetweets
0,\nANOTHER DISASTER WAITING TO HAPPEN AND YOUR ...,1
1,\nCROYDON RIOTS- The Next Day: Burning Buildin...,1
2,\nSON OF SAVIOR LAVA VIDEO,1
3,\nSeems they declared war against government..,1
4,Cindy Noonan@CindyNoonan-Heartbreak in #Balt...,2
...,...,...
6984,Û÷We Can HelpÛª Says Denver Firefighter Work...,1
6985,å_? New Ladies Shoulder Tote #Handbag Faux Lea...,1
6986,å¤} New Ladies Shoulder Tote #Handbag Faux Lea...,1
6987,å¬'Only the sea knows how many are dead' @MSF_...,1


In [17]:
train.drop_duplicates(subset='text', keep="first")

Unnamed: 0,id,keyword,text,target,link
0,1,,Our Deeds are the Reason of this #earthquake M...,1,0
1,4,,Forest fire near La Ronge Sask. Canada,1,0
2,5,,All residents asked to 'shelter in place' are ...,1,0
3,6,,"13,000 people receive #wildfires evacuation or...",1,0
4,7,,Just got sent this photo from Ruby #Alaska as ...,1,0
...,...,...,...,...,...
7602,10860,,a siren just went off and it wasn't the Forney...,1,0
7603,10862,,Officials say a quarantine is in place at an A...,1,1
7604,10863,,#WorldNews Fallen powerlines on G:link tram: U...,1,1
7605,10864,,on the flip side I'm at Walmart and there is a...,1,0


2- Removing usernames (@)

In [18]:
pattern = re.compile('@[^\s]+')

def remove_username(text):
    no_username= pattern.sub('',text)
    return no_username

print(train['text'].iloc[65])
print(remove_links(train['text'].iloc[65]))

@nxwestmidlands huge fire at Wholesale markets ablaze 
 huge fire at Wholesale markets ablaze 


In [19]:
train['text'] = train['text'].apply(lambda x: remove_username(x))

In [20]:
test['text'] = test['text'].apply(lambda x: remove_username(x))

3- Expanding shortened words (don't to do not)

In [21]:
## DO NOT REMOVE STOP WORDS, DONT IS IMPORTANT FOR NEGATION, filter?
## WHAT TO DO WITH WEBSITES, create new column yes and no?

In [22]:
from pycontractions import Contractions
import gensim.downloader as api

model = api.load("glove-twitter-25")
cont = Contractions(kv_model=model)
cont.load_models()


def expand_contractions(text):
    text = list(cont.expand_texts([text], precise=True))[0]
    return text

print(train['text'].iloc[7])
print(expand_contractions(train['text'].iloc[7]))

I'm on top of the hill and I can see a fire in the woods...
I am on top of the hill and I can see a fire in the woods...


In [23]:
train['text'] = train['text'].apply(expand_contractions)

In [24]:
test['text'] = test['text'].apply(expand_contractions)

4- Removal of punctuations

In [25]:
# Our dataset is related to tweets so we will have a lot of @ and # 
from textblob import TextBlob

def punctuations(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

print(train['text'].iloc[3])
print(punctuations(expand_contractions(train['text'].iloc[3])))

13,000 people receive #wildfires evacuation orders in California 
13,000 people receive wildfires evacuation orders in California


In [26]:
train['text'] = train['text'].apply(punctuations)

In [27]:
test['text'] = test['text'].apply(punctuations)

5- Removal of accented characters (café to cafe)

In [28]:
# in this dataset we do not have accented characters, this function will be used in case we are analyzing tweets 
# from France or any country that has accented characters in their languages
import unidecode

def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text

# print(train['text'].iloc[3])
# print((remove_accented_chars(train['text'].iloc[3])))

In [29]:
train['text'] = train['text'].apply(remove_accented_chars)

In [30]:
test['text'] = test['text'].apply(remove_accented_chars)

In [31]:
def repeatedletters(string):
    postCheck = ""
    previousChar = None
    for currentChar in string:
        if currentChar != previousChar:
            postCheck += currentChar
            previousChar = currentChar
    return postCheck.title()

print((train['text'].iloc[2]))   
print(repeatedletters((train['text'].iloc[2])))

All residents asked to 'shelter in place are being notified by officers No other evacuation or shelter in place orders are expected
Al Residents Asked To 'Shelter In Place Are Being Notified By Oficers No Other Evacuation Or Shelter In Place Orders Are Expected


In [32]:
train['text'] = train['text'].apply(repeatedletters)

6- Removal of commonly used words and stopwords

In [33]:
#Option1 

from nltk.corpus import stopwords
import re

def common_stopwords(tweet):
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)
    tweet = tweet.lower()
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
#     clean_mess = ' '.join(clean_mess)
    return clean_mess

print(train['text'].iloc[12])
print(common_stopwords(train['text'].iloc[12]))


Raining Floding Florida Tampabay Tampa 18 Or 19 Days I Have Lost Count
['raining', 'floding', 'florida', 'tampabay', 'tampa', 'days', 'lost', 'count']


In [34]:
#Option2 
# We decided not to import the stopwords from nltk.corpus since we wanted to keep the words that negate like no,not,,

# ### list of stop words that need to be removed
# stop_words = ['as', 'in', 'of', 'is', 'are', 'were', 'was', 'it', 'for', 'to', 'from', 'into', 'onto', 
#               'this', 'that', 'being', 'the','those', 'these', 'such', 'a', 'an','i','and','be','you',
#               'have','on','my','do','with', 'or','be','at','by','s','have']

# from nltk import word_tokenize
# import re

# def remove_stopwords(tweet):
#     tweet = re.sub('[^a-zA-Z]', ' ', tweet)
#     tweet = tweet.lower()
#     tokenized_words = word_tokenize(tweet)
#     temp = [word for word in tokenized_words if word not in stop_words]
# #     temp = ' '.join(temp)
#     return temp

# print(train['text'].iloc[12])
# print(remove_stopwords(train['text'].iloc[12]))


Choose between option A and B, we got a lower score using option B where we filter manual the stop words, so we will use option A

In [35]:
train['text'] = train['text'].apply(common_stopwords)

In [36]:
test['text'] = test['text'].apply(common_stopwords)

7- Word Normalization

In [37]:
from nltk.stem.wordnet import WordNetLemmatizer

def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return normalized_tweet
    
print((train['text'].iloc[12]))   
print(normalization((train['text'].iloc[12])))

['raining', 'floding', 'florida', 'tampabay', 'tampa', 'days', 'lost', 'count']
['rain', 'floding', 'florida', 'tampabay', 'tampa', 'days', 'lose', 'count']


In [38]:
train['text'] = train['text'].apply(normalization)

In [39]:
test['text'] = test['text'].apply(normalization)

In [40]:
fff

NameError: name 'fff' is not defined

8- Removing repeated letters

In [None]:
def repeatedletters(string):
    postCheck = ""
    previousChar = None
    for currentChar in string:
        if currentChar != previousChar:
            postCheck += currentChar
            previousChar = currentChar
    return postCheck.title()

print((train['text'].iloc[24]))   
print(repeatedletters((train['text'].iloc[24])))


In [None]:
train['text'] = train['text'].apply(repeatedletters)

In [None]:
test['text'] = test['text'].apply(repeatedletters)

9- Creating a new column with word count

In [None]:
train['WordCount'] = train['text'].str.count(' ') + 1

In [None]:
train['text'][2]

In [None]:
train['WordCount'][2]

In [None]:
# for i in range(0,len(train)):
#         train['text'][i] = ' '.join(train['text'][i])

In [None]:
#Create our dictionary 
uniqueWordFrequents = {}
for tweet in train.text:
    for word in tweet.split():
        if(word in uniqueWordFrequents.keys()):
            uniqueWordFrequents[word] += 1
        else:
            uniqueWordFrequents[word] = 1
            
#Convert dictionary to dataFrame
uniqueWordFrequents = pd.DataFrame.from_dict(uniqueWordFrequents,orient='index',columns=['Word Frequent'])
uniqueWordFrequents.sort_values(by=['Word Frequent'], inplace=True, ascending=False)
uniqueWordFrequents.head(10)



In [None]:
uniqueWordFrequents = uniqueWordFrequents[uniqueWordFrequents['Word Frequent'] >= 20]
print(uniqueWordFrequents.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

counVec = CountVectorizer(max_features = uniqueWordFrequents.shape[0])
bagOfWords = counVec.fit_transform(train.text).toarray()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  

tfidfconverter = TfidfVectorizer(max_features=uniqueWordFrequents.shape[0], min_df=5, max_df=0.7)  
X = tfidfconverter.fit_transform(train.text).toarray()
#if remove X, bagofwords should be X

In [None]:
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.naive_bayes import MultinomialNB

# text_clf = Pipeline([('vect', CountVectorizer(max_features = uniqueWordFrequents.shape[0], min_df=5, max_df=0.7 )),
#                       ('tfidf', TfidfTransformer()),
#                       ('clf', MultinomialNB()) ])
# text_clf = text_clf.fit(X_train,y_train)

In [None]:
# import numpy as np
# predicted = text_clf.predict(X_test)
# np.mean(predicted == y_test)

In [None]:
# y_pred = text_clf.predict(X_test)
# print(' F1 Score is      : ' ,f1_score(y_test,y_pred))

In [None]:
from sklearn.model_selection import train_test_split


X = train.text
y = train['target']
print("X shape = ",X.shape)
print("y shape = ",y.shape)

X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.20, random_state=55, shuffle =True)
print('data splitting successfully')

In [None]:
from sklearn.tree import DecisionTreeClassifier

decisionTreeModel = DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = None, 
                                           splitter='best', 
                                           random_state=55)

decisionTreeModel.fit(X_train,y_train)

print("decision Tree Classifier model run successfully")

In [None]:
print(' Train Score is   : ' ,decisionTreeModel.score(X_train, y_train))

In [None]:
from sklearn.metrics import f1_score


y_pred = decisionTreeModel.predict(X_test)
print(' F1 Score is      : ' ,f1_score(y_test,y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression

LogisticRegression = LogisticRegression(penalty='l2', 
                                        solver='saga', 
                                        random_state = 55)  

LogisticRegression.fit(X_train,y_train)

print("LogisticRegression Classifier model run successfully")

In [None]:
print(' Train Score is   : ' ,LogisticRegression.score(X_train, y_train))

In [None]:
y_pred = LogisticRegression.predict(X_test)
print(' F1 Score is      : ' ,f1_score(y_test,y_pred))

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# pipeline = Pipeline([
#     ('bow',CountVectorizer(analyzer=text_processing)),  # strings to token integer counts
#     ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
#     ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
# ])

depending on accuracy choose option 1 or 2 in stop words and try to  remove not repetetive words