In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

##### Data Exploration

In [2]:
train[train["target"] == 0]["text"].values[5]

'this is ridiculous....'

In [3]:
train[train["target"] == 1]["text"].values[4]

'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school '

In [4]:
import seaborn as sns

sns.countplot(x= 'target',data = train)

<matplotlib.axes._subplots.AxesSubplot at 0x1a1b6d5a90>

In [5]:
len(train)

7613

In [6]:
 train.isnull().sum(axis=0)

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
len(test)

3263

In [8]:
test.isnull().sum(axis=0)

id             0
keyword       26
location    1105
text           0
dtype: int64

After looking at the NAs we see that the "location" has around 33% of missing values in both datasets
so in our opinion it is better to drop this column
Moreover, the keyword column has less than 1% of missing values so we will keep it in case it will be useful in the future

In [9]:
del train['location']

In [10]:
del test['location']

#### Creating new variables

In [11]:
# punctuation_count
import string
train['punctuation_count'] = train['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test['punctuation_count'] = test['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# stop_word_count
from wordcloud import STOPWORDS
train['stop_word_count'] = train['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))
test['stop_word_count'] = test['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# char_count
train['char_count'] = train['text'].apply(lambda x: len(str(x)))
test['char_count'] = test['text'].apply(lambda x: len(str(x)))

# word_count
train['word_count'] = train['text'].apply(lambda x: len(str(x).split()))
test['word_count'] = test['text'].apply(lambda x: len(str(x).split()))

# unique_word_count
train['unique_word_count'] = train['text'].apply(lambda x: len(set(str(x).split())))
test['unique_word_count'] = test['text'].apply(lambda x: len(set(str(x).split())))


# mean_word_length
import numpy as np
train['mean_word_length'] = train['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test['mean_word_length'] = test['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))



### The dataset has a lot of unstructured tweets which should be "cleaned" in order to make an NLP model
Removing punctuations, stop words will save more computational power  and give us a higher accuracy since they are not related to sentiments.


1- Removing Duplicates/Fixing the mislabled targets

We realized we have a lot of retweets, so after removing the https: which means a retweet. We tried removing the duplicates but we got a score lower by 0.02, so we will just fix the mislabled targets by creating a dataframe that has the mislabled

In [12]:
df_mislabeled = train.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']
df_mislabeled.index.tolist()

['like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit',
 'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!',
 "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'",
 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!',
 'To fight bioterrorism sir.',
 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE',
 '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption',
 '#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect',
 'He came to a land which was engulfed in tribal war and turned it into a land 

In [13]:
#Refixing the target variable of the 18 mislabled texts 
train['target_relabeled'] = train['target'].copy() 

train.loc[train['text'] == 'like for the music video I want some real action shit like burning buildings and \
                                police chases not some weak ben winston shit', 'target_relabeled'] = 0
train.loc[train['text'] == 'Hellfire is surrounded by desires so be careful and donÛªt let your desires control\
                                you! #Afterlife', 'target_relabeled'] = 0
train.loc[train['text'] == 'To fight bioterrorism sir.', 'target_relabeled'] = 0
train.loc[train['text'] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally \
                                displaced people; horror; etc. https://t.co/rqWuoy1fm4', 'target_relabeled'] = 1
train.loc[train['text'] == 'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver \
                                Spring', 'target_relabeled'] = 1
train.loc[train['text'] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe\
                                and hazardous for #humanconsumption', 'target_relabeled'] = 0
train.loc[train['text'] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the \
                                opposite of terrorism!', 'target_relabeled'] = 0
train.loc[train['text'] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God\
                                is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 'target_relabeled'] = 1
train.loc[train['text'] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper.\
                                http://t.co/JlzK2HdeTG', 'target_relabeled'] = 1
train.loc[train['text'] == "Mmmmmm I'm burning.... I'm burning buildings I'm building....\
                                    Oooooohhhh oooh ooh...", 'target_relabeled'] = 0
train.loc[train['text'] == "wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 'target_relabeled'] \
                                    = 0
train.loc[train['text'] == "He came to a land which was engulfed in tribal war and turned it into a land\
                              of peace i.e. Madinah. #ProphetMuhammad #islam", 'target_relabeled'] = 0
train.loc[train['text'] == "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do \
                             anything that leads to it #islam!", 'target_relabeled'] = 0
train.loc[train['text'] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is \
                                                    by giving half a date in charity.'", 'target_relabeled'] = 0
train.loc[train['text'] == "Caution: breathing may be hazardous to your health.", 'target_relabeled'] = 1
train.loc[train['text'] == "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????",\
                        'target_relabeled'] = 0
train.loc[train['text'] == "#Allah describes piling up #wealth thinking it would last #forever as the description\
                            of the people of #Hellfire in Surah Humaza. #Reflect", 'target_relabeled'] = 0
train.loc[train['text'] == "that horrible sinking feeling when youÛªve been at home on your phone for a while\
                            and you realise its been on 3G this whole time", 'target_relabeled'] = 0

2- Removing HTTP links

In [14]:
# How many http words has this text?
train.loc[train['text'].str.contains('http')].target.value_counts()

1    2172
0    1799
Name: target, dtype: int64

In [15]:
import re
    
pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
def remove_links(text):
    no_link= pattern.sub('',text)
    return no_link

print("Example: ")
print(train['text'].iloc[33])
print(remove_links(train['text'].iloc[33]))

Example: 
#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. http://t.co/2nndBGwyEi
#AFRICANBAZE: Breaking news:Nigeria flag set ablaze in Aba. 


In [16]:
train['text'] = train['text'].apply(lambda x: remove_links(x))
test['text'] = test['text'].apply(lambda x: remove_links(x))

4- Removing usernames (@)

In [17]:
pattern = re.compile('@[^\s]+')

def remove_username(text):
    no_username= pattern.sub('',text)
    return no_username
print("Example: ")
print(train['text'].iloc[65])
print(remove_links(train['text'].iloc[65]))

Example: 
@nxwestmidlands huge fire at Wholesale markets ablaze 
 huge fire at Wholesale markets ablaze 


In [18]:
train['text'] = train['text'].apply(lambda x: remove_username(x))
test['text'] = test['text'].apply(lambda x: remove_username(x))

5- Expanding shortened words (don't to do not)

In [19]:
from pycontractions import Contractions
import gensim.downloader as api

model = api.load("glove-twitter-25")
cont = Contractions(kv_model=model)
cont.load_models()


def expand_contractions(text):
    text = list(cont.expand_texts([text], precise=True))[0]
    return text

print("Example: ")
print(train['text'].iloc[7])
print(expand_contractions(train['text'].iloc[7]))

Example: 
I'm on top of the hill and I can see a fire in the woods...
I am on top of the hill and I can see a fire in the woods...


In [20]:
train['text'] = train['text'].apply(expand_contractions)
test['text'] = test['text'].apply(expand_contractions)

6- Removal of punctuations

In [21]:
# Our dataset is related to tweets so we will have a lot of @ and # 
from textblob import TextBlob

def punctuations(tweet):
    tweet_blob = TextBlob(tweet)
    return ' '.join(tweet_blob.words)

print("Example: ")
print(train['text'].iloc[3])
print(punctuations(expand_contractions(train['text'].iloc[3])))

Example: 
13,000 people receive #wildfires evacuation orders in California 
13,000 people receive wildfires evacuation orders in California


In [22]:
train['text'] = train['text'].apply(punctuations)
test['text'] = test['text'].apply(punctuations)

7- Removal of accented characters (café to cafe)

In [23]:
# in this dataset we do not have accented characters, this function will be used in case we are analyzing tweets 
# from France or any country that has accented characters in their languages
import unidecode

def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text

# print(train['text'].iloc[3])
# print((remove_accented_chars(train['text'].iloc[3])))

In [24]:
train['text'] = train['text'].apply(remove_accented_chars)
test['text'] = test['text'].apply(remove_accented_chars)

8- Removal of repeated letters

In [25]:
def remove_repeated(txt):
    cleaned = re.sub(r'(.)\1+', r'\1\1', txt)
    return cleaned

print("Example: ")
print(train['text'].iloc[28])
print((remove_repeated(train['text'].iloc[28])))

Example: 
Cooool
Cool


In [26]:
train['text'] = train['text'].apply(remove_repeated)
test['text'] = test['text'].apply(remove_repeated)

9- Removal of Emojis

In [27]:
def remove_emoji(txt):
    emoji_pattern = re.compile("[" u"\U000024C2-\U0001F251"
                                       u"\U00002702-\U000027B0"
                                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                                                       u"\U0001F600-\U0001F64F"  # emoticons
                                                                                               "]+", flags = re.UNICODE)
    
    return emoji_pattern.sub(r'', txt)

train['text'] = train['text'].apply(remove_emoji)
test['text'] = test['text'].apply(remove_emoji)

9- Removal of commonly used words and stopwords

In [28]:
#Option1 

from nltk.corpus import stopwords
import re

def common_stopwords(tweet):
    tweet = re.sub('[^a-zA-Z]', ' ', tweet)
    tweet = tweet.lower()
    tweet_list = [ele for ele in tweet.split() if ele != 'user']
    clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
#     clean_mess = ' '.join(clean_mess)
    return clean_mess

print(train['text'].iloc[12])
print(common_stopwords(train['text'].iloc[12]))


raining flooding Florida TampaBay Tampa 18 or 19 days I have lost count
['raining', 'flooding', 'florida', 'tampabay', 'tampa', 'days', 'lost', 'count']


In [29]:
#Option2 
# We decided not to import the stopwords from nltk.corpus since we wanted to keep the words that negate like no,not,,

# ### list of stop words that need to be removed
# stop_words = ['as', 'in', 'of', 'is', 'are', 'were', 'was', 'it', 'for', 'to', 'from', 'into', 'onto', 
#               'this', 'that', 'being', 'the','those', 'these', 'such', 'a', 'an','i','and','be','you',
#               'have','on','my','do','with', 'or','be','at','by','s','have']

# from nltk import word_tokenize
# import re

# def remove_stopwords(tweet):
#     tweet = re.sub('[^a-zA-Z]', ' ', tweet)
#     tweet = tweet.lower()
#     tokenized_words = word_tokenize(tweet)
#     temp = [word for word in tokenized_words if word not in stop_words]
# #     temp = ' '.join(temp)
#     return temp

# print(train['text'].iloc[12])
# print(remove_stopwords(train['text'].iloc[12]))


Choose between option A and B, we got a lower score using option B where we filter manually the stop words, so we will use option A

In [30]:
train['text'] = train['text'].apply(common_stopwords)
test['text'] = test['text'].apply(common_stopwords)

10- Word Normalization

In [32]:
from nltk.stem.wordnet import WordNetLemmatizer

def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return normalized_tweet
    
print((train['text'].iloc[12]))   
print(normalization((train['text'].iloc[12])))

['raining', 'flooding', 'florida', 'tampabay', 'tampa', 'days', 'lost', 'count']
['rain', 'flood', 'florida', 'tampabay', 'tampa', 'days', 'lose', 'count']


In [33]:
train['text'] = train['text'].apply(normalization)
test['text'] = test['text'].apply(normalization)

In [34]:
afor i in range(0,len(train)):
        train.text.iloc[i] = ' '.join(train.text.iloc[i])

SyntaxError: invalid syntax (<ipython-input-34-e631aafd30d4>, line 1)

In [None]:
sss

In [None]:
 train['text']

In [None]:
#Create our dictionary 
uniqueWordFrequents = {}
for tweet in train.text:
    for word in tweet.split():
        if(word in uniqueWordFrequents.keys()):
            uniqueWordFrequents[word] += 1
        else:
            uniqueWordFrequents[word] = 1
            
#Convert dictionary to dataFrame
uniqueWordFrequents = pd.DataFrame.from_dict(uniqueWordFrequents,orient='index',columns=['Word Frequent'])
uniqueWordFrequents.sort_values(by=['Word Frequent'], inplace=True, ascending=False)
uniqueWordFrequents.head(10)

In [None]:
uniqueWordFrequents = uniqueWordFrequents[uniqueWordFrequents['Word Frequent'] >= 20]
print(uniqueWordFrequents.shape)

Normalizing the features that were created

In [None]:
train['norm_count_word']=(train.WordCount-train.WordCount.min())/(train.WordCount.max()-train.WordCount.min())
train.norm_count_word

In [None]:
train['norm_count_retweets']=(train.CountofRetweets-train.CountofRetweets.min())/(train.CountofRetweets.max()-train.CountofRetweets.min())
train.norm_count_retweets

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

# counVec = CountVectorizer(max_features = uniqueWordFrequents.shape[0])
# bagOfWords = counVec.fit_transform(train.text).toarray()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  

tfidfconverter = TfidfVectorizer(max_features=uniqueWordFrequents.shape[0], min_df=5, max_df=0.7)  
X1 = tfidfconverter.fit_transform(train.text).toarray()
#if remove X, bagofwords should be X


Adding normalized variables

In [None]:
train.norm_count_retweets

In [None]:
# import numpy as np
# import pandas as pd

# X = pd.DataFrame(X1)

# new_X = pd.concat([X, train.norm_count_retweets], axis = 1)
# new_X = pd.concat([new_X, train.norm_count_word], axis = 1)
# new_X = pd.concat([new_X, train.link], axis = 1)

In [None]:
new_X

In [None]:
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.naive_bayes import MultinomialNB

# text_clf = Pipeline([('vect', CountVectorizer(max_features = uniqueWordFrequents.shape[0], min_df=5, max_df=0.7 )),
#                       ('tfidf', TfidfTransformer()),
#                       ('clf', MultinomialNB()) ])
# text_clf = text_clf.fit(X_train,y_train)

In [None]:
# import numpy as np
# predicted = text_clf.predict(X_test)
# np.mean(predicted == y_test)

In [None]:
# y_pred = text_clf.predict(X_test)
# print(' F1 Score is      : ' ,f1_score(y_test,y_pred))

In [None]:
from sklearn.model_selection import train_test_split

y = train['target']
print("X shape = ",X1.shape)
print("y shape = ",y.shape)

X_train , X_test , y_train , y_test = train_test_split(X1,y,test_size=0.20, random_state=55, shuffle =True)
print('data splitting successfully')

1- Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

decisionTreeModel = DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = None, 
                                           splitter='best', 
                                           random_state=55)

decisionTreeModel.fit(X_train,y_train)

print("decision Tree Classifier model run successfully")

2- Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

LogisticRegression = LogisticRegression(penalty='l2', 
                                        solver='saga', 
                                        random_state = 55)  

LogisticRegression.fit(X_train,y_train)

print("LogisticRegression Classifier model run successfully")

3- Support Vector Machine Model

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

SVClassifier = SVC(random_state = 55, tol =1e-3)

SVClassifier.fit(X_train,y_train)

print("SVClassifier model run successfully")

4- Gradient Boosting Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gradientBoostingModel = GradientBoostingClassifier(loss = 'deviance',
                                                   learning_rate = 0.01,
                                                   n_estimators = 100,
                                                   max_depth = 30,
                                                   random_state=55)

gradientBoostingModel.fit(X_train,y_train)

print("gradient Boosting Classifier model run successfully")

5- Multinomial Naive Bayes Model

In [None]:
from sklearn.naive_bayes import MultinomialNB

multinomialNBModel = MultinomialNB(alpha=0.1)
multinomialNBModel.fit(X_train,y_train)

print("multinomialNB model run successfully")

In [None]:
from sklearn.metrics import f1_score

#evaluation Details
models = [decisionTreeModel, gradientBoostingModel,  LogisticRegression,
          SVClassifier, multinomialNBModel]

for model in models:
    print(type(model).__name__,' Train Score is   : ' ,model.score(X_train, y_train))
    print(type(model).__name__,' Test Score is    : ' ,model.score(X_test, y_test))
    
    y_pred = model.predict(X_test)
    print(type(model).__name__,' F1 Score is      : ' ,f1_score(y_test,y_pred))
    print('--------------------------------------------------------------------------')

In [37]:
train


Unnamed: 0,id,keyword,text,target,punctuation_count,stop_word_count,char_count,word_count,unique_word_count,mean_word_length,target_relabeled
0,1,,"[deeds, reason, earthquake, may, allah, forgiv...",1,1,6,69,13,13,4.384615,1
1,4,,"[forest, fire, near, la, ronge, sask, canada]",1,1,0,38,7,7,4.571429,1
2,5,,"[residents, ask, shelter, place, notify, offic...",1,3,11,133,22,20,5.090909,1
3,6,,"[people, receive, wildfires, evacuation, order...",1,2,1,65,8,8,7.125000,1
4,7,,"[get, send, photo, ruby, alaska, smoke, wildfi...",1,2,7,88,16,15,4.500000,1
...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,,"[two, giant, crane, hold, bridge, collapse, ne...",1,5,2,83,11,11,6.636364,1
7609,10870,,"[control, wild, fire, california, even, northe...",1,5,9,125,20,17,5.300000,1
7610,10871,,"[utc, km, volcano, hawaii]",1,11,1,65,8,8,7.250000,1
7611,10872,,"[police, investigate, e, bike, collide, car, l...",1,5,5,137,19,19,6.263158,1


In [35]:
X = train['text']
y = train['target']

final_test = test['text']

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer  

ug_vectorizer = TfidfVectorizer()
X_vec = ug_vectorizer.fit_transform(X)
final_test_vec = ug_vectorizer.transform(final_test)

AttributeError: 'list' object has no attribute 'lower'

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, random_state=0)

In [None]:
def print_report(exp, pred):
    print(pd.crosstab(exp, pred, rownames=['Actual'], colnames=['Predicted']))
    print('\n \n')
    print(classification_report(exp, pred))

In [None]:
from sklearn.naive_bayes import MultinomialNB

naive_clf = MultinomialNB(alpha=1).fit(X_train, y_train)
naive_clf.score(X_test, y_test)

In [None]:
naive_predicted = naive_clf.predict(X_test)
print_report(y_test, naive_predicted)

In [None]:
from sklearn.linear_model import SGDClassifier

svm_sgd_clf = linear_model.SGDClassifier().fit(X_train, y_train)
svm_sgd_clf.score(X_test, y_test)

In [None]:
svm_sgd_predicted = svm_sgd_clf.predict(X_test)
print_report(y_test, svm_sgd_predicted)

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# pipeline = Pipeline([
#     ('bow',CountVectorizer(analyzer=text_processing)),  # strings to token integer counts
#     ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
#     ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
# ])

depending on accuracy choose option 1 or 2 in stop words and try to  remove not repetetive words