In [3]:
import nltk 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import re
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import GridSearchCV, train_test_split 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
!pip install contractions
import contractions 
import string

In [4]:
pd.set_option('display.max_colwidth',100)
# Load training set 
raw_ = pd.read_csv('text-data/train.csv')
raw_.head(30)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAf...,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [5]:
raw_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [32]:
# split data into training and test 
X_train, X_test, y_train, y_test = train_test_split(raw_[['text']],raw_['target'],random_state=42)
print(X_train.shape, X_test.shape)
X_train.head(20)

(5709, 1) (1904, 1)


Unnamed: 0,text
5151,@dicehateme @PuppyShogun This makes sense. Paper beats rock paper comes from wood so wood should...
6351,'@CatoInstitute: The causes of federal failure are deeply structural and they will not be easily...
3443,Well as I was chaning an iPad screen it fucking exploded and glass went all over the place. Look...
7164,the war on drugs has turned the U.S. into a WAR zone.
7037,Obama Declares Disaster for Typhoon-Devastated Saipan
5159,According to prophecy and also CNN a Mac tablet will completely obliterate the need for other ga...
1010,Has body bagged ** RT @d_lac: Drake is body bagging meek
5070,@ConnorFranta #AskConnor if you were a natural disaster what would you be?
2069,@soapscoop i need you to confirm that ross is dead cause i dont trust anyone else yh
931,@libraryeliza he did get a @taylorswift13 'bump' of approval which is probably why he's blown up...


In [33]:
# Exploring what the unique keywords and locations are 
non_null_kw = raw_.keyword.notnull()
non_null_loc = raw_.location.notnull()
raw_['keyword'][non_null_kw].unique()[:30]

array(['ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags'], dtype=object)

In [34]:
raw_['location'][non_null_loc].unique()[:30]

array(['Birmingham', 'Est. September 2012 - Bristol', 'AFRICA',
       'Philadelphia, PA', 'London, UK', 'Pretoria', 'World Wide!!',
       'Paranaque City', 'Live On Webcam', 'milky way',
       'GREENSBORO,NORTH CAROLINA', 'England.',
       'Sheffield Township, Ohio', 'India', 'Barbados', 'Anaheim',
       'Abuja', 'USA', 'South Africa', 'Sao Paulo, Brazil',
       'hollywoodland ', 'Edmonton, Alberta - Treaty 6',
       'Inang Pamantasan', 'Twitter Lockout in progress', 'Concord, CA',
       'Calgary, AB', 'San Francisco', 'CLVLND', 'Nashville, TN',
       'Santa Clara, CA'], dtype=object)

In [46]:
from nltk.tokenize import punkt 
from nltk import word_tokenize 
from nltk.corpus import stopwords 

# create a data cleaning function that tokenizes, 
# removes english stopwords and punctuations and returns tokenized text in lowercase 

eng_stop = stopwords.words('english') # english stopwords
wn = nltk.WordNetLemmatizer() # Instantiate word lemmatizer

def clean_text_lm(text):
    '''Removes punctuations and stopwords and returns lowercase tokenized text for input text and pattern'''
    # expand contracted sentences
    doc = contractions.fix(text) 
    # match regex pattern and replace with empty string
    doc_nospchar = re.sub(r'^a-zA-Z\s\W+',r'',doc,re.I | re.A) 
    # remove punctuations from previous out
    doc_nopunct = ''.join([char for char in doc_nospchar if char not in string.punctuation]) 
    # convert text to lower case and strip white space if any
    doc_lower_nospc = doc_nopunct.lower().strip() 
    # lemmatize and store in list format
    lem_text = [wn.lemmatize(word) for word in re.split('\W+',doc_lower_nospc)] 
     # join list into string with no stopwords
    no_stop_docs = ' '.join([word for word in lem_text if word not in eng_stop])
    
    return no_stop_docs
    
    

In [48]:
# Vectorize the function to apply accross dataframe
cleaner = np.vectorize(clean_text_lm)
# store values in separate column in df
X_train['cleaned_text'] = cleaner(X_train[['text']])
X_train.head(10)


Unnamed: 0,text,cleaned_text
5151,@dicehateme @PuppyShogun This makes sense. Paper beats rock paper comes from wood so wood should...,dicehateme puppyshogun make sense paper beat rock paper come wood wood able support obliterate rock
6351,'@CatoInstitute: The causes of federal failure are deeply structural and they will not be easily...,catoinstitute cause federal failure deeply structural easily solved httptcoh2xcax4jbu
3443,Well as I was chaning an iPad screen it fucking exploded and glass went all over the place. Look...,well wa chaning ipad screen fucking exploded glass went place look like job going need new one
7164,the war on drugs has turned the U.S. into a WAR zone.,war drug ha turned yous war zone
7037,Obama Declares Disaster for Typhoon-Devastated Saipan,obama declares disaster typhoondevastated saipan
5159,According to prophecy and also CNN a Mac tablet will completely obliterate the need for other ga...,according prophecy also cnn mac tablet completely obliterate need gadget combining û httptcoxfcc...
1010,Has body bagged ** RT @d_lac: Drake is body bagging meek,ha body bagged rt dlac drake body bagging meek
5070,@ConnorFranta #AskConnor if you were a natural disaster what would you be?,connorfranta askconnor natural disaster would
2069,@soapscoop i need you to confirm that ross is dead cause i dont trust anyone else yh,soapscoop need confirm ross dead trust anyone else yh
931,@libraryeliza he did get a @taylorswift13 'bump' of approval which is probably why he's blown up...,libraryeliza get taylorswift13 bump approval probably blown httptcokolmzbz1pz musicadvisory


The cleaned text is now devoid of any special characters or stopwords, however its still not ready to be vectorized. The first action required is to tokenize the words, ie, converting the sentence into a list of words, and then, there can be many words that have a similar meaning such search, searching, searched, etc. I used a lemmatizer (WordNetLemmatizer) to correlate words with similar meaning and keeps the root words. 

In [51]:
# transform test set to vectorized format
X_test['cleaned_text'] = cleaner(X_test[['text']])
X_test.head()

Unnamed: 0,text,cleaned_text
2644,So you have a new weapon that can cause un-imaginable destruction.,new weapon unimaginable destruction
2227,The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons...,famping thing gishwhes got soaked deluge going pad tampon thx mishacollins
5448,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe CoL police can catch a pickpocket in Liverpool St...,dt georgegalloway rt galloway4mayor ûïthe col police catch pickpocket liverpool stree httptcovxi...
132,Aftershock back to school kick off was great. I want to thank everyone for making it possible. W...,aftershock back school kick wa great want thank everyone making possible great night
6845,in response to trauma Children of Addicts develop a defensive self - one that decreases vulnerab...,response trauma child addict develop defensive self one decrease vulnerability 3


In [52]:
# Instantiate CountVectorizer
CountVec = CountVectorizer(analyzer = 'word',ngram_range = (1,1))
# convert text to matrix of token counts
train_cmatrix = CountVec.fit_transform(X_train['cleaned_text']).toarray()
# convert count matrix to dataframe
train_cmatrix_df = pd.DataFrame(count_matrix_arr, columns = CountVec.get_feature_names() )

# performing similar steps on test data
test_cmatrix = CountVec.transform(X_test['cleaned_text']).toarray()
test_cmatrix_df = pd.DataFrame(test_cmatrix,columns = CountVec.get_feature_names())


In [59]:
# Instantiate classifier
rf_clf = RandomForestClassifier(n_jobs=-1)
# fit the model 
rf_model = rf_clf.fit(train_cmatrix, y_train)
# predict on test 
y_hat = rf_model.predict(test_cmatrix)
# evaluate model 
print(classification_report(y_test, y_hat))
confusion_ = pd.DataFrame(confusion_matrix(y_test, y_hat),columns=['P','N'],index=['P','N'])
print(confusion_)

              precision    recall  f1-score   support

           0       0.77      0.90      0.83      1091
           1       0.83      0.63      0.72       813

    accuracy                           0.79      1904
   macro avg       0.80      0.77      0.77      1904
weighted avg       0.79      0.79      0.78      1904

     P    N
P  982  109
N  297  516


In [60]:
# Find best params
params = {'criterion':['gini','entropy'],'n_estimators':[50,100,150], 'max_depth':[20,50,75,100]}
rf = RandomForestClassifier()
gs = GridSearchCV(rf ,param_grid = params ,cv = 5, n_jobs=-1)
gs.fit(train_cmatrix,y_train)
grid_pred = gs.predict(test_cmatrix)
best_params = gs.best_params_
print("Best params: {}\n Best Score: {}".format(best_params,gs.best_score_))
print(classification_report(y_test,grid_pred))


Best params: {'criterion': 'gini', 'max_depth': 100, 'n_estimators': 150}
 Best Score: 0.7694873916173327
              precision    recall  f1-score   support

           0       0.74      0.95      0.83      1091
           1       0.89      0.55      0.68       813

    accuracy                           0.78      1904
   macro avg       0.82      0.75      0.76      1904
weighted avg       0.80      0.78      0.77      1904



In [61]:

rf_clf = RandomForestClassifier(max_depth=100, n_estimators= 150, n_jobs=-1)
# fit the model 
rf_model = rf_clf.fit(train_cmatrix, y_train)
# predict on test 
y_hat = rf_model.predict(test_cmatrix)
# evaluate model 
print(classification_report(y_test, y_hat))
confusion_ = pd.DataFrame(confusion_matrix(y_test, y_hat),columns=['P','N'],index=['P','N'])
print(confusion_)

              precision    recall  f1-score   support

           0       0.74      0.95      0.83      1091
           1       0.88      0.54      0.67       813

    accuracy                           0.77      1904
   macro avg       0.81      0.74      0.75      1904
weighted avg       0.80      0.77      0.76      1904

      P    N
P  1031   60
N   370  443


#### Using TF-IDF vectorizer and XGB classifier 


In [None]:
vect = TfidfVectorizer()
gb = GradientBoostingClassifier()