In [1]:
import nltk 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import re
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import string

In [2]:
pd.set_option('display.max_colwidth',100)
# Load training set 
train = pd.read_csv('text-data/train.csv')
train.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAf...,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
# Load test dataset
X_test = pd.read_csv('text-data/test.csv')
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [5]:
test_labels = pd.read_csv('text-data/sample_submission.csv')
test_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB


In [6]:
# Exploring what the unique keywords and locations are 
non_null_kw = train.keyword.notnull()
non_null_loc = train.location.notnull()
train['keyword'][non_null_kw].unique()[:30]

array(['ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags'], dtype=object)

In [7]:
train['location'][non_null_loc].unique()[:30]

array(['Birmingham', 'Est. September 2012 - Bristol', 'AFRICA',
       'Philadelphia, PA', 'London, UK', 'Pretoria', 'World Wide!!',
       'Paranaque City', 'Live On Webcam', 'milky way',
       'GREENSBORO,NORTH CAROLINA', 'England.',
       'Sheffield Township, Ohio', 'India', 'Barbados', 'Anaheim',
       'Abuja', 'USA', 'South Africa', 'Sao Paulo, Brazil',
       'hollywoodland ', 'Edmonton, Alberta - Treaty 6',
       'Inang Pamantasan', 'Twitter Lockout in progress', 'Concord, CA',
       'Calgary, AB', 'San Francisco', 'CLVLND', 'Nashville, TN',
       'Santa Clara, CA'], dtype=object)

In [8]:
corpus_df = train[['text','target']]
corpus_df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1
3,"13,000 people receive #wildfires evacuation orders in California",1
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [9]:
from nltk.tokenize import punkt # no punctuations 
from nltk import word_tokenize 
from nltk.corpus import stopwords 
# create a data cleaning function that tokenizes, 
# removes english stopwords and punctuations and returns tokenized text in lowercase 

eng_stop = stopwords.words('english') # english stopwords
re_pat = '^a-zA-Z\s\W+' # find all symbols and whitespaces 

def clean_text(text,pattern):
    '''Removes punctuations and stopwords and returns lowercase tokenized text for input text and pattern'''
    text_nospchar = re.sub(pattern,'',text,re.I | re.A) # match regex pattern and replace with empty string
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    text_lower_nospc = text_nopunct.lower().strip() # convert text to lower case and strip white space if any
    token_text = word_tokenize(text_lower_nospc) # convert to tokens 
    no_stop_docs = ' '.join([word for word in token_text if word not in eng_stop]) # list of tokenized words with no stopwords
    
    return no_stop_docs
    
    

In [10]:
cleaner = np.vectorize(clean_text)
corpus_df['cleaned_text'] = cleaner(corpus_df['text'],re_pat)
corpus_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corpus_df['cleaned_text'] = cleaner(corpus_df['text'],re_pat)


Unnamed: 0,text,target,cleaned_text
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive us
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or...,1,residents asked shelter place notified officers evacuation shelter place orders expected
3,"13,000 people receive #wildfires evacuation orders in California",1,13000 people receive wildfires evacuation orders california
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo ruby alaska smoke wildfires pours school
5,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAf...,1,rockyfire update california hwy 20 closed directions due lake county fire cafire wildfires
6,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1,flood disaster heavy rain causes flash flooding streets manitou colorado springs areas
7,I'm on top of the hill and I can see a fire in the woods...,1,im top hill see fire woods
8,There's an emergency evacuation happening now in the building across the street,1,theres emergency evacuation happening building across street
9,I'm afraid that the tornado is coming to our area...,1,im afraid tornado coming area


The cleaned text is now devoid of any special characters or stopwords, however its still not ready to be vectorized. The first action required is to tokenize the words, ie, converting the sentence into a list of words, and then, there can be many words that have a similar meaning such search, searching, searched, etc. I will be using a lemmatizer (WordNetLemmatizer) to correlate words with similar meaning and keeps the root words. 

In [11]:
# def tokenize_lemmatize(text):
#     '''Input is a string sentence, returns list of lemmatized tokens'''
    
#     wn = nltk.WordNetLemmatizer() # Instantiating wordnet lemmatizer 
#     tokens = re.split('\W+',text) # split words on white space
#     text_lemmatized = [ wn.lemmatize(word) for word in tokens ] # lemmatize tokens and store as list
#     return text_lemmatized

In [12]:
# corpus_df['text_lemmatized'] = corpus_df['cleaned_text'].apply(lambda x: tokenize_lemmatize(x))
# corpus_df.head()

In [13]:
# Instantiate CountVectorizer
CountVec = CountVectorizer(analyzer='word',ngram_range=(2,2))
# convert text to matrix of token counts
count_matrix = CountVec.fit_transform(corpus_df['cleaned_text'])
count_matrix = count_matrix.toarray()
# convert count matrix to dataframe
count_matrix_df = pd.DataFrame(count_matrix)

In [14]:
# extract headings for count matrix
features = CountVec.get_feature_names()
count_matrix_df.columns = features
count_matrix_df.head(10)

Unnamed: 0,0011 utc,001116 utc20150805,0025 updated,005225 utc20150805,010156 okinawa,010217 okinawa,0104 utc,0104 utc5km,010401 utc20150805,0106 bmw,...,ûó rt,ûó stories,ûó wallybaiter,ûó ûªm,ûóbbc looks,ûóher upper,ûókody vine,ûónegligence fireworks,ûótech business,ûówe work
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
X_train = pd.concat([corpus_df,count_matrix_df],axis=1)
y_train = corpus_df['target']

In [18]:
# Instantiate classifier
rf_clf = RandomForestClassifier(n_estimators=50, max_depth = 20, n_jobs=-1)
# fit the model 
rf_model = rf_clf.fit(count_matrix_df, corpus_df['target'])
# predict on test 
y_hat = rf_model.predict(X_test)
# evaluate model 
precision, recall, f1_score, support = score(test_labels['target'],y_hat, pos_label=1)

ValueError: could not convert string to float: 'ablaze'

#### Using TF-IDF vectorizer and XGB classifier 


In [None]:
vect = TfidfVectorizer()
gb = GradientBoostingClassifier()