# Loading Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score

# Setting working directory

In [51]:
# Switch to the folder that holds train.csv / test.csv, then display it.
# The path is a raw string so the backslash is never parsed as an escape
# sequence ("\P" in a normal string literal is an invalid escape).
os.chdir(r"C:\Python")
os.getcwd()

'C:\\Python'

# Loading the data

In [52]:
# Read both CSV files from the working directory; the 'keyword' and
# 'location' columns are forced to str dtype in each of them.
train_df, test_df = (
    pd.read_csv(csv_name, dtype={"keyword": str, "location": str})
    for csv_name in ('train.csv', 'test.csv')
)

# Exploratory Data Analysis

In [53]:
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [54]:
train_df.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [55]:
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [56]:
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [57]:
print(train_df.shape)
print(test_df.shape)

(7613, 5)
(3263, 4)


# Missing Value Analysis

In [58]:
# Count the null entries of every train column and keep the counts as a
# one-column DataFrame (the column is labelled 0).
missing_val_train = train_df.isna().sum().to_frame()

In [59]:
missing_val_train

Unnamed: 0,0
id,0
keyword,61
location,2533
text,0
target,0


In [60]:
# 'location' and 'keyword' contain missing values and do not seem
# statistically significant, so both columns are dropped.
df = train_df.drop(columns=['location', 'keyword'])
# test_df is rebound here, so on a second run of this cell the columns are
# already gone; errors='ignore' keeps the cell safely re-runnable.
test_df = test_df.drop(columns=['location', 'keyword'], errors='ignore')

In [61]:
df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


# Data Cleaning

In [62]:
#removing url,html & punctuation
def remove_URL(text):
    """Return `text` with every http(s):// or www. link deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)
def remove_html(text):
    """Return `text` with every `<...>` tag deleted (shortest match per tag)."""
    return re.sub(r'<.*?>', r'', text)
def remove_punct(text):
    """Return `text` with every character of `string.punctuation` deleted."""
    # Mapping a code point to None makes str.translate drop that character.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

In [63]:
# Clean the 'text' column of the train and the test frame with the same
# three steps, in this order: URLs, HTML tags, punctuation.
# URLs go first because the URL pattern relies on characters such as
# ':' '/' and '.' that punctuation removal would delete.
# Emoji removal is currently disabled.
for frame in (df, test_df):
    for cleaner in (remove_URL, remove_html, remove_punct):
        frame['text'] = frame['text'].apply(cleaner)

In [64]:
df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this earthquake Ma...,1
1,4,Forest fire near La Ronge Sask Canada,1
2,5,All residents asked to shelter in place are be...,1
3,6,13000 people receive wildfires evacuation orde...,1
4,7,Just got sent this photo from Ruby Alaska as s...,1


In [65]:
test_df.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,Heard about earthquake is different cities sta...
2,3,there is a forest fire at spot pond geese are ...
3,9,Apocalypse lighting Spokane wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


# Tokenization

In [66]:
# Turn every tweet into a list of tokens; the pattern \w+ is what the
# tokenizer matches as a token.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
df['text'] = df['text'].apply(tokenizer.tokenize)
test_df['text'] = test_df['text'].apply(tokenizer.tokenize)

In [67]:
df.head()

Unnamed: 0,id,text,target
0,1,"[Our, Deeds, are, the, Reason, of, this, earth...",1
1,4,"[Forest, fire, near, La, Ronge, Sask, Canada]",1
2,5,"[All, residents, asked, to, shelter, in, place...",1
3,6,"[13000, people, receive, wildfires, evacuation...",1
4,7,"[Just, got, sent, this, photo, from, Ruby, Ala...",1


In [68]:
test_df.head()

Unnamed: 0,id,text
0,0,"[Just, happened, a, terrible, car, crash]"
1,2,"[Heard, about, earthquake, is, different, citi..."
2,3,"[there, is, a, forest, fire, at, spot, pond, g..."
3,9,"[Apocalypse, lighting, Spokane, wildfires]"
4,11,"[Typhoon, Soudelor, kills, 28, in, China, and,..."


# Stopword Removal

In [69]:
# NLTK's English stopword list, loaded once and stored as a set so each
# membership test is a hash lookup instead of reloading and scanning the
# list for every single token.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """
    Remove English stopwords from a list of tokens.

    Parameters
    ----------
    text : list of str
        The tokens of one tweet.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stopword list, in their
        original order.

    Notes
    -----
    The comparison is case-sensitive and runs before the text is
    lowercased, so capitalised tokens such as "Our" or "Just" are kept.
    """
    return [w for w in text if w not in _ENGLISH_STOPWORDS]

# Filter the token lists of both frames through remove_stopwords.
for frame in (df, test_df):
    frame['text'] = frame['text'].apply(remove_stopwords)

In [70]:
df.head()

Unnamed: 0,id,text,target
0,1,"[Our, Deeds, Reason, earthquake, May, ALLAH, F...",1
1,4,"[Forest, fire, near, La, Ronge, Sask, Canada]",1
2,5,"[All, residents, asked, shelter, place, notifi...",1
3,6,"[13000, people, receive, wildfires, evacuation...",1
4,7,"[Just, got, sent, photo, Ruby, Alaska, smoke, ...",1


In [71]:
test_df.head()

Unnamed: 0,id,text
0,0,"[Just, happened, terrible, car, crash]"
1,2,"[Heard, earthquake, different, cities, stay, s..."
2,3,"[forest, fire, spot, pond, geese, fleeing, acr..."
3,9,"[Apocalypse, lighting, Spokane, wildfires]"
4,11,"[Typhoon, Soudelor, kills, 28, China, Taiwan]"


# Combining Tokens

In [72]:
def combine_text(list_of_text):
    '''Join a list of strings into one string, separated by single spaces.'''
    return ' '.join(list_of_text)

# Turn the token lists of both frames back into space-separated strings.
for frame in (df, test_df):
    frame['text'] = frame['text'].apply(combine_text)

In [73]:
df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds Reason earthquake May ALLAH Forgive us,1
1,4,Forest fire near La Ronge Sask Canada,1
2,5,All residents asked shelter place notified off...,1
3,6,13000 people receive wildfires evacuation orde...,1
4,7,Just got sent photo Ruby Alaska smoke wildfire...,1


In [74]:
test_df.head()

Unnamed: 0,id,text
0,0,Just happened terrible car crash
1,2,Heard earthquake different cities stay safe ev...
2,3,forest fire spot pond geese fleeing across str...
3,9,Apocalypse lighting Spokane wildfires
4,11,Typhoon Soudelor kills 28 China Taiwan


# Stemming

In [75]:
#for train
# Build the cleaned, stemmed training corpus: one space-joined string per tweet.
corpus = []
pstem = PorterStemmer()
for tweet in df['text']:
    # Replace every non-letter with a space. Digits and newlines are
    # non-letters as well, so no separate pass is needed to remove them.
    text = re.sub(r"[^a-zA-Z]", ' ', tweet)
    # Lowercase, split on whitespace and stem each word.
    stemmed_words = [pstem.stem(word) for word in text.lower().split()]
    # Append the cleaned tweet to the corpus.
    corpus.append(' '.join(stemmed_words))

In [76]:
corpus

['our deed reason earthquak may allah forgiv us',
 'forest fire near la rong sask canada',
 'all resid ask shelter place notifi offic no evacu shelter place order expect',
 'peopl receiv wildfir evacu order california',
 'just got sent photo rubi alaska smoke wildfir pour school',
 'rockyfir updat california hwi close direct due lake counti fire cafir wildfir',
 'flood disast heavi rain caus flash flood street manit colorado spring area',
 'im top hill i see fire wood',
 'there emerg evacu happen build across street',
 'im afraid tornado come area',
 'three peopl die heat wave far',
 'haha south tampa get flood hah wait a second i live in south tampa what am i gonna do what am i gonna do fvck flood',
 'rain flood florida tampabay tampa day ive lost count',
 'flood bago myanmar we arriv bago',
 'damag school bu multi car crash break',
 'what man',
 'i love fruit',
 'summer love',
 'my car fast',
 'what goooooooaaaaaal',
 'ridicul',
 'london cool',
 'love ski',
 'what wonder day',
 'looo

In [77]:
#for test
# Build the cleaned, stemmed test corpus: one space-joined string per tweet.
# Every step reads the current test tweet and uses this cell's own stemmer,
# so the cell does not depend on variables left over from another cell.
corpus_test = []
pstem1 = PorterStemmer()
for tweet in test_df['text']:
    # Replace every non-letter with a space. Digits and newlines are
    # non-letters as well, so no separate pass is needed to remove them.
    text = re.sub(r"[^a-zA-Z]", ' ', tweet)
    # Lowercase, split on whitespace and stem each word.
    stemmed_words = [pstem1.stem(word) for word in text.lower().split()]
    # Append the cleaned tweet to the corpus.
    corpus_test.append(' '.join(stemmed_words))

In [78]:
corpus_test

['just happen terribl car crash',
 'heard earthquak differ citi stay safe everyon',
 'forest fire spot pond gees flee across street i cannot save',
 'apocalyps light spokan wildfir',
 'typhoon soudelor kill china taiwan',
 'were shakingit earthquak',
 'theyd probabl still show life arsen yesterday eh eh',
 'hey how',
 'what nice hat',
 'fuck',
 'no i dont like cold',
 'nooooooooo dont',
 'no dont tell',
 'what',
 'awesom',
 'birmingham wholesal market ablaz bbc news fire break birmingham wholesal market',
 'sunkxssedharri wear short race ablaz',
 'previouslyondoyintv toke makinwa s marriag crisi set nigerian twitter ablaz',
 'check nsfw',
 'psa i m split person techi follow ablazeco burner follow ablaz',
 'bewar world ablaz sierra leon amp guap',
 'burn man ablaz turban diva via etsi',
 'not diss song peopl take thing run smh eye open though he set game ablaz cyhitheprync',
 'rape victim die set ablaz a yearold girl die burn injuri set ablaz',
 'set myself ablaz',
 'ctvtoronto bin fron

# TF-IDF

In [79]:
# Fit a TF-IDF vectoriser (at most 80000 features) on the 'text' column of
# the training frame and keep the vocabulary it learned.
# NOTE(review): this is fitted on df['text'], not on the stemmed `corpus`.
Tfidf_vect = TfidfVectorizer(max_features=80000).fit(df['text'])
uniqueWords = Tfidf_vect.vocabulary_

# Bag of words

In [80]:
# Bag-of-words model for the train dataset; the number of features is capped
# at the size of the TF-IDF vocabulary held in uniqueWords.
cv = CountVectorizer(max_features=len(uniqueWords))
# X is the dense document-term count matrix. toarray() returns a plain
# ndarray; todense() would return an np.matrix, which recent scikit-learn
# versions refuse as estimator input.
X = cv.fit_transform(corpus).toarray()
y = df['target'].values

In [81]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4110,
)

In [None]:
# Bag-of-words model for the test dataset, built with the vectoriser that was
# already fitted on the training corpus; Z is the dense count matrix.
# toarray() returns a plain ndarray; todense() would return an np.matrix,
# which recent scikit-learn versions refuse as estimator input.
Z = cv.transform(corpus_test).toarray()

# Gaussian Naive Bayes

In [82]:
# Train Gaussian Naive Bayes on the training split, then predict the labels
# of the held-out split.
classifier_gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = classifier_gnb.predict(X_test)

In [83]:
# Report accuracy on the training and held-out splits, plus the F1 score of
# the held-out predictions. (No confusion matrix is computed in this cell.)
print(f'GaussianNB Classifier Accuracy Score is {classifier_gnb.score(X_train, y_train)} for Train Data Set')
print(f'GaussianNB Classifier Accuracy Score is {classifier_gnb.score(X_test, y_test)} for Test Data Set')
print(f'GaussianNB Classifier F1 Score is {f1_score(y_test, y_pred_gnb)}')

GaussianNB Classifier Accuracy Score is 0.8945812807881773 for Train Data Set
GaussianNB Classifier Accuracy Score is 0.6158896913985554 for Test Data Set
GaussianNB Classifier F1 Score is 0.6490701859628074


# K-Nearest Neighbours

In [84]:
# Train a brute-force, distance-weighted 7-nearest-neighbour classifier on
# the training split, then predict the labels of the held-out split.
classifier_knn = KNeighborsClassifier(
    n_neighbors=7,
    weights='distance',
    algorithm='brute',
).fit(X_train, y_train)
y_pred_knn = classifier_knn.predict(X_test)

# Report accuracy on both splits and the F1 score of the held-out predictions.
print(f'K-Nearest Neighbour Model Accuracy Score for Train Data set is {classifier_knn.score(X_train, y_train)}')
print(f'K-Nearest Neighbour Model Accuracy Score for Test Data set is {classifier_knn.score(X_test, y_test)}')
print(f'K-Nearest Neighbour Model F1 Score is {f1_score(y_test, y_pred_knn)}')

K-Nearest Neighbour Model Accuracy Score for Train Data set is 0.9873563218390805
K-Nearest Neighbour Model Accuracy Score for Test Data set is 0.701247537754432
K-Nearest Neighbour Model F1 Score is 0.505971769815418


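#### As my preference is accuracy in this problem, KNN is the preferred method (it reaches the higher accuracy on the held-out split, although its F1 score is lower than that of Gaussian Naive Bayes)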

# Predictions

In [86]:
# Predict a label for every row of the test bag-of-words matrix Z with the
# fitted KNN classifier.
z_pred_knn = classifier_knn.predict(Z)

In [87]:
z_pred_knn

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [88]:
# Wrap the predictions in a DataFrame (its single column is labelled 0).
# Note that this rebinds z_pred_knn from the prediction array to the frame.
z_pred_knn = pd.DataFrame(z_pred_knn)

In [89]:
z_pred_knn

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1
...,...
3258,0
3259,1
3260,0
3261,1


In [90]:
# Copy column 0 into a column with the descriptive name 'predictions'.
z_pred_knn['predictions'] = z_pred_knn[0]

In [91]:
z_pred_knn

Unnamed: 0,0,predictions
0,0,0
1,0,0
2,0,0
3,0,0
4,1,1
...,...,...
3258,0,0
3259,1,1
3260,0,0
3261,1,1


In [97]:
# Store the predicted labels on the test frame as its 'target' column.
test_df['target'] = z_pred_knn['predictions']

In [98]:
test_df.head()

Unnamed: 0,id,predictions,target
0,0,0,0
1,2,0,0
2,3,0,0
3,9,0,0
4,11,1,1


In [100]:
# Keep only the two columns shown in the final output. They are selected
# explicitly instead of dropping 'predictions', because no visible cell adds
# a 'predictions' column to test_df: on a fresh top-to-bottom run the drop
# would raise a KeyError and 'text' would stay in the frame.
test_df = test_df[['id', 'target']]

In [101]:
test_df

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,0
3261,10874,1


In [102]:
# Create the output file: write test_df to predictions.csv (a relative path),
# leaving out the index column.
test_df.to_csv("predictions.csv", index = False)