In [65]:
import pandas as pd
import numpy as np 
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import re
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yagya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yagya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yagya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [33]:
data=pd.read_csv('nlp_disaster_tweets_train.csv')
x_test=pd.read_csv('nlp_disaster_tweets_test.csv')

In [34]:
y_train=data['target']
data=data.iloc[:,:-1]


# Exploratory Analysis

In [7]:
len(data)

7613

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
dtypes: int64(1), object(3)
memory usage: 238.0+ KB


In [9]:
data.isnull().sum()

id             0
keyword       61
location    2533
text           0
dtype: int64

In [10]:
data.describe()

Unnamed: 0,id
count,7613.0
mean,5441.934848
std,3137.11609
min,1.0
25%,2734.0
50%,5408.0
75%,8146.0
max,10873.0


In [11]:
data.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

In [12]:
data['location'].value_counts()

USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: location, Length: 3341, dtype: int64

In [13]:
data['keyword'].value_counts()

fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [14]:
data.drop(columns=['location'],inplace=True)

In [15]:
#Train data
n=0
for n in range(len(data)):
    data['keyword'][n]=str(data['keyword'][n])
    n=n+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['keyword'][n]=str(data['keyword'][n])


In [35]:
#Test data
n=0
for n in range(len(x_test)):
    x_test['keyword'][n]=str(x_test['keyword'][n])
    n=n+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['keyword'][n]=str(x_test['keyword'][n])


In [16]:
#Train data
i=0
concat_texts=[]
for i in range(len(data)):
    concat_texts.append(" ".join((data['text'][i],data['keyword'][i])))
    i=i+1

In [36]:
#Test data
i=0
concat_texts_test=[]
for i in range(len(x_test)):
    concat_texts_test.append(" ".join((x_test['text'][i],x_test['keyword'][i])))
    i=i+1

# Tokenization

In [18]:
# Train data
tokenized_sentences=[]
for sent in concat_texts:
    tokenized_sentences.append(word_tokenize(sent))

In [37]:
# Test data
tokenized_sentences_test=[]
for sent in concat_texts_test:
    tokenized_sentences_test.append(word_tokenize(sent))

# Lower case conversion 

In [19]:
#Train data
for sent in tokenized_sentences:
    m=0
    while m<len(sent):
        sent[m] = re.sub(r"[^a-zA-Z0-9]", " ", sent[m].lower())
        m=m+1

In [38]:
#Test data
for sent in tokenized_sentences_test:
    m=0
    while m<len(sent):
        sent[m] = re.sub(r"[^a-zA-Z0-9]", " ", sent[m].lower())
        m=m+1

# Stopwords removal

In [20]:
stopwords_en=stopwords.words('english')
stopwords_en.append('nan')

In [22]:
#Train data
words=[]
for sent in tokenized_sentences:
    words.append([w for w in sent if w not in stopwords_en])

In [39]:
#Test data
words_test=[]
for sent in tokenized_sentences_test:
    words_test.append([w for w in sent if w not in stopwords_en])

# Stemming

In [23]:
#Train data
for sent in words:
    i=0
    while i<len(sent):
        sent[i]=(PorterStemmer().stem(sent[i]))
        i=i+1 

In [40]:
#Test data
for sent in words_test:
    i=0
    while i<len(sent):
        sent[i]=(PorterStemmer().stem(sent[i]))
        i=i+1 

# Vectorizing

In [25]:
all_words=[]
for sent in words:
    i=0
    while i<len(sent):
        all_words.append(sent[i])
        i=i+1

In [26]:
words_freq={}
for word in all_words:
    if word not in words_freq.keys():
        words_freq[word]=1
    elif word in words_freq.keys():
        words_freq[word]+=1
        

In [27]:
import heapq
most_freq = heapq.nlargest(10000, words_freq, key=words_freq.get)


In [28]:
#Train data
sentence_vectors = []
for sentence in words:
    sent_vec = []
    for token in most_freq:
        if token in sentence:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    sentence_vectors.append(sent_vec)

In [72]:
#Test data
sentence_vectors_test = []
for sentence in words_test:
    sent_vec_test = []
    for token in most_freq:
        if token in sentence:
            sent_vec_test.append(1)
        else:
            sent_vec_test.append(0)
    sentence_vectors_test.append(sent_vec_test)

# Developing models

# KNN

In [79]:
knn=KNeighborsClassifier(n_neighbors=3)

In [74]:
knn.fit(sentence_vectors,y_train)

KNeighborsClassifier(n_neighbors=3)

In [75]:
preds=knn.predict(sentence_vectors_test)

In [76]:
preds=pd.DataFrame(preds)

In [77]:
preds.value_counts()

0    2388
1     875
dtype: int64

In [78]:
preds.to_csv('preds_knn.csv')

# Logistic regression

In [80]:
lr=LogisticRegression()
lr.fit(sentence_vectors,y_train)
lr_preds=lr.predict(sentence_vectors_test)

In [81]:
lr_preds=pd.DataFrame(lr_preds)

In [82]:
lr_preds.value_counts()

0    2055
1    1208
dtype: int64

In [83]:
lr_preds.to_csv('preds_lr.csv')