<a href="https://colab.research.google.com/github/tanmayb104/NLP-with-Disaster-Tweets/blob/main/NLP_Disaster_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

**Read Dataset**

In [3]:
train = pd.read_csv('sample_data/train.csv')
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('sample_data/test.csv')
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


**Clean the dataset**

In [5]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

In [6]:
corpus = []
for i in range(len(train)):
    text = re.sub('[^a-zA-Z]', ' ', train['text'][i])
    text = text.lower()
    text = text.split()
    text = [wordnet.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

In [7]:
corpus1 = []
for i in range(len(test)):
    text1 = re.sub('[^a-zA-Z]', ' ', test['text'][i])
    text1 = text1.lower()
    text1 = text1.split()
    text1 = [wordnet.lemmatize(word) for word in text1 if word not in set(stopwords.words('english'))]
    text1 = ' '.join(text1)
    corpus1.append(text1)

**Bag Of Words**

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

y = pd.get_dummies(train['target'])
y = y.iloc[:, 1].values

cv = CountVectorizer(max_features=2500)
test_x = cv.fit_transform(corpus1).toarray()

**Tf-Idf**

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(max_features=2500)
X1 = cv.fit_transform(corpus).toarray()

y1 = pd.get_dummies(train['target'])
y1 = y1.iloc[:, 1].values

cv = TfidfVectorizer(max_features=2500)
test_x1 = cv.fit_transform(corpus1).toarray()

**Confusion matrix and accuracy**

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

def calculate(y_test,y_pred):
  confusion_m = confusion_matrix(y_test, y_pred)
  accuracy = accuracy_score(y_test, y_pred)
  print(confusion_m)
  print(accuracy)

**Naive Bayes**

In [11]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(X1, y1)

y_pred = spam_detect_model.predict(test_x1)
y_pred

array([0, 0, 1, ..., 0, 0, 1], dtype=uint8)

**Logistic Regression**

In [12]:
from sklearn.linear_model import LogisticRegression

logisticR = LogisticRegression()
logisticR.fit(X1, y1)

y_pred = logisticR.predict(test_x1)
y_pred

array([0, 0, 0, ..., 0, 0, 1], dtype=uint8)

**Random Forest Classifier**

In [13]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X1, y1)

y_pred = rfc.predict(test_x1)
y_pred

array([0, 0, 0, ..., 1, 0, 1], dtype=uint8)

**Make submission file**

In [8]:
def convert(y_pred,name):
  data={"id":test["id"].tolist(),"target":y_pred}
  df = pd.DataFrame(data)
  df.to_csv(f"{name}.csv",index=False)

**Word Embeddings**

In [9]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Flatten, Dropout
from keras.layers.embeddings import Embedding
from tensorflow.keras.layers import LSTM

In [10]:
a=set()
for i in corpus:
  for j in i.split():
    a.add(j)
for i in corpus1:
  for j in i.split():
    a.add(j)
print(len(a))


26781


In [13]:
vocab_size = 50000
encoded_docs = [one_hot(d, vocab_size) for d in corpus]
print(encoded_docs)

[[1135, 21626, 25992, 30391, 40535, 23548, 29727], [37774, 7548, 12355, 40416, 34239, 41716, 9840], [28524, 5213, 39232, 37049, 42941, 3613, 13597, 39232, 37049, 21509, 45728], [45106, 31246, 30083, 13597, 21509, 10646], [34060, 33721, 26148, 27762, 17932, 7731, 30083, 3549, 49448], [24753, 5353, 10646, 44718, 26202, 16252, 44618, 44256, 43211, 7548, 21753, 30083], [46731, 44289, 45611, 2718, 6196, 37611, 13070, 34028, 4694, 18773, 19410, 35876], [47270, 12122, 33917, 7548, 32322], [32582, 13597, 26714, 38212, 15185, 34028], [5642, 15845, 8507, 35876], [47807, 45106, 19498, 39602, 5124, 21943], [14313, 43129, 28010, 11803, 6540, 11323, 27111, 23590, 7571, 43129, 28010, 2413, 2413, 32163, 13070], [19349, 13070, 33785, 5230, 28010, 8942, 23067, 32122], [46731, 14501, 29379, 9183, 14501], [9411, 49448, 40430, 9128, 15150, 5701, 33072], [22194], [25541, 19561], [24490, 28054], [15150, 8387], [37827], [33135], [18618, 13609], [25541, 36827], [43581, 8942], [25922], [13789, 40267, 39710], [4

In [14]:
encoded_docs1 = [one_hot(d, vocab_size) for d in corpus1]
print(encoded_docs1)

[[9886, 43416, 15150, 5701], [25292, 25992, 48716, 30354, 24618, 21470, 40125], [37774, 7548, 26558, 28348, 40068, 26037, 15185, 34028, 42047, 3909], [49165, 8021, 5572, 30083], [22982, 37778, 27680, 38965, 7506], [34588, 25992], [11975, 6051, 15926, 24482, 48861, 14419, 49336, 49336], [14489], [23624, 41604], [35742], [36081, 46938], [39618], [4907], [], [18442], [28436, 10189, 47693, 28426, 13539, 33691, 7548, 22248, 28436, 10189, 47693, 30995, 36874, 9034], [22999, 45145, 49434, 16576, 28426], [47369, 34246, 24269, 42963, 43301, 48038, 5330, 47011, 28426, 30995, 36874, 21799, 19929], [7676, 30995, 36874, 16637, 39491, 30995, 36874, 6605, 36871, 30995, 36874, 2186, 30995, 36874, 47046, 40553, 46007], [35821, 2396, 3228, 3926, 26084, 28426, 36874, 21804, 26084, 28426], [22301, 13397, 28426, 3547, 49919, 5176, 27322], [28285, 22194, 28426, 31363, 18695, 30995, 36874, 41279, 40989, 12853], [14658, 14135, 45106, 26564, 18268, 19882, 8911, 25892, 33407, 16703, 48038, 45216, 28426, 38936],

In [15]:
# pad documents to a max length
max_length = 0
for i in encoded_docs:
  max_length = max(max_length,len(i))
for i in encoded_docs1:
  max_length = max(max_length,len(i))
print(max_length)

27


In [38]:
max_length = 30
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 1135 21626 25992 ...     0     0     0]
 [37774  7548 12355 ...     0     0     0]
 [28524  5213 39232 ...     0     0     0]
 ...
 [25189 33718  9483 ...     0     0     0]
 [32450 23646 38920 ...     0     0     0]
 [ 7402 49883 22000 ...     0     0     0]]


In [39]:
padded_docs1 = pad_sequences(encoded_docs1, maxlen=max_length, padding='post')
print(padded_docs1)

[[ 9886 43416 15150 ...     0     0     0]
 [25292 25992 48716 ...     0     0     0]
 [37774  7548 26558 ...     0     0     0]
 ...
 [45346 36368 44189 ...     0     0     0]
 [14381  3249 25311 ...     0     0     0]
 [16926  5909 34001 ...     0     0     0]]


In [40]:
# define the model
embedding_vector_features = 100
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_features, input_length=max_length))
# model.add(Flatten())
# model.add(Dropout(0.3))
model.add(LSTM(100))
# model.add(Dropout(0.3))
model.add(Dense(1, activation='relu'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summarize the model
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 100)           5000000   
                                                                 
 lstm_2 (LSTM)               (None, 100)               80400     
                                                                 
 dense_2 (Dense)             (None, 1)                 101       
                                                                 
Total params: 5,080,501
Trainable params: 5,080,501
Non-trainable params: 0
_________________________________________________________________
None


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, y, test_size=0.2, random_state=42)

In [41]:
# fit the model
model.fit(padded_docs, y, epochs=2, verbose=1)

# model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f6154ffb8d0>

In [None]:
import numpy
y_pred=model.predict(X_test)
final_y_pred=[]
for i in range(len(y_pred)):
  final_y_pred.append(int(numpy.round(y_pred[i])))
final_y_pred

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,final_y_pred)


In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,final_y_pred)

In [44]:
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, y, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.228952


In [45]:
ans = model.predict(padded_docs1)
ans

array([[0.49888748],
       [0.4929992 ],
       [0.8512293 ],
       ...,
       [0.5073747 ],
       [1.3166854 ],
       [0.55112267]], dtype=float32)

In [46]:
print(max(ans), min(ans))

[1.7576863] [0.]


In [36]:
import numpy
final=[]
for i in range(len(ans)):
  final.append(int(numpy.round(ans[i])))
final

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [37]:
convert(final,"sub21")