In [56]:
import pandas as pd
import numpy as np

## Downloading the Data from Source

In [57]:
df=pd.read_csv('train.csv')

## Data Exploration

In [58]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [59]:
df.shape

(7613, 5)

In [60]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [61]:
df.drop(['id','keyword','location'],axis=1,inplace=True)

In [88]:
df.columns

Index(['text', 'target', 'clean_text'], dtype='object')

In [63]:
df['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

## Preprocessing the Data

In [64]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [65]:
# Download NLTK resources
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vallirajasekar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vallirajasekar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vallirajasekar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [66]:
def preprocess_text(text):
    # Convert to Lowercase
    text=text.lower()
    # Remove punctuation
    text=re.sub(r'[^\w\s]','',text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
    

In [67]:
df['clean_text'] = df['text'].apply(preprocess_text)


In [68]:
df['text'][:2]

0    Our Deeds are the Reason of this #earthquake M...
1               Forest fire near La Ronge Sask. Canada
Name: text, dtype: object

## Converting into Vector

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Vectorize the 'clean_text' column
X = vectorizer.fit_transform(df['clean_text'])

# Target variable
y = df['target']


In [70]:
import joblib

In [71]:
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

## Building the Model

In [72]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [74]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the classifier
clf = MultinomialNB()

# Train the classifier
clf.fit(X_train, y_train)


MultinomialNB()

In [75]:
joblib.dump(clf, 'multinomial_nb_classifier.pkl')


['multinomial_nb_classifier.pkl']

In [76]:
# Make predictions on the testing set
y_pred = clf.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.8010505581089954
              precision    recall  f1-score   support

           0       0.79      0.88      0.84       874
           1       0.82      0.69      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



## Predicting for New Word

## Preprocessing the New Tweet

In [77]:
new_tweet = "beware world ablaze sierra leone &amp; guap."

# Preprocess the new tweet
clean_new_tweet = preprocess_text(new_tweet)


In [78]:
X_new = vectorizer.transform([clean_new_tweet])


In [79]:
# Predict the sentiment of the new tweet
predicted_sentiment = clf.predict(X_new)

# Map the predicted sentiment to human-readable label
sentiment_label = "Real Disaster" if predicted_sentiment == 1 else "Not a Real Disaster"

print("Predicted Sentiment:", sentiment_label)


Predicted Sentiment: Not a Real Disaster


## Preprocessing the Next text

In [80]:
df_test=pd.read_csv('/Users/vallirajasekar/Desktop/NLP_Challenge/Disaster_Tweet/test.csv')

In [81]:
df_test.drop(['id','keyword','location'],axis=1,inplace=True)

In [82]:
df_test

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


In [83]:
df_test['clean_text'] = df_test['text'].apply(preprocess_text)


In [84]:
X_test_new = vectorizer.transform(df_test['clean_text'])


In [85]:
# Predict the sentiment for the new tweets
y_pred_new = clf.predict(X_test_new)

# Map the predicted sentiment to human-readable labels
predicted_sentiments = ["Real Disaster" if sentiment == 1 else "Not a Real Disaster" for sentiment in y_pred_new]

# Add the predicted sentiments to df_test
df_test['predicted_sentiment'] = predicted_sentiments

# Display the DataFrame with predicted sentiments
#print(df_test[['text', 'predicted_sentiment']])


In [86]:
df_test

Unnamed: 0,text,clean_text,predicted_sentiment
0,Just happened a terrible car crash,happened terrible car crash,Real Disaster
1,"Heard about #earthquake is different cities, s...",heard earthquake different city stay safe ever...,Real Disaster
2,"there is a forest fire at spot pond, geese are...",forest fire spot pond goose fleeing across str...,Real Disaster
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire,Real Disaster
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill 28 china taiwan,Real Disaster
...,...,...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,earthquake safety los angeles ûò safety fasten...,Real Disaster
3259,Storm in RI worse than last hurricane. My city...,storm ri worse last hurricane cityamp3others h...,Not a Real Disaster
3260,Green Line derailment in Chicago http://t.co/U...,green line derailment chicago httptcoutbxlcbiuy,Real Disaster
3261,MEG issues Hazardous Weather Outlook (HWO) htt...,meg issue hazardous weather outlook hwo httptc...,Real Disaster
