In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\charl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Load data
train_data = pd.read_csv('train.csv')

# Check that the data was loaded properly
print(train_data.head())

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [3]:
#Clean the data before testing

stop_words = set(stopwords.words('english'))

# Fill missing keywords with 'no_keyword'
train_data['keyword'] = train_data['keyword'].fillna('no_keyword')

# Fill missing locations with 'unknown_location'
train_data['location'] = train_data['location'].fillna('unknown_location')

# Convert to lowercase
train_data['text'] = train_data['text'].str.lower()

# Remove URLs
train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x, flags=re.MULTILINE))

# Remove special characters and numbers
train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'\W', ' ', x))
train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'\d', '', x))

# Remove extra spaces
train_data['text'] = train_data['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# Tokenize and remove stopwords
train_data['text'] = train_data['text'].apply(
    lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words])
)

# Check that the data was cleaned properly
print(train_data.head())

   id     keyword          location  \
0   1  no_keyword  unknown_location   
1   4  no_keyword  unknown_location   
2   5  no_keyword  unknown_location   
3   6  no_keyword  unknown_location   
4   7  no_keyword  unknown_location   

                                                text  target  
0       deeds reason earthquake may allah forgive us       1  
1              forest fire near la ronge sask canada       1  
2  residents asked shelter place notified officer...       1  
3  people receive wildfires evacuation orders cal...       1  
4  got sent photo ruby alaska smoke wildfires pou...       1  


In [10]:
# Add keyword to text
train_data['text'] = train_data['keyword'] + ' ' + train_data['text']

# Initialize vectorizer
tfidf = TfidfVectorizer(max_features= 5000, ngram_range = (1,2), min_df = 5)

# Fit and transform training text data
X_train = tfidf.fit_transform(train_data['text']).toarray()
y_train = train_data['target']

# Split the training data
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state= 6
)

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state= 6)

# Train model on training split
rf_model.fit(X_train_split, y_train_split)

# Predict on validation set
y_pred = rf_model.predict(X_val)

# Evaluate performance
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.88      0.82       864
           1       0.81      0.67      0.73       659

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.78      1523
weighted avg       0.79      0.79      0.78      1523

