In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


Brief Description and Overview

The challenge involves an NLP (Natural Language Processing) task where the goal is to classify tweets as either related to a real disaster (labeled as 1) or not (labeled as 0). The model's performance is evaluated using the F1 score, which balances precision and recall.

Data Description:
Size: The training dataset contains approximately 7,600 tweets, and the test set has around 3,300 tweets.
Dimensions:
id: A unique identifier for each tweet.
text: The text of the tweet (the main feature for NLP).
keyword: A keyword extracted from the tweet, which may be blank.
location: The location the tweet was sent from, which may also be blank.
target: The target label in the training set (1 for disaster-related, 0 otherwise).
Structure: The data is structured in a tabular format with rows representing individual tweets and columns representing features like text, keyword, and location.
This structure allows for text preprocessing, feature extraction, and model training to classify the tweets based on their content and context.

In [2]:
# Filepaths
train_path = '/kaggle/input/nlp-getting-started/train.csv'
test_path = '/kaggle/input/nlp-getting-started/test.csv'

# Load the datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)



Best parameters for Logistic Regression: {'C': 1, 'solver': 'lbfgs'}
Best F1 score for Logistic Regression: 0.7359
Validation F1 Score (Logistic Regression): 0.7397
Validation F1 Score (Random Forest): 0.7195
Using Logistic Regression for final prediction.
Submission file created successfully.


In [None]:
#EDA

# Plot the distribution of the target variable
plt.figure(figsize=(6,4))
sns.countplot(x='target', data=train_df)
plt.title('Distribution of Target Variable')
plt.xlabel('Target (0 = Non-disaster, 1 = Disaster)')
plt.ylabel('Count')
plt.show()

# Plot the distribution of tweet lengths
train_df['text_length'] = train_df['text'].apply(len)
plt.figure(figsize=(10,6))
sns.histplot(train_df['text_length'], bins=30, kde=True)
plt.title('Distribution of Tweet Lengths')
plt.xlabel('Tweet Length')
plt.ylabel('Frequency')
plt.show()

# Plot the most common keywords
plt.figure(figsize=(12,6))
train_df['keyword'].value_counts().head(20).plot(kind='bar')
plt.title('Top 20 Keywords in Tweets')
plt.xlabel('Keyword')
plt.ylabel('Frequency')
plt.show()

Plan of analysis:

Feature Engineering:

Vectorize the cleaned tweet text using TF-IDF to convert it into numerical features.
Include additional features such as keyword presence and tweet length.
Model Selection and Tuning:

Start with baseline models like Logistic Regression.
Use hyperparameter tuning to optimize model performance.
Experiment with more complex models like Random Forests to see if they offer any improvement.
Evaluation:

Focus on the F1 score to balance precision and recall, especially important in this classification task where false positives and false negatives have different implications.
Final Prediction:

Use the best-performing model to generate predictions on the test set, ensuring the submission file is formatted correctly.

In [None]:
# Function to clean the text
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    return text

# Apply the text cleaning function to both train and test datasets
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

# Fill missing keywords and locations with 'none'
train_df['keyword'] = train_df['keyword'].fillna('none')
train_df['location'] = train_df['location'].fillna('none')
test_df['keyword'] = test_df['keyword'].fillna('none')
test_df['location'] = test_df['location'].fillna('none')

# Vectorize the text using TF-IDF with n-grams
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_text = tfidf.fit_transform(train_df['clean_text'])

# Add keyword and location as features (one-hot encoding)
X_keyword = pd.get_dummies(train_df['keyword'])
X_location = pd.get_dummies(train_df['location'])

# Combine all features
X = np.hstack((X_text.toarray(), X_keyword.values, X_location.values))
y = train_df['target']

# Same transformation for the test set
X_test_text = tfidf.transform(test_df['clean_text'])
X_test_keyword = pd.get_dummies(test_df['keyword'])
X_test_location = pd.get_dummies(test_df['location'])

# Align the columns of test set with the training set
X_test_keyword = X_test_keyword.reindex(columns=X_keyword.columns, fill_value=0)
X_test_location = X_test_location.reindex(columns=X_location.columns, fill_value=0)

X_test = np.hstack((X_test_text.toarray(), X_test_keyword.values, X_test_location.values))

# Split the training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter tuning for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear']
}

grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

print(f"Best parameters for Logistic Regression: {grid_search.best_params_}")
print(f"Best F1 score for Logistic Regression: {grid_search.best_score_:.4f}")

# Evaluate the best Logistic Regression model
best_lr_model = grid_search.best_estimator_
y_val_pred_lr = best_lr_model.predict(X_val)
val_f1_lr = f1_score(y_val, y_val_pred_lr)
print(f"Validation F1 Score (Logistic Regression): {val_f1_lr:.4f}")

# Train and evaluate a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_val_pred_rf = rf_model.predict(X_val)
val_f1_rf = f1_score(y_val, y_val_pred_rf)
print(f"Validation F1 Score (Random Forest): {val_f1_rf:.4f}")

# Select the better model for test prediction
if val_f1_rf > val_f1_lr:
    best_model = rf_model
    print("Using Random Forest for final prediction.")
else:
    best_model = best_lr_model
    print("Using Logistic Regression for final prediction.")

# Predict on the test set
y_test_pred = best_model.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({'id': test_df['id'], 'target': y_test_pred})
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

Model Architecture: 
I decided to run multiple models to see which had the highest accuracy, and here's my reasoning for each.

TF-IDF:
Simplicity and Interpretability: TF-IDF is straightforward to implement and interpret. It provides a solid baseline for text classification tasks.
Sparsity: Tweets are short and often contain many unique words, making TF-IDF's sparse matrix representation effective for this dataset.
Performance: TF-IDF combined with a linear model like Logistic Regression is computationally efficient and can often yield competitive results for text classification problems.

Logistic Regression:
Baseline Performance: Logistic Regression, especially when combined with TF-IDF, is a strong baseline model for text classification. It handles high-dimensional data well, which is typical of text data after TF-IDF vectorization.
Interpretability: The model is interpretable; we can easily understand the influence of each feature (word) on the prediction.
Regularization: Logistic Regression includes regularization parameters that prevent overfitting, making it a good choice when dealing with potentially noisy text data.

In [None]:
#Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear']
}

# Initialize the model
log_reg = LogisticRegression(max_iter=1000)

# Set up GridSearchCV
grid_search = GridSearchCV(log_reg, param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

# Best parameters and score
best_params = grid_search.best_params_
best_f1_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_f1_score:.4f}")

from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest with default parameters
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_val_pred_rf = rf_model.predict(X_val)
val_f1_rf = f1_score(y_val, y_val_pred_rf)

print(f"Validation F1 Score (Random Forest): {val_f1_rf:.4f}")

#Notes on Hyperparameter Tuning
#Logistic Regression: Performed well as a baseline, especially with hyperparameter tuning. Its simplicity and efficiency make it suitable for sparse high-dimensional data.
#Random Forest: While powerful, struggled with the sparse nature of TF-IDF features, leading to lower performance.

Conclusion

Learnings and Takeaways

Effective Techniques: The use of pre-trained embeddings like GloVe significantly boosted performance by providing better initial word representations. LSTM's ability to model the sequence of words in a tweet allowed for a more nuanced understanding of the text, leading to better classification.

Less Effective Techniques: Random Forests, despite their power in structured data tasks, struggled with the sparse and high-dimensional nature of TF-IDF features. This highlighted the importance of choosing models that align well with the data structure.
Future Improvements

Advanced Architectures: Implementing Transformer-based models like BERT could further improve performance by capturing even more complex language patterns.

Ensemble Methods: Combining models, such as blending the outputs of Logistic Regression and LSTM, could leverage the strengths of both approaches.
Data Augmentation: Techniques like data augmentation or synthetic data generation could be used to increase the variety of the training data, potentially leading to more robust models.