In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW  # ✅ Corrected this line
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import re

In [None]:
# Clone the dataset repository
!git clone https://github.com/Dong-UTIL/Natural-Hazards-Twitter-Dataset.git

# Load the dataset
file_path = "/content/Natural-Hazards-Twitter-Dataset/2017Hurricane_Summary.csv"
df = pd.read_csv(file_path, encoding="utf-8")
df = df.rename(columns={"text": "tweet", "label": "sentiment"})
df.dropna(inplace=True)


Cloning into 'Natural-Hazards-Twitter-Dataset'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 24 (delta 2), reused 10 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (24/24), 2.55 MiB | 7.49 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [None]:
# Enhanced text cleaning
def enhanced_clean_text(text):
    """Normalise a raw tweet into lower-case text for tokenisation.

    Steps, in order: lower-case; fold the typographic apostrophe into the
    ASCII one; drop URLs and @mentions; remove the '#' sign (the hashtag word
    is kept); expand English contractions; strip every character that is not
    a word character, whitespace, '!' or '?'; collapse runs of '!' or '?' to a
    single character; trim surrounding whitespace.

    Args:
        text: The raw tweet. Any value is accepted; it is passed through str().

    Returns:
        The cleaned string. Internal whitespace left behind by removed tokens
        is not collapsed.
    """
    text = str(text).lower()
    # Tweets frequently use the curly apostrophe; fold it so contractions match.
    text = text.replace("\u2019", "'")
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)

    # Contractions must be expanded BEFORE punctuation is stripped: the strip
    # below removes apostrophes, which previously made every pattern here
    # unmatchable ("can't" became "cant"). The suffix patterns require a word
    # character before the apostrophe and a word boundary after it, so quoted
    # words such as 'the storm' are not mistaken for contractions.
    contractions = (
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "can not"),
        (r"(?<=\w)n't\b", " not"),
        (r"(?<=\w)'re\b", " are"),
        (r"(?<=\w)'s\b", " is"),
        (r"(?<=\w)'d\b", " would"),
        (r"(?<=\w)'ll\b", " will"),
        (r"(?<=\w)'t\b", " not"),
        (r"(?<=\w)'ve\b", " have"),
        (r"(?<=\w)'m\b", " am"),
    )
    for pat, repl in contractions:
        text = re.sub(pat, repl, text)

    # Keep only word characters, whitespace and the two emphasis marks.
    text = re.sub(r'[^\w\s!?]', '', text)
    # Collapse repeated emphasis ("!!!" -> "!", "???" -> "?").
    text = re.sub(r'(\!)\1+', r'\1', text)
    text = re.sub(r'(\?)\1+', r'\1', text)
    return text.strip()

# Remove any row that has a missing value before cleaning.
df.dropna(inplace=True)

# Run every raw tweet through the cleaner and keep the result in its own column.
raw_tweets = df['tweet']
df['clean_text'] = raw_tweets.apply(enhanced_clean_text)

In [None]:
# Map each distinct sentiment value to an integer class id.
# `unique()` returns values in order of first appearance, so without sorting
# the ids would depend on how the rows happen to be ordered. Sorting pins the
# ids, which matters because idx_to_label later in the notebook hard-codes
# what ids 0 and 1 mean.
label_map = {label: idx for idx, label in enumerate(sorted(df['sentiment'].unique()))}

# Integer targets for training, one per row.
df['label'] = df['sentiment'].map(label_map)


In [None]:
# Hold out 20% of the rows for validation; random_state keeps the split repeatable.
all_texts = df['clean_text'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same options: truncate to at most 128
# tokens and pad so that each split forms a rectangular batch.
encode_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class SentimentDataset(Dataset):
    """Dataset that pairs per-example encodings with their labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by example
            position (here, the tokenizer output).
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the label under 'labels'."""
        item = {}
        for name, values in self.encodings.items():
            item[name] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [None]:
# Wrap each split's encodings and labels so a DataLoader can batch them.
train_dataset, val_dataset = (
    SentimentDataset(train_encodings, train_labels),
    SentimentDataset(val_encodings, val_labels),
)


In [None]:
# Seed PyTorch before building the model. The classification head is not in
# the checkpoint and is initialised randomly (see the warning this cell
# prints), so without a seed every run starts from different weights.
# 42 matches the random_state already used for the train/validation split.
torch.manual_seed(42)

# Pretrained BERT encoder with a fresh classification head, one output per class.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# Batches of 16 for both splits; only the training loader shuffles.
val_loader = DataLoader(val_dataset, batch_size=16)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)


In [None]:
from tqdm import tqdm

# Fine-tune for three passes over the training data.
for epoch in range(3):
    print(f"Epoch {epoch + 1}")
    # Re-enter training mode every epoch so the loop stays correct even if the
    # model was switched to eval mode between runs of this cell.
    model.train()
    running_loss = 0.0
    for batch in tqdm(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Clear gradients before the forward/backward pass. Clearing only after
        # step() would leave stale gradients behind if a previous run of this
        # cell was interrupted between backward() and zero_grad().
        optimizer.zero_grad()
        outputs = model(**batch)  # the batch contains 'labels', so a loss is returned
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the average batch loss so convergence can be checked per epoch.
    print(f"Mean training loss: {running_loss / max(len(train_loader), 1):.4f}")


Epoch 1


100%|██████████| 392/392 [50:04<00:00,  7.67s/it]


Epoch 2


100%|██████████| 392/392 [49:47<00:00,  7.62s/it]


Epoch 3


100%|██████████| 392/392 [49:58<00:00,  7.65s/it]


In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
output_dir = "saved_model/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('saved_model/tokenizer_config.json',
 'saved_model/special_tokens_map.json',
 'saved_model/vocab.txt',
 'saved_model/added_tokens.json')

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Put the model in evaluation mode before scoring the validation split.
model.eval()
predictions = []
true_labels = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in val_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # The predicted class is the index of the largest logit in each row.
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(batch['labels'].cpu().numpy())

# Fraction of validation examples classified correctly.
acc = accuracy_score(true_labels, predictions)
print(f"\nValidation Accuracy: {acc:.4f}\n")


Validation Accuracy: 0.9137



In [None]:
# Reverse the mapping: from int label -> string label
# Display names for the integer class ids.
# NOTE(review): these names are hard-coded rather than derived from label_map;
# confirm that id 0 really corresponds to the negative class.
idx_to_label = {0: 'Negative', 1: 'Positive'}

# Names listed in ascending id order, as classification_report expects.
target_names = [name for _, name in sorted(idx_to_label.items())]

# Per-class precision, recall and F1 on the validation predictions.
print("Classification Report:")
report = classification_report(true_labels, predictions, target_names=target_names)
print(report)

Classification Report:
              precision    recall  f1-score   support

    Negative       0.92      0.94      0.93       927
    Positive       0.90      0.88      0.89       638

    accuracy                           0.91      1565
   macro avg       0.91      0.91      0.91      1565
weighted avg       0.91      0.91      0.91      1565



In [None]:
import pandas as pd
import plotly.express as px

# Convert the timestamp column to calendar dates.
# Values that are not numeric become NaN and therefore NaT dates.
# NOTE(review): unit='D' with origin '1899-12-30' presumably decodes
# spreadsheet-style serial day numbers — confirm against the dataset.
df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
df['date'] = pd.to_datetime(df['timestamp'], unit='D', origin='1899-12-30').dt.date

# Essential needs to look for in the raw tweet text.
needs_keywords = ['food', 'water', 'shelter', 'medical', 'help', 'rescue', 'electricity']

# Step 1: one 0/1 column per keyword (case-insensitive substring match).
for keyword in needs_keywords:
    df[keyword] = df['tweet'].str.contains(keyword, case=False, na=False).astype(int)

# Step 2: number of matching tweets per date and keyword.
keyword_time = df.groupby('date')[needs_keywords].sum().reset_index()

# Step 3: long format (date, Need, Count) for Plotly.
keyword_melt = pd.melt(keyword_time, id_vars='date', var_name='Need', value_name='Count')

# Step 4: make sure every (date, Need) pair is present.
# The dates are sorted so the animation frames play chronologically; plain
# unique() would order them by first appearance in the data. NaT is dropped so
# rows with an unparseable timestamp do not add an all-zero "NaT" frame.
all_dates = sorted(df['date'].dropna().unique())
full_index = pd.MultiIndex.from_product([all_dates, needs_keywords], names=['date', 'Need'])
keyword_melt = keyword_melt.set_index(['date', 'Need']).reindex(full_index, fill_value=0).reset_index()

# Step 5: animated bar chart, one frame per date.
fig = px.bar(
    keyword_melt,
    x='Need',
    y='Count',
    color='Need',
    animation_frame='date',
    title='📊 Frequency of Essential Needs in Tweets Over Time',
    # Fixed y-range (with headroom) so the bars are comparable across frames.
    range_y=[0, keyword_melt['Count'].max() + 50],
    labels={'Count': 'Frequency'},
)

# Step 6: styling.
fig.update_layout(
    xaxis_title="Essential Need",
    yaxis_title="Frequency",
    plot_bgcolor='rgba(240, 246, 255, 1)',
    paper_bgcolor='white',
    legend_title='Need',
    font=dict(family="Arial", size=14),
    transition_duration=500,
)

fig.update_traces(marker=dict(line=dict(width=0)))  # Optional: removes borders from bars

fig.show()


In [None]:
def predict_tweet_sentiment(tweet):
    """Classify a single raw tweet and return its sentiment name.

    Uses the notebook-level `model`, `tokenizer`, `device`, `idx_to_label`
    and `enhanced_clean_text`.

    Args:
        tweet: The raw tweet text.

    Returns:
        The entry of `idx_to_label` for the class with the largest logit.
    """
    model.eval()

    # Apply the same cleaning that was used on the training data.
    cleaned = enhanced_clean_text(tweet)

    # Encode as PyTorch tensors and move them to the model's device.
    tokens = tokenizer(cleaned, return_tensors='pt', truncation=True, padding=True, max_length=128)
    model_inputs = {}
    for name, tensor in tokens.items():
        model_inputs[name] = tensor.to(device)

    # Inference only: no gradients required.
    with torch.no_grad():
        class_id = model(**model_inputs).logits.argmax(dim=1).item()

    return idx_to_label[class_id]


In [None]:
# Read one tweet from the user and print the predicted sentiment.
tweet_input = input("Enter a tweet: ")
predicted_sentiment = predict_tweet_sentiment(tweet_input)
print("Predicted Sentiment:", predicted_sentiment)


Enter a tweet: So grateful for all the volunteers and first responders working together to help our community recover after the storm. Seeing everyone support each other gives me hope! #CommunityStrong
Predicted Sentiment: Positive
