In [1]:
import pandas as pd
import re
from transformers import BertTokenizer
from datetime import datetime
import emoji
from tqdm import tqdm
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

  from .autonotebook import tqdm as notebook_tqdm


### Load the tokenizer and Dataset

In [2]:
df = pd.read_csv('../Dataset/KaggleDataset.csv')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



#### Minimal Text Cleaning

In [3]:
def clean_text(text):
    text = re.sub(r"http\S+|www\S+", '', text)  # Remove URLs
    text = re.sub(r"@\w+", '', text)            # Remove mentions
    text = re.sub(r"#", '', text)               # Remove hashtags symbol, keep content
    text = re.sub(r'\s+', ' ', text).strip()    # Normalize whitespace
    return text


### Emojis Extraction

In [4]:
def extract_emojis(text):
    return ''.join(c for c in text if c in emoji.EMOJI_DATA)


In [5]:
def get_emoji_score(emojis):
    emoji_sentiment = {
        '😂': 1, '😍': 1, '😊': 1, '😢': -1, '😡': -1, '😭': -1,
        '👍': 1, '💔': -1, '😃': 1, '😤': -1, '😞': -1
    }
    return sum(emoji_sentiment.get(e, 0) for e in emojis)

In [6]:
# Apply basic cleaning
df['clean_text'] = df['tweet'].apply(clean_text)
df['emojis'] = df['tweet'].apply(extract_emojis)
df['emoji_sentiment'] = df['emojis'].apply(get_emoji_score)
df['sentiment'].map({0: 0, 4: 1})

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: sentiment, Length: 1600000, dtype: int64

### Time Feature Extraction

In [7]:
df['created_at'] = pd.to_datetime(df['date'])

df['year'] = df['created_at'].dt.year
df['month'] = df['created_at'].dt.month
df['weekday'] = df['created_at'].dt.weekday
df['hour'] = df['created_at'].dt.hour


  df['created_at'] = pd.to_datetime(df['date'])


In [8]:
df['hour'] = df['created_at'].dt.hour
df['weekday'] = df['created_at'].dt.weekday
features = df[['hour', 'weekday', 'emoji_sentiment']].values
features_tensor = torch.tensor(features, dtype=torch.float)


## **Next Steps: Tokenization + DataLoader Setup**


* `df['clean_text']` → tweets for BERT input
* `features_tensor` → extra features (`hour`, `weekday`, `emoji_sentiment`)
* `df['label']` → target labels (0 = negative, 1 = positive)




In [9]:
input_ids = []
attention_masks = []

for text in tqdm(df['clean_text']):
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded['input_ids'][0])
    attention_masks.append(encoded['attention_mask'][0])

# Stack lists into tensors
input_ids = torch.stack(input_ids)
attention_masks = torch.stack(attention_masks)

100%|██████████| 1600000/1600000 [08:04<00:00, 3302.86it/s]


In [10]:
labels = torch.tensor(df['sentiment'].values)


In [11]:
dataset = TensorDataset(input_ids, attention_masks, features_tensor, labels)

# Create DataLoader
batch_size = 16
dataloader = DataLoader(
    dataset,
    sampler=RandomSampler(dataset),
    batch_size=batch_size
)


### save the data

In [12]:
# Save everything as a dictionary
torch.save({
    'input_ids': input_ids,
    'attention_masks': attention_masks,
    'features_tensor': features_tensor,
    'labels': labels
}, '../models/tokenized_dataset.pt')


In [13]:
df_to_save = df[['clean_text', 'emoji_sentiment', 'hour', 'weekday', 'sentiment']].copy()
df_to_save.to_csv('../models/preprocessed_tweets.csv', index=False)
