In [2]:
# Download the Amazon reviews archive (amazonreviews.zip) with the Kaggle CLI; the unzip cell below expects it in the working directory.
!kaggle datasets download bittlingmayer/amazonreviews

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /Users/yuvrajsingh/Desktop/Learning/Text/Scratch_Implementations
100%|███████████████████████████████████████▉| 493M/493M [00:48<00:00, 10.0MB/s]
100%|████████████████████████████████████████| 493M/493M [00:48<00:00, 10.7MB/s]


In [4]:
# -o: overwrite existing files without asking, so re-running this cell never stops at unzip's "replace?" prompt.
!unzip -o "amazonreviews.zip"

Archive:  amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


**SSL Certificates: A Security Shield for NLTK Data**

SSL certificates play a crucial role in ensuring the secure transmission of data, including NLTK packages. By verifying the identity of the server and encrypting the data, SSL certificates protect against:

* **Man-in-the-Middle Attacks:** Prevents unauthorized interception and modification of data.
* **Data Tampering:** Ensures the integrity of downloaded NLTK packages.
* **Malicious Code Injection:** Safeguards against the introduction of harmful software.

**Ignoring SSL Certificate Errors: A Risky Proposition**

Disregarding SSL certificate errors can expose your system to significant security vulnerabilities. It's strongly advised to address these errors by:

* **Updating System Certificates:** Ensures you have the latest trusted certificates.
* **Installing Certificates for the Python Environment (Use with Caution):** A less secure alternative that should only be used if updating system certificates fails.

By prioritizing SSL certificate verification, you can maintain the security and reliability of your NLTK environment.

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import one_hot

import seaborn as sns
from rich import print
import pandas as pd
import re

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by preprocess_text to encode the cleaned text.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [18]:
import spacy

SPACY_MODEL_NAME = "en_core_web_sm"

# Load the pipeline if it is already installed and download it only when spaCy
# cannot find it, so re-running this cell does not reinstall the model package
# (and spawn a pip subprocess) every time.
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    # spacy.load raises OSError when the model package is missing.
    spacy.cli.download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 11.4 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.



[notice] A new release of pip available: 22.3 -> 24.3.1
[notice] To update, run: pip install --upgrade pip


In [11]:
# Load the data (ensure the paths match your local setup)
def _read_review_sample(path, n_rows, seed=1000):
    """Read a bz2-compressed, tab-delimited, header-less file and return a seeded random sample of `n_rows` rows."""
    frame = pd.read_csv(path, compression='bz2', delimiter='\t', header=None)
    return frame.sample(n_rows, random_state=seed)


train_data = _read_review_sample('train.ft.txt.bz2', 15000)
test_data = _read_review_sample('test.ft.txt.bz2', 5000)

In [12]:
# Discard the sampled row index and name column 0 'raw_text' on both splits (in place)
for _frame in (train_data, test_data):
    _frame.reset_index(drop=True, inplace=True)
    _frame.rename(columns={0: 'raw_text'}, inplace=True)

In [13]:
# Split every raw line into a label and the review text, for both splits
def _split_label_and_text(frame):
    """Derive 'label' and 'text' columns from 'raw_text', modifying `frame` in place.

    The first ``__label__<digits>`` marker becomes the label, every
    ``__label__<digits> `` occurrence is stripped from the text, the
    'raw_text' column is dropped, and the ``__label__2`` / ``__label__1``
    markers are replaced with 'positive' / 'negative'.
    """
    raw = frame['raw_text']
    frame['label'] = raw.str.extract(r'(__label__\d+)')[0]
    frame['text'] = raw.str.replace(r'__label__\d+ ', '', regex=True)
    frame.drop(['raw_text'], axis=1, inplace=True)
    frame['label'] = frame['label'].replace({'__label__2': 'positive', '__label__1': 'negative'})


for _split in (train_data, test_data):
    _split_label_and_text(_split)

In [19]:
def preprocess_text(text):
    """
    Turn a raw text string into a sentence embedding.

    The text is run through the spaCy pipeline `nlp`; stop words and
    punctuation tokens are dropped, the remaining tokens are lemmatized and
    lowercased, and the space-joined result is encoded with `embedding_model`.

    Args:
        text (str): Raw text to preprocess.

    Returns:
        Whatever `embedding_model.encode` returns for the cleaned text.
    """
    doc = nlp(text)

    # Punctuation has to be filtered explicitly: `is_stop` alone does not
    # exclude punctuation tokens, so they would otherwise reach the encoder.
    tokens = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]
    processed_text = ' '.join(tokens)

    return embedding_model.encode(processed_text)

# Define the target transform for one-hot encoding
def target_transform(label):
    """Return the float one-hot vector (length 2) for an integer class index."""
    class_count = 2  # the two classes: 'positive' and 'negative'
    index = torch.tensor(label)
    encoded = one_hot(index, num_classes=class_count)
    return encoded.float()

# TextDataset Class
class TextDataset(Dataset):
    """Map-style dataset over one text column and one label column of a DataFrame.

    Labels are integer-encoded once at construction time; the optional text and
    target transforms are applied lazily, on every item access.
    """

    def __init__(self, df, text_column, label_column, transform=None, target_transform=None):
        """
        Args:
            df (pd.DataFrame): Input DataFrame containing text and labels.
            text_column (str): Name of the column containing text data.
            label_column (str): Name of the column containing labels.
            transform (callable, optional): Applied to each text in `__getitem__`
                (e.g., preprocessing).
            target_transform (callable, optional): Applied to each integer-encoded
                label in `__getitem__` (e.g., one-hot encoding).
        """
        self.data = df
        self.texts = self.data[text_column].tolist()
        self.labels = self.data[label_column].tolist()
        self.transform = transform
        self.target_transform = target_transform

        # Convert textual labels (e.g., 'positive', 'negative') to integers.
        # Each instance fits its own encoder, so two datasets only share the
        # same integer mapping when they contain the same set of label values.
        self.label_encoder = LabelEncoder()
        self.encoded_labels = self.label_encoder.fit_transform(self.labels)

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return `(text, label)` for position `idx`.

        `text` is the raw string, or the output of `transform` when one is set.
        `label` is always a float32 tensor: the encoded label itself, or the
        output of `target_transform` when one is set.
        """
        text = self.texts[idx]
        label = self.encoded_labels[idx]

        # Apply text preprocessing if specified
        if self.transform is not None:
            text = self.transform(text)

        # Apply target transformation (e.g., one-hot encoding) if specified
        if self.target_transform is not None:
            label = self.target_transform(label)

        # `label` may already be a tensor (when target_transform returns one).
        # torch.tensor() on a tensor emits a copy-construct UserWarning per
        # item; as_tensor converts scalars/arrays and passes tensors through.
        return text, torch.as_tensor(label, dtype=torch.float32)

In [20]:
%%time
# Build one TextDataset per split; both use the same columns and transforms
BATCH_SIZE = 16
_dataset_kwargs = dict(
    text_column="text",
    label_column="label",
    transform=preprocess_text,
    target_transform=target_transform,
)

train_dataset = TextDataset(df=train_data, **_dataset_kwargs)
test_dataset = TextDataset(df=test_data, **_dataset_kwargs)

# Wrap each dataset in a shuffling DataLoader for batching
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

CPU times: user 5.45 ms, sys: 3.3 ms, total: 8.75 ms
Wall time: 9.11 ms
