<a href="https://colab.research.google.com/github/wahid028/Sentiment-Analysis/blob/main/Dataset_%26_DataLoader_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q tensorboard

In [37]:
#import the necessary libraries

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

In [4]:
#import the dataset

#install kaggle
!pip install -q kaggle

#upload the kaggle.json file
from google.colab import files
files.upload()

#create a kaggle directory
!mkdir ~/.kaggle

#copy the kaggle.json to kaggle directory
!cp kaggle.json ~/.kaggle/

#permission for the json to act
!chmod 600 ~/.kaggle/kaggle.json

#download kaggle datasetlist
! kaggle datasets list

Saving kaggle.json to kaggle.json
ref                                                             title                                               size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
themrityunjaypathak/covid-cases-and-deaths-worldwide            Covid Cases and Deaths WorldWide                     8KB  2023-02-01 12:22:51           2170         56  1.0              
ahsan81/hotel-reservations-classification-dataset               Hotel Reservations Dataset                         480KB  2023-01-04 12:50:31          11518        371  1.0              
datascientistanna/customers-dataset                             Shop Customer Data                                  23KB  2023-02-07 18:42:21           1782         51  1.0              
themrityunjaypathak/most-subscr

In [5]:
# copy API command link to download the dataset if it is not any comtition data
!kaggle competitions download -c tweet-sentiment-extraction

Downloading tweet-sentiment-extraction.zip to /content
 72% 1.00M/1.39M [00:00<00:00, 2.05MB/s]
100% 1.39M/1.39M [00:00<00:00, 2.66MB/s]


In [6]:
!unzip tweet-sentiment-extraction.zip

Archive:  tweet-sentiment-extraction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [7]:
df = pd.read_csv('../content/train.csv')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [8]:
# drop the rows with neutral sentiment as we are only interested in positive and negative sentiment
df = df[df['sentiment'] != 'neutral']
df = df.reset_index(drop=True)   

In [9]:
#convert negative to 0 and positive to 1
df['label'] = df['sentiment'].map({'negative':0, 'positive':1})

In [10]:
#re-create the dataset keep the target columns only
df = df[['text', 'label']]

In [11]:
#check the properties of the dataset
df.isnull().sum()

text     0
label    0
dtype: int64

In [12]:
df.shape

(16363, 2)

In [13]:
X = df["text"]
y = df["label"]

In [14]:
# split train dataset into train, validation and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, random_state=100, test_size=0.3, stratify=y)
# split validation dataset into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, random_state=100, test_size=0.5, stratify=y_temp)

In [15]:
print("Train features:", X_train.shape)
print("Train target:",  y_train.shape)
print("Validation features:",  X_val.shape)
print("Validation target:",  y_val.shape)
print("Test features:",  X_test.shape)
print("Test target:",  y_test.shape)

Train features: (11454,)
Train target: (11454,)
Validation features: (2454,)
Validation target: (2454,)
Test features: (2455,)
Test target: (2455,)


In [16]:
Model_Name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(Model_Name)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [17]:
class CustomData(Dataset):
  def __init__(self, input_text, labels, tokenizer: BertTokenizer, max_len):
    self.input_text = input_text
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.input_text)

  def __getitem__(self, idx):
    review = str(self.input_text[idx])
    targets = self.labels[idx]

    encoder = self.tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    return {
        "tweets": review,
        "input_ids":encoder['input_ids'].flatten(),
        "attention_mask": encoder['attention_mask'].flatten(),
        "target": torch.tensor(targets, dtype=torch.long)
    }

In [18]:
train_ds = CustomData(X_train, y_train, tokenizer, max_len=128)
val_ds = CustomData(X_val, y_val, tokenizer, max_len=128)
test_ds = CustomData(X_test, y_test, tokenizer, max_len=128)

In [19]:
train_ds[5]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'tweets': ' Journey!? Wow... u just became cooler.  hehe... (is that possible!?)',
 'input_ids': tensor([  101, 12015,   106,   136, 11750,   119,   119,   119,   190,  1198,
          1245, 16314,   119,  1119,  4638,   119,   119,   119,   113,  1110,
          1115,  1936,   106,   136,   114,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     

In [20]:
for item in train_ds:
  print(item['tweets'])
  print(item['input_ids'])
  print(item['target'])
  break

 Sooo SAD I will miss you here in San Diego!!!
tensor([  101, 27972,  1186, 13411,  2137,   146,  1209,  5529,  1128,  1303,
         1107,  1727,  4494,   106,   106,   106,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,   

In [21]:
train_dl = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=2)
val_dl = DataLoader(val_ds, batch_size=2, shuffle=False, num_workers=2)
test_dl = DataLoader(test_ds, batch_size=2, shuffle=False, num_workers=2)