In [1]:
import pandas as pd # read file csv..
import numpy as np # import numpy package for linear algbra
import re # regular expressions pakage,use for text processing
from string import punctuation # to remove punctations
import nltk # pakage use for nutural language
from nltk.corpus import stopwords # package to handel stop words
from nltk.tokenize import word_tokenize
import sys # pakage used for handel the recursion limit
from nltk.stem import PorterStemmer # import pakage for sttemming
from sklearn.model_selection import train_test_split  # split data 
import torch  # Importing PyTorch, a deep learning framework
import torch.nn as nn  # Importing PyTorch's neural network module
import math  # Importing math for mathematical operations
import torchtext  # Importing torchtext for text processing 
from torch.utils.data import DataLoader, TensorDataset  # Importing DataLoader and TensorDataset
from torchtext.data.utils import get_tokenizer  # Importing get_tokenizer 
from sklearn.metrics import accuracy_score  # to calculat accuarcy

## bulid model :

![Example Image](https://www.researchgate.net/profile/Joe-Meyer/publication/346737150/figure/fig5/AS:966593613938689@1607465284958/Transformer-Network-Vaswani-et-al-2017-This-particular-model-presented-in-the.ppm)

- A Transformer-based text classification model is a deep learning architecture that excels at processing and understanding sequential data like text. It uses self-attention mechanisms to capture dependencies between words in a sentence, enabling it to learn contextual information effectively. The model takes in a sequence of words as input and outputs a classification label, making it suitable for tasks such as sentiment analysis, spam detection, and language translation. Transformers have become a state-of-the-art choice for text classification due to their ability to capture long-range dependencies and their parallel processing capabilities, making them highly efficient for natural language understanding tasks.

###  Class ScaledDotProductAttention

In [2]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k  # Dimension of each head's key, query, and value
    
    def forward(self, query, key, value, mask=None):
        # Compute attention scores
        attn_scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9) 
        attn_probs = torch.softmax(attn_scores, dim=-1) # use softmax to get attention probabilities
        output = torch.matmul(attn_probs, value) # Multiply probabilities by value tensor
        return output

- It takes input query, key, and value tensors, calculates attention scores, applies optional masking, computes attention probabilities using softmax, and produces an output representing the attended information from the input. The module is widely used in natural language processing and sequence-to-sequence tasks for tasks like machine translation, text generation, and more.

### class PositionWiseFeedForward

In [3]:
class PositionWiseFeedForward(nn.Module):   
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        # First linear layer
        self.layer1 = nn.Linear(d_model, d_ff) 
         # Second linear layer  
        self.layer2 = nn.Linear(d_ff, d_model) 
        # ReLU activation
        self.relu = nn.ReLU() 
       
    def forward(self, x):        
        hidden = self.relu(self.layer1(x)) # Apply ReLU on first linear layer output
        output = self.layer2(hidden)       # Second linear layer 
        return output

- This code class called PositionWiseFeedForward used in transformer-based models. It performs a position-wise feedforward transformation on input tensors, allowing the model to capture complex patterns. It consists of two linear layers with a ReLU activation function in between, transforming the input tensor from a lower-dimensional space (d_model) to a higher-dimensional space (d_ff) and then back to the original dimension (d_model)

### class LinearLayers

In [4]:
class LinearLayers(nn.Module):
    def __init__(self, d_model):
        super(LinearLayers, self).__init__()
        # Linear transformations for query, key, value, and output
        self.query_transform = nn.Linear(d_model, d_model)
        self.key_transform = nn.Linear(d_model, d_model)
        self.value_transform = nn.Linear(d_model, d_model)
        self.output_transform = nn.Linear(d_model, d_model)

- This class called LinearLayers for linear transformations used in self-attention mechanisms within transformer-based models.

### class MultiHeadAttention

In [5]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        
        # Ensure that the model dimension is divisible by the number of attention heads
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        # Store the model dimension and the number of attention heads
        self.d_model = d_model            
        self.num_heads = num_heads        
        
        # Calculate the dimension of each head's key, query, and value
        self.d_k = d_model // num_heads  
        
        # Initialize linear transformations 
        self.linear_layers = LinearLayers(d_model)  
        
        # Initialize the Scaled Dot-Product Attention mechanism
        self.attention = ScaledDotProductAttention(self.d_k)
    
    def split_heads(self, x):
        batch_size, seq_length, _ = x.size()
        
        # Reshape the input tensor to create multiple heads
        x = x.view(batch_size, seq_length, self.num_heads, self.d_k)
        
        # Transpose dimensions 
        return x.transpose(1, 2)
    
    def combine_heads(self, x):
        batch_size, _, seq_length, _ = x.size()
        
        # Transpose dimensions
        x = x.transpose(1, 2).contiguous()
        
        # Reshape the combined tensor
        return x.view(batch_size, seq_length, self.d_model)
    
    def forward(self, query, key, value, mask=None):
        # Apply linear transformations 
        query = self.split_heads(self.linear_layers.query_transform(query))
        key = self.split_heads(self.linear_layers.key_transform(key))
        value = self.split_heads(self.linear_layers.value_transform(value))
        
        # Calculate attention scores 
        attn_output = self.attention(query, key, value, mask)
        
        # Combine the attention outputs 
        output = self.linear_layers.output_transform(self.combine_heads(attn_output))
        
        return output

- This class called MultiHeadAttention for implementing multi-head self-attention, a crucial component in transformer-based models. It takes query, key, and value tensors as input, applies linear transformations to these tensors to project them into multiple heads,computes scaled dot-product attention for each head, and then combines and projects the results back to the original dimension.This enables the model to simultaneously focus on different parts of the input sequence, capturing complex patterns and relationships.

### class EncoderLayer

In [6]:
class EncoderLayer(nn.Module):    
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()    
        # Self-attention mechanism
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
        # Position-wise feed-forward neural network
        self.positionwise_feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # Layer normalization for the first sub-layer
        self.norm1 = nn.LayerNorm(d_model)
        # Layer normalization for the Second sub-layer
        self.norm2 = nn.LayerNorm(d_model)  
        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):   
        # Self-attention and residual connection
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_output)   
        # Layer normalization
        x = self.norm1(x)
        # Position-wise feed-forward neural network and residual connection
        ffn_output = self.positionwise_feed_forward(x)        
        x = x + self.dropout(ffn_output)
        # Layer normalization
        x = self.norm2(x)        
        return x

- This code defines an EncoderLayer class for use in a Transformer-based neural network. An EncoderLayer represents one layer within the encoder stack of a Transformer model. It consists of the following components:

1. Self-Attention Mechanism: The layer includes a self-attention mechanism (implemented using nn.MultiheadAttention) that calculates attention scores for the input sequence, allowing the model to focus on different parts of the input sequence.

2. Position-Wise Feed-Forward Network: After the self-attention step, the layer applies a position-wise feed-forward neural network to the output of the self-attention step. This step introduces non-linearity and is followed by residual connections.

3. Layer Normalization: Layer normalization is applied both after the self-attention step (self.norm1) and after the position-wise feed-forward step (self.norm2). Layer normalization helps stabilize training by ensuring that input values to each layer have similar mean and variance.

4. Dropout: Dropout is applied for regularization purposes after both the self-attention step and the position-wise feed-forward step.

### class Encoder

In [7]:
class Encoder(nn.Module):    
    def __init__(self, d_model, max_seq_length, dropout):
        super(Encoder, self).__init__()        
        self.dropout = nn.Dropout(dropout)
        # Calculate positional encodings
        position_encoding = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)  
        # Calculate div_term used in the positional encodings
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        # Calculate sine and cosine positional encodings
        position_encoding[:, 0::2] = torch.sin(position * div_term)       
        position_encoding[:, 1::2] = torch.cos(position * div_term)
         # Register the positional encodings as a buffer
        self.register_buffer('position_encoding', position_encoding.unsqueeze(0))     
    def forward(self, x):
        seq_length = x.size(1)  
        x = x + self.position_encoding[:, :seq_length, :]    # Add positional encodings to the input
        x = self.dropout(x)        # Apply dropout
        return x

- This code defines an Encoder module for a Transformer-based neural network. Its primary purpose is to process input sequences by adding positional encodings and applying dropout. Here's a concise summary:

1. Initialization ( init method):
   - Initializes the Encoder module with parameters like hidden dimension (d_model), maximum sequence length (max_seq_length), and dropout rate (dropout).
   - Computes positional encodings for input sequences, which represent the order of tokens within the sequence and registers them as a buffer.

2. Forward Pass (forward method):
   - Takes an input tensor x of shape (batch_size, seq_length, d_model).
   - Adds positional encodings to the input tensor to convey positional information.
   - Applies dropout to the modified tensor for regularization.
   - Returns the modified tensor as the output.


###  class DecoderLayer

In [8]:
class DecoderLayer(nn.Module):    
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()    
        # Self-attention layer
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout)
        # Encoder-decoder attention layer
        self.enc_dec_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout) 
        # Position-wise feed-forward neural network
        self.positionwise_feed_forward = PositionWiseFeedForward(d_model, d_ff)
        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)        
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)  
        # Dropout
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, enc_output):   
        # Self-attention
        attn_output, _ = self.self_attn(x, x, x)
        x = x + self.dropout(attn_output)        
        x = self.norm1(x)
        # Encoder-decoder attention
        enc_dec_attn_output, _ = self.enc_dec_attn(x, enc_output, enc_output)        
        x = x + self.dropout(enc_dec_attn_output)
        x = self.norm2(x)  
        # Position-wise feed-forward neural network
        ffn_output = self.positionwise_feed_forward(x)
        x = x + self.dropout(ffn_output)        
        x = self.norm3(x)
        return x

- This code defines a DecoderLayer for a Transformer model. It processes input through self-attention, encoder-decoder attention, and a feed-forward network with residual connections and normalization. Multiple layers of this module are stacked to form the Transformer's decoder.

### class TransformerPredictionToxic

In [9]:
class TransformerPredictionToxic(nn.Module):    
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, dropout, max_seq_length):        
        super(TransformerPredictionToxic, self).__init__()    
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Positional encoding layer
        self.positional_encoding = Encoder(d_model, max_seq_length, dropout)   
        # Encoder layers
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        # Decoder layers
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])   
        # Fully connected layer for prediction
        self.fc = nn.Linear(d_model, 2)  # 2 classes: toxic or not toxic
        self.dropout = nn.Dropout(dropout)  # Dropout
    def forward(self, x):  
        # Embedding and positional encoding
        x = self.embedding(x)
        x = self.positional_encoding(x)  
        # Encoder layers
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x)        
        # Decoder layers    
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x, x)      
        # Global average pooling    
        x = torch.mean(x, dim=1)
        x = self.fc(x)    # Fully connected layer for prediction    
        return x

- This code defines a Transformer-based neural network model named TransformerPredictionToxic for binary text classification, specifically for identifying whether a text is toxic or not. The model takes text as input, applies embeddings and positional encodings, processes it through encoder and decoder layers, and produces a toxicity prediction using a fully connected layer.

## read data and clean

- read data and apply clean to handel data

In [10]:
# read data by read_csv
df = pd.read_csv(r"D:\The University\Level 2\tranning\sprints\6\0\train.csv")
# show all data
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


- all col int but comment_text is obj

In [12]:
# show missing data
df.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [13]:
# show duplicated data
df.duplicated().sum()

0

- no missing and duplicated

## clean

In [14]:
# Function to clean col comment_text 
def clean_text(text):
    # create lower case
    text = text.lower()
    # remove url
    text = re.sub(r'http\S+', '', text)
    # remove special characters and punctuation
    text = re.sub(r'[^\w\s\d]', ' ', text)
    # handle problem as (can't ,didn't , it's)
    text = re.sub(r"([a-zA-Z]+)'([a-zA-Z]+)", r"\1 \2", text)
    return text

In [15]:
# Apply function on col comment_text
df['comment_text_handel'] = df['comment_text'].apply(clean_text)
# show col comment_text_handel
df['comment_text_handel']

0         explanation\nwhy the edits made under my usern...
1         d aww  he matches this background colour i m s...
2         hey man  i m really not trying to edit war  it...
3          \nmore\ni can t make any real suggestions on ...
4         you  sir  are my hero  any chance you remember...
                                ...                        
159566          and for the second time of asking  when ...
159567    you should be ashamed of yourself \n\nthat is ...
159568    spitzer \n\numm  theres no actual article for ...
159569    and it looks like it was actually you who put ...
159570     \nand     i really don t think you understand...
Name: comment_text_handel, Length: 159571, dtype: object

## stop words

In [16]:
nltk.download('stopwords') # download stopwords becuase not found
# Create a set of stop words
stop_words = set(stopwords.words('english'))
#Function to remove stop words comment_text col
def remove_stopwords_text(text):
  #split text for applay stop words
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
#Apply function on comment_text_handel
df['comment_text_handel'] = df['comment_text_handel'].apply(remove_stopwords_text)
#show col comment_text_handel
df['comment_text_handel']

0         explanation edits made username hardcore metal...
1         aww matches background colour seemingly stuck ...
2         hey man really trying edit war guy constantly ...
3         make real suggestions improvement wondered sec...
4                             sir hero chance remember page
                                ...                        
159566    second time asking view completely contradicts...
159567    ashamed horrible thing put talk page 128 61 19 93
159568    spitzer umm theres actual article prostitution...
159569    looks like actually put speedy first version d...
159570    really think understand came idea bad right aw...
Name: comment_text_handel, Length: 159571, dtype: object

- now col is clean(no punctuation no stopwords) applay tokinzation to apply train model easy

## toknization

In [18]:
nltk.download('punkt')
# Function to tokenize comment_text col
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [19]:
# Apply function on comment_text_handel
df['comment_text_handel'] = df['comment_text_handel'].apply(tokenize_text)
# show col comment_text_handel
df['comment_text_handel']

0         [explanation, edits, made, username, hardcore,...
1         [aww, matches, background, colour, seemingly, ...
2         [hey, man, really, trying, edit, war, guy, con...
3         [make, real, suggestions, improvement, wondere...
4                       [sir, hero, chance, remember, page]
                                ...                        
159566    [second, time, asking, view, completely, contr...
159567    [ashamed, horrible, thing, put, talk, page, 12...
159568    [spitzer, umm, theres, actual, article, prosti...
159569    [looks, like, actually, put, speedy, first, ve...
159570    [really, think, understand, came, idea, bad, r...
Name: comment_text_handel, Length: 159571, dtype: object

- apply toknization data is ready to stemming

## stemming

In [20]:
# handel the recursion limit
sys.setrecursionlimit(10**6)
# Initialize the Porter Stemmer
stemmer = PorterStemmer()
# Function to apply stemming to a list of words
def stemmming_text(words):
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [21]:
# Apply function on comment_text_handel
df['comment_text_handel_2'] = df['comment_text_handel'].apply(stemmming_text)
# show col comment_text_handel
df['comment_text_handel_2']

0         explan edit made usernam hardcor metallica fan...
1         aww match background colour seemingli stuck th...
2         hey man realli tri edit war guy constantli rem...
3         make real suggest improv wonder section statis...
4                                sir hero chanc rememb page
                                ...                        
159566    second time ask view complet contradict covera...
159567       asham horribl thing put talk page 128 61 19 93
159568    spitzer umm there actual articl prostitut ring...
159569    look like actual put speedi first version dele...
159570    realli think understand came idea bad right aw...
Name: comment_text_handel_2, Length: 159571, dtype: object

# copy

In [22]:
# copy data to be esay for handel code
data = df.copy()
#show data
data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_handel,comment_text_handel_2
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, edits, made, username, hardcore,...",explan edit made usernam hardcor metallica fan...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[aww, matches, background, colour, seemingly, ...",aww match background colour seemingli stuck th...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, really, trying, edit, war, guy, con...",hey man realli tri edit war guy constantli rem...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[make, real, suggestions, improvement, wondere...",make real suggest improv wonder section statis...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[sir, hero, chance, remember, page]",sir hero chanc rememb page
...,...,...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0,"[second, time, asking, view, completely, contr...",second time ask view complet contradict covera...
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0,"[ashamed, horrible, thing, put, talk, page, 12...",asham horribl thing put talk page 128 61 19 93
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0,"[spitzer, umm, theres, actual, article, prosti...",spitzer umm there actual articl prostitut ring...
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0,"[looks, like, actually, put, speedy, first, ve...",look like actual put speedi first version dele...


- copy data to be easy acsses code

### create sample 

In [23]:
# Calculate the size of the sample (at least 40% of the original data)
#sample_size = max(len(data) * 4 // 100, 1)   #  ***not working on my pc
sample_size = max(len(data) * 1 // 100, 1)
# Randomly select rows to create the sample DataFrame
sample_df = data.sample(n=sample_size)

sample_df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_handel,comment_text_handel_2
26556,465ca8af0afcd303,You seem to be trying to push a certain POV on...,0,0,0,0,0,0,"[seem, trying, push, certain, pov, article, pl...",seem tri push certain pov articl pleas see wp ...
126994,a72a66073995bbab,Why would Emma be dressed as the pagan greek g...,0,0,0,0,0,0,"[would, emma, dressed, pagan, greek, goddess, ...",would emma dress pagan greek goddess cybel
99205,12a8bb9ac50c72ee,"""\nI'd say Zivo Blato is not heavy metal but h...",0,0,1,0,0,0,"[say, zivo, blato, heavy, metal, hard, rock, a...",say zivo blato heavi metal hard rock although ...
33426,59223dc85dfaa2ce,"""\nI think you mean delete, not delite. /Mid (...",0,0,0,0,0,0,"[think, mean, delete, delite, mid, contributions]",think mean delet delit mid contribut
77559,cfcb3cb0ab661f6b,Thanks for quietly indicating I wasn't paying ...,0,0,0,0,0,0,"[thanks, quietly, indicating, paying, proper, ...",thank quietli indic pay proper attent exchang ...
...,...,...,...,...,...,...,...,...,...,...
127526,aa0df73f3c66be2c,"""\n\nPlease do not be condescending and uncivi...",0,0,0,0,0,0,"[please, condescending, uncivil, comments, lik...",pleas condescend uncivil comment like sure dut...
124496,9a026cf66647da22,Mr. Break Grant Talk Page,0,0,0,0,0,0,"[mr, break, grant, talk, page]",mr break grant talk page
23034,3ce643b1a8648906,"""\nAs an internationalist who wants nothing mo...",0,0,0,0,0,0,"[internationalist, wants, nothing, equal, chan...",internationalist want noth equal chanc everyon...
40650,6c80d1b1752f00e9,"""\nThanks, I see that the permission has been ...",0,0,0,0,0,0,"[thanks, see, permission, handled, talk]",thank see permiss handl talk


- select 40% from data to apply train model but is not work on my pc then i select 1 % from data

### function numericalize_with_padding

In [24]:
# Define the maximum sequence 
max_seq_length = 100  

# Function to numericalize tokens with padding
def numericalize_with_padding(tokens):
    numericalized = [vocab[token] for token in tokens]  
    padded = numericalized[:max_seq_length] + [0] * (max_seq_length - len(numericalized))  # padding the squence
    return torch.tensor(padded)
# Tokenize and stem the comments
tokenized_comments = sample_df["comment_text_handel"]
# Build vocabulary from tokenized comments
vocab = torchtext.vocab.build_vocab_from_iterator(tokenized_comments)
vocab_size = len(vocab)

- create this function to change comment_text_handel from text to number to apply model

###  split data 

In [25]:
# Convert tokenized comments to numerical format with padding 
numericalized_comments = [numericalize_with_padding(tokens) for tokens in tokenized_comments]
# Prepare data for training and evaluation
labels = sample_df['toxic']
# split data into (train and test) data
X_train, X_eval, y_train, y_eval = train_test_split(numericalized_comments, labels, test_size = 0.2, train_size= 0.4, stratify = sample_df['toxic'], random_state=42)
# Convert the training and evaluation data to PyTorch tensors
train_dataset = TensorDataset(torch.stack(X_train), torch.tensor(y_train.to_numpy()))
test_dataset = TensorDataset(torch.stack(X_eval), torch.tensor(y_eval.to_numpy()))

- Now split data is ready to apply train model 

### train model

In [26]:
# Hyperparameters for Transformer model

# Model dimensionality
d_model = 256  
# Number of attention heads
num_heads = 4 
# Number of encoder and decoder layers
num_layers = 4    
# Dimension of the feed-forward network
d_ff = 512   
# Dropout rate for regularization
dropout = 0.2      
# Batch size for training
batch_size = 64  
# Learning rate for optimizer
learning_rate = 0.001 
# Number of training epochs
num_epochs = 3      

# Build the Transformer model
model = TransformerPredictionToxic(vocab_size, d_model, num_heads, num_layers, d_ff, dropout, max_seq_length)
# Use Adam optimizer for training
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Define the loss function (CrossEntropyLoss for classification tasks)
criterion = nn.CrossEntropyLoss()

# create train_loader to bulid in to train
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0.0  # Accumulator for epoch loss
    correct_predictions = 0  # Counter for correct predictions
    total_predictions = 0  # Counter for total predictions
    
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()  

        # Calculate accuracy for this batch
        _, predicted = torch.max(outputs, 1)
        correct_predictions += (predicted == targets).sum().item()
        total_predictions += targets.size(0)

    average_epoch_loss = epoch_loss / len(train_loader)  # Calculate average loss for the epoch
    accuracy = correct_predictions / total_predictions  # Calculate accuracy for the epoch

    print(f"Epoch: {epoch + 1}, Loss: {average_epoch_loss:.4f}, Accuracy: {accuracy * 100:.2f}%")

Epoch: 1, Loss: 0.5475, Accuracy: 83.70%
Epoch: 2, Loss: 0.3238, Accuracy: 90.60%
Epoch: 3, Loss: 0.3221, Accuracy: 90.60%


### test model 

In [27]:
# Define your test DataLoader (assuming you have a test_dataset)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Put the model in evaluation mode
model.eval()

# Variables to keep track of correct and total predictions
correct_predictions = 0
total_predictions = 0

# Iterate through the test data and calculate accuracy
for inputs, targets in test_loader:
    with torch.no_grad():  # Disable gradient computation during evaluation
        outputs = model(inputs)
    
    _, predicted = torch.max(outputs, 1)
    correct_predictions += (predicted == targets).sum().item()
    total_predictions += targets.size(0)

# Calculate accuracy
accuracy = correct_predictions / total_predictions
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 90.60%


#### the accuacy of train near accuarcy test but the sample 40% and more epoch as 50 not work in my pc 