**Setup** - Loading the training dataset

In [6]:
import pandas as pd
import zipfile
import os
import torch

# Where the zipped training data lives
zip_file_path = 'C:/Users/conno/Desktop/Coding/train.csv.zip'  # Update this to your zip file's path

# Folder that receives the unpacked archive contents
extraction_path = 'C:/Users/conno/Desktop/Coding/'  # Update this to your desired extraction directory

# The CSV is expected to appear directly inside the extraction folder
csv_file_path = os.path.join(extraction_path, 'train.csv')

# Unpack every member of the archive into the extraction folder
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extraction_path)

# Read the extracted CSV into a DataFrame and preview its first rows
train_df = pd.read_csv(csv_file_path)
print(train_df.head())


             id  discourse_id  discourse_start  discourse_end  \
0  423A1CA112E2  1.622628e+12              8.0          229.0   
1  423A1CA112E2  1.622628e+12            230.0          312.0   
2  423A1CA112E2  1.622628e+12            313.0          401.0   
3  423A1CA112E2  1.622628e+12            402.0          758.0   
4  423A1CA112E2  1.622628e+12            759.0          886.0   

                                      discourse_text discourse_type  \
0  Modern humans today are always on their phone....           Lead   
1  They are some really bad consequences when stu...       Position   
2  Some certain areas in the United States ban ph...       Evidence   
3  When people have phones, they know about certa...       Evidence   
4  Driving is one of the way how to get around. P...          Claim   

  discourse_type_num                                   predictionstring  
0             Lead 1  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...  
1         Position 1       45 46 4

In [7]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


This code snippet preprocesses the `discourse_text` column and encodes `discourse_type` into a numerical format. Make sure you have the required NLTK packages installed; the cell below downloads them if they are missing.

In [12]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.preprocessing import LabelEncoder

# Download the NLTK resources used by the preprocessing functions in this cell.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (it replaced the pickled 'punkt' models); 'punkt' is still fetched
# so the cell also works with older NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def clean_text(text):
    """Remove punctuation and collapse whitespace in ``text``.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The text with every character that is neither a word character
        nor whitespace removed, each run of whitespace replaced by a single
        space, and leading/trailing whitespace stripped.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (underscores count as word characters and are kept)
    text = re.sub(r'\s+', ' ', text)  # Replace any run of whitespace with a single space
    return text.strip()  # Drop the space left over at either end

# Lazily-built resources shared by every tokenize_and_normalize call, so the
# stop-word list is not re-read for each row of the DataFrame.
_NORMALIZATION_RESOURCES = {}


def _normalization_resources():
    """Return the (lemmatizer, stop-word set) pair, creating it on first use."""
    if not _NORMALIZATION_RESOURCES:
        # Build both objects before storing either, so a failed load never
        # leaves a half-populated cache behind.
        lemmatizer = WordNetLemmatizer()
        stop_words = frozenset(stopwords.words('english'))
        _NORMALIZATION_RESOURCES.update(lemmatizer=lemmatizer, stop_words=stop_words)
    return _NORMALIZATION_RESOURCES['lemmatizer'], _NORMALIZATION_RESOURCES['stop_words']


def tokenize_and_normalize(text):
    """Tokenize and normalize the text.

    The text is lower-cased and split with ``word_tokenize``; tokens found in
    NLTK's English stop-word list are dropped and the remaining tokens are
    lemmatized with ``WordNetLemmatizer``.

    Args:
        text (str): The text to normalize.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces.
    """
    lemmatizer, stop_words = _normalization_resources()
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    normalized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(normalized_tokens)

# train_df is expected to already hold the raw training data

# Build the cleaned text column: strip punctuation first, then tokenize and normalize
train_df['cleaned_discourse_text'] = (
    train_df['discourse_text']
    .apply(clean_text)
    .apply(tokenize_and_normalize)
)

# Turn each discourse type into an integer label
label_encoder = LabelEncoder()
label_encoder.fit(train_df['discourse_type'])
train_df['discourse_type_encoded'] = label_encoder.transform(train_df['discourse_type'])

# Show the raw and processed columns side by side for the first few rows
print(train_df.head()[['discourse_text', 'cleaned_discourse_text', 'discourse_type', 'discourse_type_encoded']])



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\conno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\conno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\conno\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                      discourse_text  \
0  Modern humans today are always on their phone....   
1  They are some really bad consequences when stu...   
2  Some certain areas in the United States ban ph...   
3  When people have phones, they know about certa...   
4  Driving is one of the way how to get around. P...   

                              cleaned_discourse_text discourse_type  \
0  modern human today always phone always phone 5...           Lead   
1    really bad consequence stuff happens come phone       Position   
2     certain area united state ban phone class room       Evidence   
3  people phone know certain apps apps like faceb...       Evidence   
4  driving one way get around people always phone...          Claim   

   discourse_type_encoded  
0                       4  
1                       5  
2                       3  
3                       3  
4                       0  


In [13]:
# Preview the leading rows of the whole DataFrame (every column, not just the preprocessing ones)
print(train_df.head(n=5))

             id  discourse_id  discourse_start  discourse_end  \
0  423A1CA112E2  1.622628e+12              8.0          229.0   
1  423A1CA112E2  1.622628e+12            230.0          312.0   
2  423A1CA112E2  1.622628e+12            313.0          401.0   
3  423A1CA112E2  1.622628e+12            402.0          758.0   
4  423A1CA112E2  1.622628e+12            759.0          886.0   

                                      discourse_text discourse_type  \
0  Modern humans today are always on their phone....           Lead   
1  They are some really bad consequences when stu...       Position   
2  Some certain areas in the United States ban ph...       Evidence   
3  When people have phones, they know about certa...       Evidence   
4  Driving is one of the way how to get around. P...          Claim   

  discourse_type_num                                   predictionstring  \
0             Lead 1  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...   
1         Position 1       45 46

Input IDs (tensor([[0, 30117, 278, ...]])):

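These are the token IDs for each token (word or sub-word piece) in your input text. The tokenizer converts tokens to their corresponding IDs based on the model's vocabulary.
In the RoBERTa example shown here, the 0 at the start of each sequence is the special start-of-sequence token `<s>`, which plays the role of BERT's `[CLS]`. Note that the code cell below uses the DistilBERT tokenizer, whose sequences start with `[CLS]` (ID 101) instead.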
Attention Masks (tensor([[1, 1, 1, ..., 0, 0, 0]])):

These masks tell the model which tokens should be paid attention to and which should be ignored.
1 indicates a real token, and 0 is used for padding tokens. This is important because your input sequences are padded to a uniform length (MAX_LEN).
Labels (tensor([3, 3, 4, 2, 0, 0, 0, 3, 5, 0, 3, 0, 0, 5, 0, 3])):

These are the encoded labels for each text in your batch. They correspond to the discourse_type_encoded values of your dataset.

In [15]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader


# Assuming your DataFrame is named 'train_df'
# and it includes 'cleaned_discourse_text' and 'discourse_type_encoded' columns

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Every item is a dict with the text itself, the ``input_ids`` and
    ``attention_mask`` produced by the tokenizer (flattened to 1-D), and the
    label as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        """Keep the indexable texts and labels, the tokenizer and the target length."""
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize the text at position ``item`` and bundle it with its label."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Pad or truncate to exactly max_len tokens and ask for PyTorch tensors.
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoding = self.tokenizer.encode_plus(text, **tokenizer_options)

        sample = {'text': text}
        # The tokenizer output carries a leading batch axis; flatten it away.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoding[field].flatten()
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

# Pre-trained DistilBERT tokenizer (lower-cased vocabulary)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Parameters
MAX_LEN = 128  # Maximum length of the tokens list. Adjust as needed.
BATCH_SIZE = 32  # Adjust based on your GPU capacity

# Create dataset and dataloader
dataset = CustomDataset(
    texts=train_df['cleaned_discourse_text'].to_numpy(),
    labels=train_df['discourse_type_encoded'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# num_workers=0 loads batches in the main process. On Windows, DataLoader
# workers are separate processes that have to re-import the dataset class,
# which fails (or hangs) for a class defined inside a notebook cell.
# Raise it only after moving CustomDataset into an importable .py module.
data_loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

# Example of iterating over the DataLoader
for data in data_loader:
    print(data['input_ids'], data['attention_mask'], data['labels'])
    break  # Remove this line to iterate over the whole dataset

  from .autonotebook import tqdm as notebook_tqdm


Step 1: Loading the Model and Setting Up Training

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and validation sets.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the discourse-type proportions the same in both subsets.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['cleaned_discourse_text'], train_df['discourse_type_encoded'],
    test_size=0.1,  # 10% for validation
    random_state=42,
    stratify=train_df['discourse_type_encoded'],
)


# Tokenize the texts for training and validation sets
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Dataset wrapper around the DistilBERT tokenizer output.
# NOTE(review): this class reuses the name `Dataset`, which an earlier cell
# imports from torch.utils.data; once this cell has run, that name refers to
# this class instead.
class Dataset(torch.utils.data.Dataset):
    """Serve pre-tokenized encodings and their labels as tensors."""

    def __init__(self, encodings, labels):
        """Keep the mapping of encoding fields and the parallel list of labels."""
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field plus ``labels`` for row ``idx`` as tensors."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Create datasets for training and validation
train_dataset = Dataset(train_encodings, train_labels.tolist())
val_dataset = Dataset(val_encodings, val_labels.tolist())

# Load the pre-trained DistilBERT model with a sequence-classification head
# sized to the number of distinct encoded labels
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(set(train_df['discourse_type_encoded'])),
).to(device)


# Define training arguments
# NOTE(review): a positive max_steps takes precedence over num_train_epochs,
# so this configuration stops after 100 optimizer steps. warmup_steps=500 is
# larger than that, so the learning rate is still warming up when training
# ends. Confirm which of these settings is intended before a full run.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    # Mixed precision needs a CUDA device; `device` falls back to the CPU when
    # none is available, and requesting fp16 there makes TrainingArguments fail.
    fp16=(device.type == "cuda"),
    gradient_accumulation_steps=2,
    max_steps=100
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model using the `trainer` built in the previous cell.
# The call is the cell's last expression, so its return value is shown as the cell output.
trainer.train()

  0%|          | 1/24351 [03:09<1280:13:03, 189.27s/it]

KeyboardInterrupt: 

In [1]:
import torch
# Report whether PyTorch can see a usable CUDA device (True means a GPU is available).
print(torch.cuda.is_available())


True
