In [None]:
import os

# Set the directory path
directory = "/Users/shaistasyeda/Desktop/DataSet/TextFiles"

# Create a new file to store the merged contents
for year in range(1803, 2007):
    output_file_path = f"{year}-merged.txt"

    with open(output_file_path, "w") as output_file:
        # Loop through all files in the directory
        for filename in os.listdir(directory):
            # Check if the filename contains the year
            if str(year) in filename:
                # Open the file and read the contents
                with open(os.path.join(directory, filename), "r") as file:
                    contents = file.read()
                    # Write the contents to the merged file
                    output_file.write(contents)

    print(f'Merged data for {year} and saved to {output_file_path}')

In [1]:
import os
import re
import pandas as pd
from transformers import BertTokenizer

In [None]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Path to data files
data_directory = '/Users/shaistasyeda/Desktop/DataSet/Merged-files'

In [None]:
# Function to tokenize and format data
def process_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text_data = file.readlines()

    # Extract year from the file name using regular expression
    year_match = re.search(r'(\d{4})-merged\.txt', file_path)
    if year_match:
        year = int(year_match.group(1))
    else:
        raise ValueError(f"Could not extract the year from the file name: {file_path}")

    # Tokenize and format data for BERT
    tokenized_data = []
    for text in text_data:
        tokenized_input = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
        tokenized_data.append({
            'text': text.strip(),  # Add the actual text to the DataFrame
            'input_ids': tokenized_input['input_ids'].squeeze().tolist(),
            'attention_mask': tokenized_input['attention_mask'].squeeze().tolist(),
            'year': year
        })

    return tokenized_data

In [None]:
# Process all files in the data directory
all_data = []
for filename in os.listdir(data_directory):
    if filename.endswith("-merged.txt"):
        file_path = os.path.join(data_directory, filename)
        tokenized_data = process_file(file_path)
        all_data.extend(tokenized_data)
        # Delete the original file
        os.remove(file_path)


In [None]:
# Convert the tokenized data to a Pandas DataFrame
df = pd.DataFrame(all_data)

In [None]:
# Save the DataFrame 
df.to_parquet('/Users/shaistasyeda/Desktop/DataSet/tokenized_data_1.parquet')

In [2]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split

In [3]:
# Load Parquet DataFrame
df = pd.read_parquet('/Users/shaistasyeda/Desktop/DataSet/tokenized_data_1.parquet')

In [4]:
# Original DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
                                                      text  \
0        acquainted the House that their Address of Wed...   
1        (Answered by Mr. A. J. Balfour.) I understand ...   
2        (Answered by Mr. A. J. Balfour.) I am afraid I...   
3        (Answered by Mr. Walter Long.) No opportunity ...   
4        (Answered by Mr. Bonar Law.) The value of the ...   
...                                                    ...   
2282429  is the appropriate heading to which they shoul...   
2282430  from interest allowable and interest disallowa...   
2282431  premium bonds, for which there is a fixed limi...   
2282432  respect of the financial year 1966–67, the num...   
2282433  the Order, was to stop an unnecessary strain o...   

                                                 input_ids  \
0        [101, 19056, 1996, 2160, 2008, 2037, 4769, 199...   
1        [101, 1006, 4660, 2011, 2720, 1012, 1037, 1012...   
2        [101, 1006, 4660, 2011, 2720, 1012, 1037

In [5]:
# Function to apply the labeling
def label_year(year):
    if 1850 <= year <= 1920:
        return 0
    else:
        return 1

In [6]:
df['label'] = df['year'].apply(label_year)
print(df)

                                                      text  \
0        acquainted the House that their Address of Wed...   
1        (Answered by Mr. A. J. Balfour.) I understand ...   
2        (Answered by Mr. A. J. Balfour.) I am afraid I...   
3        (Answered by Mr. Walter Long.) No opportunity ...   
4        (Answered by Mr. Bonar Law.) The value of the ...   
...                                                    ...   
2282429  is the appropriate heading to which they shoul...   
2282430  from interest allowable and interest disallowa...   
2282431  premium bonds, for which there is a fixed limi...   
2282432  respect of the financial year 1966–67, the num...   
2282433  the Order, was to stop an unnecessary strain o...   

                                                 input_ids  \
0        [101, 19056, 1996, 2160, 2008, 2037, 4769, 199...   
1        [101, 1006, 4660, 2011, 2720, 1012, 1037, 1012...   
2        [101, 1006, 4660, 2011, 2720, 1012, 1037, 1012...   
3      

In [7]:
df.groupby('label').describe()

Unnamed: 0_level_0,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,942621.0,1883.132608,20.579734,1850.0,1866.0,1880.0,1901.0,1920.0
1,1339813.0,1952.192676,13.280689,1921.0,1944.0,1955.0,1963.0,1970.0


In [8]:
df['label'].dtypes

dtype('int64')

In [9]:
# Sample 20,000 rows from each class
df_sampled_0 = df[df['label'] == 0].sample(n=10000, random_state=42)
df_sampled_1 = df[df['label'] == 1].sample(n=10000, random_state=42)

In [10]:
# Combine the two sampled subsets
df_sampled = pd.concat([df_sampled_0, df_sampled_1])

In [11]:
df_sampled.groupby('label').describe()

Unnamed: 0_level_0,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,10000.0,1882.6507,20.526608,1850.0,1865.0,1879.0,1900.0,1920.0
1,10000.0,1952.1757,13.17849,1921.0,1944.0,1955.0,1963.0,1970.0


In [12]:
# Shuffle the combined dataset
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [14]:
df_sampled.to_csv('/Users/shaistasyeda/Desktop/DataSet/bert-dataframe.csv', index=False)

In [15]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss

In [16]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Split data into training and validation sets
train_df, val_df = train_test_split(df_sampled, test_size=0.2, random_state=42)

In [18]:
# Define training parameters
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
batch_size = 8



In [19]:
class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids.iloc[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask.iloc[idx], dtype=torch.long),
            'label': torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        }


In [20]:
def collate_batch(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['label'] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = torch.stack(labels)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [21]:
# Creating DataLoaders
train_dataset = CustomDataset(train_df['input_ids'], train_df['attention_mask'], train_df['label'])
val_dataset = CustomDataset(val_df['input_ids'], val_df['attention_mask'], val_df['label'])

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [22]:
from tqdm import tqdm

In [23]:
for epoch in range(num_epochs):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} Training"):
        optimizer.zero_grad()
        inputs = {
            'input_ids': batch['input_ids'],
            'attention_mask': batch['attention_mask'],
            'labels': batch['labels']  
        }
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        
       # Validation loop
    model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for val_batch in val_loader:
            val_inputs = {
                'input_ids': val_batch['input_ids'],
                'attention_mask': val_batch['attention_mask'],
                'labels': val_batch['labels']
            }
            val_outputs = model(**val_inputs)
            val_loss += val_outputs.loss.item()

            # Calculate accuracy
            logits = val_outputs.logits
            predictions = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(predictions == val_batch['labels']).item()
            total_samples += len(predictions)  
            
    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch + 1} - Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch 1 Training: 100%|███████████████████| 2000/2000 [5:06:24<00:00,  9.19s/it]


Epoch 1 - Validation Loss: 0.4342, Accuracy: 0.8015


Epoch 2 Training: 100%|███████████████████| 2000/2000 [4:36:05<00:00,  8.28s/it]


Epoch 2 - Validation Loss: 0.3468, Accuracy: 0.8515


Epoch 3 Training: 100%|███████████████████| 2000/2000 [4:31:37<00:00,  8.15s/it]


Epoch 3 - Validation Loss: 0.4823, Accuracy: 0.8105


In [24]:
torch.save(model.state_dict(), '/Users/shaistasyeda/Desktop/DataSet/bert_model_weights_20k.pth')