In [None]:
import os

# Set the directory path
directory = "/Users/shaistasyeda/Desktop/DataSet/TextFiles"

# Create a new file to store the merged contents
for year in range(1803, 2007):
    output_file_path = f"{year}-merged.txt"

    with open(output_file_path, "w") as output_file:
        # Loop through all files in the directory
        for filename in os.listdir(directory):
            # Check if the filename contains the year
            if str(year) in filename:
                # Open the file and read the contents
                with open(os.path.join(directory, filename), "r") as file:
                    contents = file.read()
                    # Write the contents to the merged file
                    output_file.write(contents)

    print(f'Merged data for {year} and saved to {output_file_path}')

In [1]:
import os
import re
import pandas as pd
from transformers import BertTokenizer

In [None]:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Path to data files
data_directory = '/Users/shaistasyeda/Desktop/DataSet/Merged-files'

In [None]:
# Function to tokenize and format data
def process_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text_data = file.readlines()

    # Extract year from the file name using regular expression
    year_match = re.search(r'(\d{4})-merged\.txt', file_path)
    if year_match:
        year = int(year_match.group(1))
    else:
        raise ValueError(f"Could not extract the year from the file name: {file_path}")

    # Tokenize and format data for BERT
    tokenized_data = []
    for text in text_data:
        tokenized_input = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
        tokenized_data.append({
            'text': text.strip(),  # Add the actual text to the DataFrame
            'input_ids': tokenized_input['input_ids'].squeeze().tolist(),
            'attention_mask': tokenized_input['attention_mask'].squeeze().tolist(),
            'year': year
        })

    return tokenized_data

In [None]:
# Process all files in the data directory
all_data = []
for filename in os.listdir(data_directory):
    if filename.endswith("-merged.txt"):
        file_path = os.path.join(data_directory, filename)
        tokenized_data = process_file(file_path)
        all_data.extend(tokenized_data)
        # Delete the original file
        os.remove(file_path)


In [None]:
# Convert the tokenized data to a Pandas DataFrame
df = pd.DataFrame(all_data)

In [None]:
# Save the DataFrame 
df.to_parquet('/Users/shaistasyeda/Desktop/DataSet/tokenized_data_1.parquet')

In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.dataset import random_split
import pandas as pd

In [12]:
# Load Parquet DataFrame
df = pd.read_parquet('/Users/shaistasyeda/Desktop/DataSet/tokenized_data_1.parquet')

In [13]:
# Original DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
                                                      text  \
0        acquainted the House that their Address of Wed...   
1        (Answered by Mr. A. J. Balfour.) I understand ...   
2        (Answered by Mr. A. J. Balfour.) I am afraid I...   
3        (Answered by Mr. Walter Long.) No opportunity ...   
4        (Answered by Mr. Bonar Law.) The value of the ...   
...                                                    ...   
2282429  is the appropriate heading to which they shoul...   
2282430  from interest allowable and interest disallowa...   
2282431  premium bonds, for which there is a fixed limi...   
2282432  respect of the financial year 1966–67, the num...   
2282433  the Order, was to stop an unnecessary strain o...   

                                                 input_ids  \
0        [101, 19056, 1996, 2160, 2008, 2037, 4769, 199...   
1        [101, 1006, 4660, 2011, 2720, 1012, 1037, 1012...   
2        [101, 1006, 4660, 2011, 2720, 1012, 1037

In [14]:
# Function to apply the labeling
def label_year(year):
    if 1850 <= year <= 1920:
        return 0
    else:
        return 1

In [15]:
df['label'] = df['year'].apply(label_year)
print(df)

                                                      text  \
0        acquainted the House that their Address of Wed...   
1        (Answered by Mr. A. J. Balfour.) I understand ...   
2        (Answered by Mr. A. J. Balfour.) I am afraid I...   
3        (Answered by Mr. Walter Long.) No opportunity ...   
4        (Answered by Mr. Bonar Law.) The value of the ...   
...                                                    ...   
2282429  is the appropriate heading to which they shoul...   
2282430  from interest allowable and interest disallowa...   
2282431  premium bonds, for which there is a fixed limi...   
2282432  respect of the financial year 1966–67, the num...   
2282433  the Order, was to stop an unnecessary strain o...   

                                                 input_ids  \
0        [101, 19056, 1996, 2160, 2008, 2037, 4769, 199...   
1        [101, 1006, 4660, 2011, 2720, 1012, 1037, 1012...   
2        [101, 1006, 4660, 2011, 2720, 1012, 1037, 1012...   
3      

In [16]:
df.groupby('label').describe()

Unnamed: 0_level_0,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,942621.0,1883.132608,20.579734,1850.0,1866.0,1880.0,1901.0,1920.0
1,1339813.0,1952.192676,13.280689,1921.0,1944.0,1955.0,1963.0,1970.0


In [17]:
df['label'].dtypes

dtype('int64')

In [18]:
import pandas as pd

# Define a list of keywords related to women
keywords = ['woman', 'women', 'she', 'her', 'female', 'girl', 'lady', 'mother', 'daughter', 'aunt', 'niece']

# Create a boolean mask for sentences that contain any of the keywords
mask = df['text'].apply(lambda x: any(keyword in x.lower() for keyword in keywords))

# Create a new DataFrame with only sentences that contain the keywords
df_women_related = pd.DataFrame(df[mask])


In [19]:
df_women_related

Unnamed: 0,text,input_ids,attention_mask,year,label
0,acquainted the House that their Address of Wed...,"[101, 19056, 1996, 2160, 2008, 2037, 4769, 199...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1905,0
2,(Answered by Mr. A. J. Balfour.) I am afraid I...,"[101, 1006, 4660, 2011, 2720, 1012, 1037, 1012...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1905,0
3,(Answered by Mr. Walter Long.) No opportunity ...,"[101, 1006, 4660, 2011, 2720, 1012, 4787, 2146...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1905,0
21,"Cattle , , Oats. , , : To ask the Secretary ...","[101, 7125, 1010, 1010, 1051, 11149, 1012, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1905,0
22,(Answered by Mr. Secretary Akers-Douglas.) Thi...,"[101, 1006, 4660, 2011, 2720, 1012, 3187, 1771...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1905,0
...,...,...,...,...,...
2282429,is the appropriate heading to which they shoul...,"[101, 2003, 1996, 6413, 5825, 2000, 2029, 2027...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1969,1
2282430,from interest allowable and interest disallowa...,"[101, 2013, 3037, 3499, 3085, 1998, 3037, 4487...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1969,1
2282431,"premium bonds, for which there is a fixed limi...","[101, 12882, 9547, 1010, 2005, 2029, 2045, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1969,1
2282432,"respect of the financial year 1966–67, the num...","[101, 4847, 1997, 1996, 3361, 2095, 3547, 1516...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1969,1


In [20]:
df_women_related.to_csv('/Users/shaistasyeda/Desktop/DataSet/bert-dataframe-related-woman.csv', index=False)

In [12]:
df_women_related.groupby('label').describe()

Unnamed: 0_level_0,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,405397.0,1884.574901,20.989863,1850.0,1867.0,1883.0,1902.0,1920.0
1,758440.0,1952.488851,13.120405,1921.0,1945.0,1955.0,1963.0,1970.0


In [21]:
df = df_women_related

In [22]:
# Sample 20,000 rows from each class
df_sampled_0 = df[df['label'] == 0].sample(n=20000, random_state=42)
df_sampled_1 = df[df['label'] == 1].sample(n=20000, random_state=42)

In [23]:
# Combine the two sampled subsets
df_sampled = pd.concat([df_sampled_0, df_sampled_1])

In [24]:
df_sampled.groupby('label').describe()

Unnamed: 0_level_0,year,year,year,year,year,year,year,year
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,20000.0,1884.5336,21.076335,1850.0,1867.0,1883.0,1902.0,1920.0
1,20000.0,1952.4793,13.169033,1921.0,1945.0,1955.0,1963.0,1970.0


In [25]:
# Shuffle the combined dataset
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [26]:
df_sampled.to_csv('/Users/shaistasyeda/Desktop/DataSet/bert-dataframe-w.csv', index=False)

In [1]:
import pandas as pd
df =pd.read_csv('/Users/shaistasyeda/Desktop/DataSet/bert-dataframe-w.csv')

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Extract text and labels
texts = df['text'].tolist()
labels = df['label'].tolist()

In [4]:
# Initialize bert-small tokenizer
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-4_H-512_A-8')

In [5]:
# Tokenize the data
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

In [6]:
# Split data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [7]:
# Tokenize the train and validation sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

In [8]:
# Convert labels into torch tensors
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

In [9]:
# Define a custom dataset
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
# Create dataset objects
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

In [11]:
# Prepare data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [12]:
# Load bert-small model
model = BertForSequenceClassification.from_pretrained('google/bert_uncased_L-4_H-512_A-8', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-4_H-512_A-8 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [14]:
# Training and validation loop
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss, train_correct = 0, 0
    for batch in train_loader:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        train_loss += loss.item()
        train_correct += (outputs.logits.argmax(dim=-1) == batch['labels']).sum().item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = train_correct / len(train_loader.dataset)

    # Validation
    model.eval()
    val_loss, val_correct = 0, 0
    for batch in val_loader:
        batch = {k: v.to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        val_loss += outputs.loss.item()
        val_correct += (outputs.logits.argmax(dim=-1) == batch['labels']).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = val_correct / len(val_loader.dataset)

    print(f'Epoch {epoch + 1}:')
    print(f'Training Loss: {avg_train_loss}, Training Accuracy: {train_accuracy}')
    print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}')

Epoch 1:
Training Loss: 0.27244958000630143, Training Accuracy: 0.88253125
Validation Loss: 0.21988102419674396, Validation Accuracy: 0.90625
Epoch 2:
Training Loss: 0.15135690008476377, Training Accuracy: 0.938625
Validation Loss: 0.19077790389582513, Validation Accuracy: 0.921875
Epoch 3:
Training Loss: 0.08454910147190094, Training Accuracy: 0.968
Validation Loss: 0.21479382083564996, Validation Accuracy: 0.9235


In [15]:
# Save the model
model.save_pretrained('/Users/shaistasyeda/Desktop/DataSet/bert-model-W')
tokenizer.save_pretrained('/Users/shaistasyeda/Desktop/DataSet/bert-tokenizer-W')

('/Users/shaistasyeda/Desktop/DataSet/bert-tokenizer-W/tokenizer_config.json',
 '/Users/shaistasyeda/Desktop/DataSet/bert-tokenizer-W/special_tokens_map.json',
 '/Users/shaistasyeda/Desktop/DataSet/bert-tokenizer-W/vocab.txt',
 '/Users/shaistasyeda/Desktop/DataSet/bert-tokenizer-W/added_tokens.json')

In [16]:
# Load the trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('/Users/shaistasyeda/Desktop/DataSet/bert-model-W')
tokenizer = BertTokenizer.from_pretrained('/Users/shaistasyeda/Desktop/DataSet/bert-tokenizer-W')

In [17]:
# Ensure the model is in evaluation mode
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, e

In [18]:
def classify_statement(statement):
    # Tokenize the statement
    inputs = tokenizer(statement, return_tensors="pt", padding=True, truncation=True, max_length=128)

    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    # Predict
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get the predicted class (0 or 1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()

    return predicted_class


In [19]:
# Example usage
statement = "women are seducers"
prediction = classify_statement(statement)
print(f"The statement is classified as class {prediction}")

The statement is classified as class 0
