# In [None]:
import os
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from PIL import Image
import pytesseract
import textract
# pytesseract launches this path as a program, so it must name the tesseract
# executable itself rather than the directory it is installed in.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def read_data_from_folder(folder_path):
    """Extract and tokenize the text of every entry in a folder.

    Each entry of ``folder_path`` is passed to ``process_file`` and the
    resulting text is tokenized with the ``bert-base-uncased`` tokenizer.

    Args:
        folder_path: Directory whose entries should be processed.

    Returns:
        A list with one tokenizer encoding per directory entry, ordered by
        file name.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    data = []
    # os.listdir returns names in arbitrary order; sorting makes the result
    # reproducible, which matters because callers pair items with label rows
    # by position.
    for file_name in sorted(os.listdir(folder_path)):
        text_data = process_file(os.path.join(folder_path, file_name))

        # Tokenize the extracted text and keep the encoding
        data.append(tokenizer(text_data, truncation=True, padding=True))

    return data

def process_file(file_path):
    """Extract the text content of a single file.

    Images (``.png``, ``.jpg``, ``.jpeg``) are run through Tesseract OCR and
    ``.docx`` documents are read with textract. The extension check is
    case-insensitive.

    Args:
        file_path: Path of the file to read.

    Returns:
        The extracted text, or an empty string for any other file type.
    """
    # splitext only inspects the final path component, so a dot in a directory
    # name, or an extension-less file that happens to be called "png", is not
    # mistaken for a supported type.
    extension = os.path.splitext(file_path)[1].lower()

    if extension in ('.png', '.jpg', '.jpeg'):
        # Close the image handle as soon as OCR is done.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image, lang='eng')

    if extension == '.docx':
        return textract.process(file_path).decode('utf-8')

    return ""






# Load metadata labels
train_labels = pd.read_csv('train.csv')
test_labels = pd.read_csv('test.csv')

# Read and process data from train and test folders
train_data = read_data_from_folder(r"C:\Users\vk100\Desktop\assignment\data\train")
test_data = read_data_from_folder(r"C:\Users\vk100\Desktop\assignment\data\test")

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

# read_data_from_folder returns one tokenizer encoding per file. Decode the
# token ids back into text so the whole set can be re-tokenized as one padded
# batch; calling str() on an encoding would yield the repr of its dict
# instead of the document text.
train_texts = [tokenizer.decode(enc['input_ids'], skip_special_tokens=True) for enc in train_data]
test_texts = [tokenizer.decode(enc['input_ids'], skip_special_tokens=True) for enc in test_data]

# Tokenize and encode text data
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# The renewal-notice column has been referred to both with and without the
# "(Days)" suffix; use whichever spelling the CSV actually contains so the
# coercion step and the label tensor agree on one name.
renewal_column = next(
    (name for name in ('Renewal Notice (Days)', 'Renewal Notice') if name in train_labels.columns),
    None,
)
if renewal_column is None:
    raise KeyError("Renewal Notice")

# Convert non-numeric data to numeric in specific columns (read_csv already
# infers numeric dtypes for the remaining columns where possible)
numeric_columns = ['Agreement Value', 'Agreement Start Date', 'Agreement End Date', renewal_column]
train_labels[numeric_columns] = train_labels[numeric_columns].apply(pd.to_numeric, errors='coerce')
# NOTE(review): values that cannot be parsed as numbers (presumably the date
# columns) become NaN here and would turn the loss into NaN - confirm the CSV
# format and add a proper date encoding if needed.

# Convert labels to tensor
label_columns = numeric_columns + ['Party One', 'Party Two']
# NOTE(review): 'Party One' / 'Party Two' presumably hold names; if so they
# cannot be converted to floats and need an encoding step first - confirm
# against train.csv.
train_labels_tensor = torch.tensor(train_labels[label_columns].to_numpy(dtype='float32'))

# Ensure input data and labels have the same size (documents are paired with
# label rows by position)
if len(train_texts) != len(train_labels_tensor):
    raise ValueError("Input data and labels must have the same size.")

# Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# from_pretrained returns the model in evaluation mode; switch to training
# mode so dropout is active while fitting.
model.train()
# NOTE(review): with float targets and num_labels > 1, transformers infers a
# multi-label (BCE-with-logits) objective - confirm that is intended for raw
# field values, or set an explicit problem_type.
for epoch in range(3):
    for input_ids, attention_mask, target in zip(
        train_encodings['input_ids'], train_encodings['attention_mask'], train_labels_tensor
    ):
        optimizer.zero_grad()
        outputs = model(
            input_ids=torch.tensor([input_ids]),
            attention_mask=torch.tensor([attention_mask]),
            # Add the batch dimension so the target shape matches the logits.
            labels=target.unsqueeze(0),
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Testing
model.eval()
# Inference needs no gradients; skipping them avoids building autograd graphs.
with torch.no_grad():
    for input_ids, attention_mask in zip(test_encodings['input_ids'], test_encodings['attention_mask']):
        outputs = model(input_ids=torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
        predicted_labels = outputs.logits


# In [None]:
from sklearn.metrics import recall_score

# Function to calculate recall for each field
def calculate_recall(true_values, predicted_values):
    """Return the binary recall of each column of two 2-D arrays.

    Column ``k`` of ``true_values`` is scored against column ``k`` of
    ``predicted_values`` with ``recall_score(..., average='binary')``.

    Args:
        true_values: 2-D array of ground-truth labels, one column per field.
        predicted_values: 2-D array of predictions with the same layout.

    Returns:
        A list with one recall value per column of ``true_values``.
    """
    column_count = true_values.shape[1]
    return [
        recall_score(true_values[:, col], predicted_values[:, col], average='binary')
        for col in range(column_count)
    ]

# Label columns that are scored, listed once so the ground-truth lookup and
# the report below cannot drift apart.
# NOTE(review): confirm these names match the header of test.csv exactly (in
# particular the spelling of the renewal-notice column).
field_names = ['Agreement Value', 'Agreement Start Date', 'Agreement End Date', 'Renewal Notice', 'Party One', 'Party Two']

# Testing
predicted_labels_list = []
model.eval()
# Inference needs no gradients; skipping them avoids building an autograd
# graph for every test document.
with torch.no_grad():
    for input_ids, attention_mask in zip(test_encodings['input_ids'], test_encodings['attention_mask']):
        outputs = model(input_ids=torch.tensor([input_ids]), attention_mask=torch.tensor([attention_mask]))
        predicted_labels = outputs.logits
        predicted_labels_list.append(predicted_labels)

# Convert the list of predicted labels to a numpy array
predicted_labels_array = torch.cat(predicted_labels_list).detach().numpy()

# Convert the ground truth labels to a numpy array
true_labels_array = test_labels[field_names].values

# Calculate recall for each field
# NOTE(review): the predictions are raw logits, while recall_score with
# average='binary' expects binary class labels - a decoding/thresholding step
# is presumably still needed before this metric is meaningful.
recalls = calculate_recall(true_labels_array, predicted_labels_array)

print("Recall for each field:")
for field, recall in zip(field_names, recalls):
    print(f"{field}: {recall}")