<a href="https://colab.research.google.com/github/tubagokhan/RegNLPDataset/blob/main/Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Open and load the dataset from the JSON file.
# The encoding is passed explicitly: JSON interchange text is UTF-8, whereas
# open() would otherwise decode with the platform's locale encoding.
json_path = "/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/SharedTask/Processed_Obligations.json"
with open(json_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract texts and their corresponding obligation labels from the dataset.
# Both lists iterate `data` in the same order, so texts[i] pairs with labels[i].
# Every record is expected to carry both keys; a missing one raises KeyError.
texts = [item['Text'] for item in data]
labels = [item['Obligation'] for item in data]

# Hold out 20% of the examples for evaluation; the fixed seed makes the split repeatable
# NOTE(review): the split is not stratified — consider stratify=labels if the classes are imbalanced
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Assemble the model: TF-IDF features (English stop words dropped) feeding a
# Logistic Regression classifier that may run up to 1000 solver iterations
vectorizer = TfidfVectorizer(stop_words='english')
classifier = LogisticRegression(max_iter=1000)
pipeline = make_pipeline(vectorizer, classifier)

# Fit on the training portion, then predict labels for the held-out portion
# (fit() returns the fitted pipeline itself, so the two calls can be chained)
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions against the true labels.
# average='binary' reports precision/recall/F1 for the positive class only
# NOTE(review): presumably the 'Obligation' labels are binary — confirm against the dataset
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score(y_test, y_pred, average='binary')
    for score in (precision_score, recall_score, f1_score)
)

# Report every metric on its own line, rounded to four decimal places
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)


Accuracy: 0.8880
Precision: 0.8876
Recall: 0.9955
F1 Score: 0.9384
