In [10]:
import os
import random
import shutil

directory = r"C:\Users\hp\OneDrive\Desktop\Pyhton\Lex_Res\dataset\IN-Abs\train-data\judgement"

num_files= 50

all_files = os.listdir(directory)

selected_files = random.sample(all_files, num_files)

output_directory = r"C:\Users\hp\OneDrive\Desktop\Pyhton\Lex_Res\dataset\IN-Abs\Lab-1_out"
os.makedirs(output_directory, exist_ok=True)

for file_name in selected_files:
    file_path = os.path.join(directory, file_name)
    shutil.copy(file_path, output_directory)


In [11]:
import os
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


directory = r"C:\Users\hp\OneDrive\Desktop\Pyhton\Lex_Res\dataset\IN-Abs\train-data\judgement"

def read_text_files(directory):
    text = ""
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text += file.read() + " "
    return text

student_answers_text = read_text_files(directory).lower()

stop_words = set(stopwords.words('english'))
filtered_text = ' '.join(word for word in student_answers_text.split() if word not in stop_words)
word_counts = Counter(filtered_text.split())
most_common_words = word_counts.most_common(10)
comm_words = [word for word, _ in most_common_words]


print("10 Most Common Words:")
for word, count in most_common_words:
    print(f"{word}: {count}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


10 Most Common Words:
section: 172951
court: 161300
high: 85603
act: 84716
order: 78759
case: 74943
would: 68493
state: 67644
may: 67037
made: 66592


In [12]:
#DistilBERT

import os
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity


def read_text_files(output_directory):
    instances = []
    for filename in os.listdir(output_directory):
        filepath = os.path.join(output_directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read().strip()
            instances.append(text)
    return instances

instances = read_text_files(output_directory)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained("distilbert-base-uncased")


def get_distilbert_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
    embeddings = torch.mean(output.last_hidden_state, dim=1).squeeze().numpy()
    return embeddings


seed_set_embeddings = np.array([get_distilbert_embedding(word) for word in comm_words])
instances_embeddings = np.array([get_distilbert_embedding(instance) for instance in instances])
cosine_similarities = cosine_similarity(instances_embeddings, seed_set_embeddings)


print("Cosine Similarity Scores (DistilBERT):")
print(cosine_similarities)


Cosine Similarity Scores (DistilBERT):
[[0.20146713 0.24047151 0.17923944 ... 0.1234692  0.2279946  0.26077187]
 [0.21442153 0.23868933 0.1944493  ... 0.1361439  0.2516606  0.27307642]
 [0.2198585  0.24854828 0.18504138 ... 0.13990173 0.24965522 0.27396134]
 ...
 [0.20447534 0.24261372 0.1759761  ... 0.12676504 0.2408554  0.26743725]
 [0.20669276 0.24383539 0.17624404 ... 0.13335696 0.23383984 0.26146773]
 [0.2270654  0.2582651  0.19604611 ... 0.14498278 0.2713579  0.29332423]]


In [14]:
import pandas as pd
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity

file_path = r"C:\Users\hp\OneDrive\Desktop\Pyhton\Lex_Res\dataset\IN-Abs\Lab-1_out"
instances = file_path

seed_set = ['section', 'court', 'high', 'act', 'order', 'case', 'would', 'state', 'may', 'made']

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained("roberta-base")

def get_roberta_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**encoded_input)
    embeddings = torch.mean(output.last_hidden_state, dim=1).squeeze().numpy()
    return embeddings


seed_set_embeddings = np.array([get_roberta_embedding(word) for word in seed_set])
instances_embeddings = np.array([get_roberta_embedding(instance) for instance in instances])
cosine_similarities_roberta = cosine_similarity(instances_embeddings, seed_set_embeddings)

print("Cosine Similarity Scores (RoBERTa):")
print(cosine_similarities_roberta)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Cosine Similarity Scores (RoBERTa):
[[0.98738515 0.98761845 0.9869428  0.98756915 0.9873197  0.9864273
  0.9870385  0.9867585  0.9873066  0.9873691 ]
 [0.9817469  0.98200715 0.98042893 0.9835068  0.98315895 0.9802439
  0.9830696  0.98097354 0.9835026  0.98248255]
 [0.9888695  0.9895716  0.9868641  0.98917085 0.989339   0.9877735
  0.98951846 0.98727435 0.9903146  0.9890726 ]
 [0.9872854  0.9888688  0.98796463 0.9879556  0.98944354 0.98793334
  0.9889802  0.9898157  0.98932904 0.9887208 ]
 [0.99117035 0.9895297  0.9892731  0.9891218  0.9902905  0.9887982
  0.99014986 0.99071336 0.99050415 0.9902965 ]
 [0.98890465 0.9874206  0.9860842  0.98816425 0.987254   0.98551023
  0.98664296 0.98831505 0.9870095  0.98679644]
 [0.9899863  0.987856   0.98813766 0.9886341  0.98848426 0.9866415
  0.9879627  0.9883324  0.98866683 0.98831004]
 [0.9911703  0.98952985 0.9892732  0.98912174 0.9902905  0.98879826
  0.99015    0.99071354 0.99050415 0.9902965 ]
 [0.9888695  0.9895716  0.9868641  0.98917085 0.9

In [15]:
import pandas as pd
import numpy as np
import fasttext
from sklearn.metrics.pairwise import cosine_similarity

instances = r"C:\Users\hp\OneDrive\Desktop\Pyhton\Lex_Res\dataset\IN-Abs\Lab-1_out"

seed_set = ['section', 'court', 'high', 'act', 'order', 'case', 'would', 'state', 'may', 'made']

fasttext_model = fasttext.load_model('path_to_fasttext_model.bin') 

seed_set_embeddings_fasttext = np.array([fasttext_model.get_word_vector(word) for word in seed_set])
instances_embeddings_fasttext = np.array([fasttext_model.get_sentence_vector(instance) for instance in instances])
cosine_similarities_fasttext = cosine_similarity(instances_embeddings_fasttext, seed_set_embeddings_fasttext)

print("Cosine Similarity Scores (FastText):")
print(cosine_similarities_fasttext)


ModuleNotFoundError: No module named 'fasttext'