<a href="https://colab.research.google.com/github/tushant-akar/CS367-Artifical-Intelligence-Lab/blob/main/PlagarismDetector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import string
from queue import PriorityQueue

nltk.download('punkt', quiet=True)

True

In [2]:
def preprocess_text(text):
    """Split *text* into sentences and normalise each one.

    Every sentence produced by ``nltk.sent_tokenize`` is lower-cased and has
    all characters listed in ``string.punctuation`` removed.

    Returns:
        A list of normalised sentence strings, in document order.
    """
    # Translation table that deletes punctuation; built once and reused.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return [
        raw_sentence.lower().translate(strip_punctuation)
        for raw_sentence in nltk.sent_tokenize(text)
    ]

In [3]:
def levenshtein_distance(s1, s2):
    """Return the edit distance between the sequences *s1* and *s2*.

    Insertions, deletions and substitutions each cost 1.  Only two rows of
    the dynamic-programming table are kept at any time, and the inner loop
    always runs over the shorter of the two inputs.
    """
    # Order the operands so the table rows follow the shorter sequence.
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)
    if len(shorter) == 0:
        return len(longer)

    prev = list(range(len(shorter) + 1))
    for row_no, item_long in enumerate(longer, start=1):
        curr = [row_no]
        for col, item_short in enumerate(shorter):
            curr.append(min(
                prev[col + 1] + 1,                    # insertion
                curr[col] + 1,                        # deletion
                prev[col] + (item_long != item_short),  # substitution / match
            ))
        prev = curr
    return prev[-1]

In [4]:
def heuristic(remaining_sentences1, remaining_sentences2):
    """Estimate the cost still required to align the two remaining suffixes.

    Args:
        remaining_sentences1: Unprocessed sentences of document 1.
        remaining_sentences2: Unprocessed sentences of document 2.

    Returns:
        An integer lower bound on the remaining alignment cost.

    When either side is exhausted, every leftover sentence has to be skipped,
    so the estimate is the total length of what remains.

    Otherwise each remaining sentence of document 1 must either be aligned
    with some remaining sentence of document 2 (cost: their edit distance) or
    be skipped (cost: its own length, as charged by ``a_star_alignment``).
    The cheaper of those two options is therefore the most that may be
    counted for it; counting the bare minimum edit distance would
    overestimate whenever a short sentence only has much longer candidates,
    and an overestimating heuristic can make the search settle on a
    non-minimal alignment.
    """
    if not remaining_sentences1 or not remaining_sentences2:
        return (
            sum(len(s) for s in remaining_sentences1)
            + sum(len(s) for s in remaining_sentences2)
        )
    return sum(
        min(
            len(s1),  # skipping s1 is never more expensive than this
            min(levenshtein_distance(s1, s2) for s2 in remaining_sentences2),
        )
        for s1 in remaining_sentences1
    )

def a_star_alignment(sentences1, sentences2):
    """Search for a sentence alignment between two documents.

    A search state is ``(pos1, pos2, cost, alignment)``: the number of
    sentences consumed from each document, the cost accumulated so far and
    the list of ``(index1, index2)`` pairs aligned so far.  From a state the
    search may align the two current sentences (cost: their edit distance) or
    skip the current sentence of either document (cost: its length).  States
    are expanded in order of accumulated cost plus ``heuristic`` of the
    remaining suffixes.

    Returns:
        The list of aligned index pairs of the first goal state taken from
        the queue, or ``[]`` if the queue empties without reaching one.
    """
    total1, total2 = len(sentences1), len(sentences2)

    open_states = PriorityQueue()
    # Queue entries are (priority, (pos1, pos2, cost, alignment)).
    open_states.put((0, (0, 0, 0, [])))

    while not open_states.empty():
        _, (i, j, cost_so_far, pairs) = open_states.get()

        if i == total1 and j == total2:
            return pairs

        has_left = i < total1
        has_right = j < total2

        # Each successor: (next pos1, next pos2, step cost, alignment list).
        successors = []
        if has_left and has_right:
            # Align current sentences
            step = levenshtein_distance(sentences1[i], sentences2[j])
            successors.append((i + 1, j + 1, step, pairs + [(i, j)]))
        if has_left:
            # Skip sentence in document 1
            successors.append((i + 1, j, len(sentences1[i]), pairs))
        if has_right:
            # Skip sentence in document 2
            successors.append((i, j + 1, len(sentences2[j]), pairs))

        for next_i, next_j, step, next_pairs in successors:
            next_cost = cost_so_far + step
            estimate = heuristic(sentences1[next_i:], sentences2[next_j:])
            open_states.put(
                (next_cost + estimate, (next_i, next_j, next_cost, next_pairs))
            )

    return []  # No alignment found

In [5]:
def detect_plagiarism(doc1, doc2, threshold=0.7):
    """Align the sentences of two documents and score each aligned pair.

    Both documents are normalised with ``preprocess_text`` and aligned with
    ``a_star_alignment``.  For every aligned pair the edit distance and a
    similarity ``1 - distance / max(len1, len2)`` are computed (similarity is
    1 when both sentences are empty).

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.
        threshold: Accepted for interface compatibility; it is not applied,
            so every aligned pair is returned whatever its similarity.

    Returns:
        A list of ``(index1, index2, similarity, distance)`` tuples.
    """
    first = preprocess_text(doc1)
    second = preprocess_text(doc2)

    scored_pairs = []
    for i, j in a_star_alignment(first, second):
        # Ignore any pair whose index falls outside either sentence list.
        if i >= len(first) or j >= len(second):
            continue
        left, right = first[i], second[j]
        distance = levenshtein_distance(left, right)
        longest = max(len(left), len(right))
        if longest > 0:
            similarity = 1 - (distance / longest)
        else:
            similarity = 1
        scored_pairs.append((i, j, similarity, distance))

    return scored_pairs

In [6]:
def run_test_case(doc1, doc2, case_name, expected_output):
    """Run ``detect_plagiarism`` on one document pair and print a report.

    The report lists the inputs, every aligned sentence pair with its
    similarity and edit distance, the averages of both metrics, and a
    conclusion chosen from the average similarity.

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.
        case_name: Label printed in the report header.
        expected_output: Description of the expected outcome; it is accepted
            but neither printed nor checked.
    """
    print(f"Test Case: {case_name}")
    print("Input:")
    print(f"Document 1: {doc1}")
    print(f"Document 2: {doc2}")

    matches = detect_plagiarism(doc1, doc2)

    print("Actual Output:")
    if not matches:
        print("No significant alignments detected")
    for i, j, similarity, distance in matches:
        print(f"Sentence {i+1} in Document 1 aligns with Sentence {j+1} in Document 2")
        print(f"Similarity: {similarity:.2f}, Edit Distance: {distance}")

    print("\nAnalysis:")
    if matches:
        count = len(matches)
        avg_similarity = sum(match[2] for match in matches) / count
        avg_distance = sum(match[3] for match in matches) / count
    else:
        avg_similarity = 0
        avg_distance = 0
    print(f"Average Similarity: {avg_similarity:.2f}")
    print(f"Average Edit Distance: {avg_distance:.2f}")

    # Tiers are checked from the highest bound down; the first strict match wins.
    tiers = (
        (0.9, "Conclusion: Identical or near-identical documents"),
        (0.7, "Conclusion: High likelihood of plagiarism"),
        (0.5, "Conclusion: Moderate likelihood of plagiarism or partial overlap"),
        (0.3, "Conclusion: Low likelihood of plagiarism"),
    )
    for lower_bound, verdict in tiers:
        if avg_similarity > lower_bound:
            print(verdict)
            break
    else:
        print("Conclusion: No plagiarism detected")
    print()

In [7]:
# Test cases
print("Problem: Plagiarism Detection")
print("Implement a plagiarism detection system using sentence alignment and A* search.")
print()

# Each entry: (document 1, document 2, case name, expected-outcome description).
test_cases = [
    # Test Case 1: Identical Documents
    (
        "This is a test document. It contains multiple sentences. We aim to detect plagiarism.",
        "This is a test document. It contains multiple sentences. We aim to detect plagiarism.",
        "Identical Documents",
        "All sentences should align perfectly, with zero edit distance.",
    ),
    # Test Case 2: Slightly Modified Document
    (
        "This is an original document. It includes several important points. The goal is to identify similar content.",
        "This is a original paper. It contains multiple important points. The goal is to detect similiar content.",
        "Slightly Modified Document",
        "Most sentences should align with a low edit distance.",
    ),
    # Test Case 3: Completely Different Documents
    (
        "Photosynthesis is a process used by plants. It converts light energy into chemical energy. This energy is stored as glucose.",
        "Machine learning is a subset of AI. It focuses on algorithm development. These algorithms improve computer performance on specific tasks.",
        "Completely Different Documents",
        "High edit distances for most alignments, indicating no plagiarism.",
    ),
    # Test Case 4: Partial Overlap
    (
        "The rapid advancement of technology has significantly transformed the way we communicate. Social media platforms, in particular, have become central to modern communication, influencing both personal and professional interactions. This shift has created new opportunities for connectivity, but also presents challenges related to privacy and information overload.",
        "ocial media platforms have fundamentally altered the landscape of communication in the digital age. While these advancements provide unprecedented opportunities for global connectivity, they also introduce complexities such as privacy concerns and the potential for information overload. The impact of technology on communication is profound, reshaping how we interact on both personal and professional levels.",
        "Partial Overlap",
        "The overlapping content should align with a low edit distance, indicating potential plagiarism.",
    ),
]

for doc1, doc2, case_name, expected in test_cases:
    run_test_case(doc1, doc2, case_name, expected)

Problem: Plagiarism Detection
Implement a plagiarism detection system using sentence alignment and A* search.

Test Case: Identical Documents
Input:
Document 1: This is a test document. It contains multiple sentences. We aim to detect plagiarism.
Document 2: This is a test document. It contains multiple sentences. We aim to detect plagiarism.
Actual Output:
Sentence 1 in Document 1 aligns with Sentence 1 in Document 2
Similarity: 1.00, Edit Distance: 0
Sentence 2 in Document 1 aligns with Sentence 2 in Document 2
Similarity: 1.00, Edit Distance: 0
Sentence 3 in Document 1 aligns with Sentence 3 in Document 2
Similarity: 1.00, Edit Distance: 0

Analysis:
Average Similarity: 1.00
Average Edit Distance: 0.00
Conclusion: Identical or near-identical documents

Test Case: Slightly Modified Document
Input:
Document 1: This is an original document. It includes several important points. The goal is to identify similar content.
Document 2: This is a original paper. It contains multiple important