In [None]:
import numpy as np
import gensim
from gensim.models import KeyedVectors
from transformers import RobertaTokenizer, RobertaModel, BertTokenizer, BertModel  # ðŸ†• Added Bert
import torch
import re
import time
import ipywidgets as widgets
from IPython.display import display, clear_output
import gensim.downloader as api

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Reference sentences that every input is compared against.
repository_sentences = [
    "artificial intelligence is transforming industries by automating tasks and improving efficiency",
    "climate change is a global challenge that requires immediate attention to mitigate its adverse effects",
    "the rapid advancement of technology has significantly impacted communication and information sharing",
    "renewable energy sources such as solar and wind are crucial for reducing dependence on fossil fuels",
    "data privacy and security have become major concerns in the digital age requiring stringent measures",
]
# Whitespace-split tokens, one list per repository sentence.
repo_tokens = list(map(str.split, repository_sentences))

# Load the Word2Vec vectors, retrying up to `max_attempts` times with a
# 5 second pause between attempts; the final failure is re-raised.
max_attempts = 3
for attempt in range(max_attempts):
    try:
        word2vec_model = api.load('word2vec-google-news-300')
        print("Word2Vec model loaded successfully!")
        break
    except Exception as err:
        print(f"Attempt {attempt + 1} failed: {err}")
        if attempt == max_attempts - 1:
            raise
        time.sleep(5)

# --- Sentence vector using Word2Vec ---
def get_sentence_vector(tokens, model):
    """Return the mean embedding of the tokens that ``model`` knows.

    Args:
        tokens: Iterable of word strings.
        model: Word-vector lookup supporting ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or
        a zero vector of length ``model.vector_size`` when none are known.
    """
    known_vectors = []
    for word in tokens:
        if word in model:
            known_vectors.append(model[word])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# --- Load RoBERTa model ---
# Tokenizer and encoder both come from the 'roberta-base' checkpoint; the
# encoder is moved to `device` and put in evaluation mode.
print("Loading RoBERTa model...")
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')
roberta_model = roberta_model.to(device)
roberta_model.eval()
print("RoBERTa model loaded successfully!")

# --- Load BERT model ---
# Same procedure for the 'bert-base-uncased' checkpoint.
print("Loading BERT model...")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model = bert_model.to(device)
bert_model.eval()
print("BERT model loaded successfully!")

# --- Get embedding using RoBERTa [CLS] (<s>) token ---
def get_roberta_embedding(text, tokenizer, model):
    """Embed ``text`` using the first-position hidden state of ``model``.

    The text is tokenized (truncated to at most 512 tokens, with padding
    enabled), moved to the module-level ``device`` and run through the model
    without gradient tracking.

    Args:
        text: Input passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array holding the last hidden state at sequence position 0,
        with singleton dimensions squeezed away.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 is the sequence-start token (<s> for RoBERTa tokenizers).
    first_token = hidden_states[:, 0, :]
    return first_token.cpu().numpy().squeeze()

# --- Get embedding using BERT [CLS] token ---
def get_bert_embedding(text, tokenizer, model):
    """Embed ``text`` using the first-position hidden state of ``model``.

    Args:
        text: Input passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer returning PyTorch tensors; called with
            truncation to 512 tokens and padding enabled.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array holding the last hidden state at sequence position 0,
        with singleton dimensions squeezed away.
    """
    tokenizer_kwargs = {
        'return_tensors': 'pt',
        'truncation': True,
        'padding': True,
        'max_length': 512,
    }
    batch = tokenizer(text, **tokenizer_kwargs).to(device)

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        result = model(**batch)

    cls_state = result.last_hidden_state[:, 0, :]
    return cls_state.cpu().numpy().squeeze()

# --- Plagiarism Classification Function ---
def _cosine_similarity(vec_a, vec_b, eps=1e-8):
    """Return the cosine similarity of two 1-D vectors.

    ``eps`` is added to the denominator so that an all-zero vector produces
    0.0 rather than a division by zero.
    """
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + eps)


def classify_plagiarism(input_sentence, repo_sentences, repo_tokens,
                        word2vec_model, roberta_model, roberta_tokenizer,
                        bert_model, bert_tokenizer):
    """Classify ``input_sentence`` against a repository of reference sentences.

    The input is lower-cased and stripped of every character that is neither
    a word character nor whitespace. Each repository sentence is then scored
    with Word2Vec, BERT and RoBERTa cosine similarity, word overlap (share of
    distinct input tokens also present in the repository sentence) and the
    absolute token-count difference. Candidates are dropped when word overlap
    is below 0.2 or when all three cosine similarities are below 0.6. Among
    the survivors the one with the highest weighted similarity
    (0.3 Word2Vec + 0.3 BERT + 0.4 RoBERTa) is classified.

    Args:
        input_sentence: Sentence to check.
        repo_sentences: Sequence of reference sentences.
        repo_tokens: Token lists aligned with ``repo_sentences``.
        word2vec_model: Word-vector model passed to ``get_sentence_vector``.
        roberta_model: Encoder passed to ``get_roberta_embedding``.
        roberta_tokenizer: Tokenizer passed to ``get_roberta_embedding``.
        bert_model: Encoder passed to ``get_bert_embedding``.
        bert_tokenizer: Tokenizer passed to ``get_bert_embedding``.

    Returns:
        A 6-tuple ``(predicted_class, matched_sentence, features,
        similarity_percentage, bert_sim, roberta_sim)``. ``predicted_class``
        is one of ``"Cut-Paste"``, ``"Light Paraphrasing"``,
        ``"Heavy Paraphrasing"`` or ``"No Match"``. ``features`` is the array
        ``[w2v_sim, bert_sim, roberta_sim, word_overlap, length_diff]`` of the
        best candidate. When no candidate survives (including an input with
        no tokens or an empty repository) the result is
        ``("No Match", None, None, 0.0, 0.0, 0.0)``.
    """
    no_match = ("No Match", None, None, 0.0, 0.0, 0.0)

    input_sentence = re.sub(r'[^\w\s]', '', input_sentence.lower())
    input_tokens = input_sentence.split()

    # An input made only of punctuation/whitespace has no tokens; without this
    # guard the word-overlap ratio below would divide by zero.
    if not input_tokens or len(repo_sentences) == 0:
        return no_match
    input_vocab = set(input_tokens)

    # The input-side embeddings do not depend on the repository sentence, so
    # compute them once instead of once per candidate.
    input_vec = get_sentence_vector(input_tokens, word2vec_model)
    input_roberta = get_roberta_embedding(input_sentence, roberta_tokenizer, roberta_model)
    input_bert = get_bert_embedding(input_sentence, bert_tokenizer, bert_model)

    features = []
    valid_indices = []
    for i, (repo_sent, repo_tok) in enumerate(zip(repo_sentences, repo_tokens)):
        # Cheap lexical filter first: candidates it rejects never need the
        # transformer forward passes.
        word_overlap = len(input_vocab & set(repo_tok)) / len(input_vocab)
        if word_overlap < 0.2:
            continue

        w2v_sim = _cosine_similarity(
            input_vec, get_sentence_vector(repo_tok, word2vec_model))
        roberta_sim = _cosine_similarity(
            input_roberta,
            get_roberta_embedding(repo_sent, roberta_tokenizer, roberta_model))
        bert_sim = _cosine_similarity(
            input_bert,
            get_bert_embedding(repo_sent, bert_tokenizer, bert_model))

        # Skip candidates that every embedding model considers dissimilar.
        if w2v_sim < 0.6 and bert_sim < 0.6 and roberta_sim < 0.6:
            continue

        length_diff = abs(len(input_tokens) - len(repo_tok))
        features.append([w2v_sim, bert_sim, roberta_sim, word_overlap, length_diff])
        valid_indices.append(i)

    if not features:
        return no_match

    features = np.array(features)
    # Weighted blend of the three cosine similarities picks the best candidate.
    combined_sim = features[:, 0] * 0.3 + features[:, 1] * 0.3 + features[:, 2] * 0.4
    best_match_idx = np.argmax(combined_sim)
    w2v_sim, bert_sim, roberta_sim, word_overlap, length_diff = features[best_match_idx]

    # --- Classification rules ---
    # NOTE(review): a candidate with word_overlap >= 0.6 whose weakest
    # similarity is below 0.85 falls through to "No Match", while the same
    # similarities with a lower overlap are "Heavy Paraphrasing". Confirm
    # this gap is intended.
    weakest_sim = min(w2v_sim, bert_sim, roberta_sim)
    if weakest_sim > 0.98 and word_overlap > 0.95 and length_diff == 0:
        predicted_class = "Cut-Paste"
    elif weakest_sim >= 0.85 and word_overlap >= 0.6:
        predicted_class = "Light Paraphrasing"
    elif weakest_sim >= 0.6 and word_overlap < 0.6:
        predicted_class = "Heavy Paraphrasing"
    else:
        predicted_class = "No Match"

    similarity_percentage = (
        w2v_sim * 0.3 + bert_sim * 0.3 + roberta_sim * 0.3 + word_overlap * 0.1
    ) * 100
    matched_sentence = repo_sentences[valid_indices[best_match_idx]]

    return (predicted_class, matched_sentence, features[best_match_idx],
            similarity_percentage, bert_sim, roberta_sim)

# --- Widgets UI for interactive testing ---
# Single-line text box that holds the sentence to be checked.
input_text = widgets.Text(
    value='',
    placeholder='Enter a sentence to check for plagiarism',
    description='Input:',
    layout={'width': '500px'}
)

# Button that triggers classification once a click handler is attached.
classify_button = widgets.Button(
    description='Classify',
    button_style='success',
    tooltip='Click to classify the sentence',
)

# Output widget used as the capture target for the handler's printed results.
output_area = widgets.Output()

def on_classify_button_clicked(b):
    """Classify the sentence in ``input_text`` and print the result.

    Registered as the click handler of ``classify_button``. All output is
    written inside ``output_area``, which is cleared first.

    Args:
        b: Argument supplied by ``Button.on_click``; unused.
    """
    with output_area:
        clear_output()
        input_sentence = input_text.value.strip()
        if not input_sentence:
            print("Please enter a sentence.")
            return

        # The last two returned values duplicate entries of `features`, so
        # they are ignored here and read from `features` below.
        predicted_class, matched_sentence, features, similarity_percentage, _, _ = classify_plagiarism(
            input_sentence, repository_sentences, repo_tokens,
            word2vec_model, roberta_model, roberta_tokenizer,
            bert_model, bert_tokenizer
        )

        print(f"Input Sentence: {input_sentence}")
        print(f"Matched Sentence: {matched_sentence if matched_sentence else 'None'}")
        print(f"Predicted Plagiarism Type: {predicted_class}")
        print(f"Overall Similarity: {similarity_percentage:.2f}%")

        if features is None:
            print("No features available (no valid match found).")
            return

        w2v_sim, bert_sim, roberta_sim, word_overlap, length_diff = features
        # "\u2022" is the bullet character; written as an escape so the
        # output no longer depends on how this file is encoded or decoded.
        print("\n--- Detailed Similarity Metrics ---")
        print(f"\u2022 Word2Vec Similarity:  {w2v_sim:.4f}")
        print(f"\u2022 BERT Similarity:      {bert_sim:.4f}")
        print(f"\u2022 RoBERTa Similarity:   {roberta_sim:.4f}")
        print(f"\u2022 Word Overlap:          {word_overlap:.4f}")
        # The difference is a token count; it comes back as a float only
        # because it is stored in the float feature array.
        print(f"\u2022 Length Difference:     {int(length_diff)}")

# Attach the handler so each button click runs a classification.
classify_button.on_click(on_classify_button_clicked)

print("\nPlagiarism Detection Tool (Word2Vec + BERT + RoBERTa Hybrid)\n")
# Render the text box, the button and the result area in the notebook.
display(input_text, classify_button, output_area)

# Test example
# Pre-fills the text box only; classification still needs a button click.
input_text.value = "artificial intelligence is modifying industries by automating jobs and enhancing performance"


Using device: cuda
Word2Vec model loaded successfully!
Loading RoBERTa model...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RoBERTa model loaded successfully!
Loading BERT model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  52%|#####2    | 231M/440M [00:00<?, ?B/s]

Exception in thread Thread-3:
Traceback (most recent call last):
  File "c:\Users\Roy\AppData\Local\Programs\Python\Python311\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\Users\Roy\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\_monitor.py", line 84, in run
    instance.refresh(nolock=True)
  File "c:\Users\Roy\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\std.py", line 1347, in refresh
    self.display()
  File "c:\Users\Roy\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\notebook.py", line 171, in display
    rtext.value = right
    ^^^^^^^^^^^
  File "c:\Users\Roy\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\traitlets.py", line 716, in __set__
    self.set(obj, value)
  File "c:\Users\Roy\AppData\Local\Programs\Python\Python311\Lib\site-packages\traitlets\traitlets.py", line 706, in set
    obj._notify_trait(self.name, old_value, new_value)
  File "c:\Users\Roy\AppData\Local\Program

BERT model loaded successfully!

Plagiarism Detection Tool (Word2Vec + BERT + RoBERTa Hybrid)



Text(value='', description='Input:', layout=Layout(width='500px'), placeholder='Enter a sentence to check for â€¦

Button(button_style='success', description='Classify', style=ButtonStyle(), tooltip='Click to classify the senâ€¦

Output()

In [1]:
import numpy as np
import gensim
from gensim.models import KeyedVectors
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import re
import time
import ipywidgets as widgets
from IPython.display import display, clear_output
from elasticsearch import Elasticsearch

# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Connect to a local Elasticsearch node and stop immediately when it does
# not answer the ping.
es = Elasticsearch("http://localhost:9200")
if not es.ping():
    raise ValueError("Elasticsearch connection failed!")
print("Connected to Elasticsearch")

# Index layout: the raw sentence, its tokens, and one dense vector per
# embedding model.
# NOTE(review): dims 300 / 768 presumably match the Word2Vec and DistilBERT
# output sizes used below - confirm if either model is swapped.
index_name = "sentences"
mapping = {
    "mappings": {
        "properties": {
            "sentence": {"type": "text"},
            "tokens": {"type": "keyword"},
            "word2vec_vector": {"type": "dense_vector", "dims": 300},
            "bert_vector": {"type": "dense_vector", "dims": 768},
        }
    }
}
# Create the index only when it does not exist yet.
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)
    print(f"Created index: {index_name}")

# Load Pre-trained Models
import gensim.downloader as api
# Try the Word2Vec load up to `max_attempts` times, pausing 5 seconds between
# attempts; the error from the final attempt is re-raised.
max_attempts = 3
for attempt in range(max_attempts):
    try:
        word2vec_model = api.load('word2vec-google-news-300')
        print("Word2Vec model loaded successfully!")
        break
    except Exception as err:
        print(f"Attempt {attempt + 1} failed: {err}")
        if attempt == max_attempts - 1:
            raise
        time.sleep(5)

def get_sentence_vector(tokens, model):
    """Average the word vectors of ``tokens`` found in ``model``.

    Args:
        tokens: Iterable of word strings.
        model: Word-vector lookup supporting ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute.

    Returns:
        Element-wise mean of the vectors of the in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when there are none.
    """
    in_vocab = [model[tok] for tok in tokens if tok in model]
    if in_vocab:
        return np.mean(in_vocab, axis=0)
    # No known word at all: fall back to the zero vector.
    return np.zeros(model.vector_size)

# DistilBERT tokenizer and encoder from the 'distilbert-base-uncased'
# checkpoint; the encoder is moved to `device` and put in evaluation mode.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
bert_model = bert_model.to(device)
bert_model.eval()

def get_bert_embedding(text, tokenizer, model):
    """Embed ``text`` using the first-position hidden state of ``model``.

    Args:
        text: Input passed straight to ``tokenizer``.
        tokenizer: Callable tokenizer returning PyTorch tensors; called with
            truncation to 512 tokens and padding enabled.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        NumPy array holding the last hidden state at sequence position 0,
        with singleton dimensions squeezed away.
    """
    model_inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)

    # Inference only: skip gradient bookkeeping for the forward pass.
    with torch.no_grad():
        last_hidden = model(**model_inputs).last_hidden_state

    return last_hidden[:, 0, :].cpu().numpy().squeeze()

# Index Sentences
# Reference sentences that are written to the Elasticsearch index below and
# that every input is compared against.
repository_sentences = [
    "artificial intelligence is transforming industries by automating tasks and improving efficiency",
    "climate change is a global challenge that requires immediate attention to mitigate its adverse effects",
    "the rapid advancement of technology has significantly impacted communication and information sharing",
    "renewable energy sources such as solar and wind are crucial for reducing dependence on fossil fuels",
    "data privacy and security have become major concerns in the digital age requiring stringent measures"
]

def index_sentences(sentences, es, index_name, word2vec_model, bert_model, tokenizer):
    """Embed each sentence and store it as a document in Elasticsearch.

    Tokens and both embeddings are computed from the lower-cased sentence
    with every character that is neither a word character nor whitespace
    removed. That is the same normalisation the query side applies in
    ``classify_plagiarism``, so indexed and queried text stay comparable.
    The original sentence is stored unchanged in the ``sentence`` field.

    Args:
        sentences: Sequence of sentences; the position of each sentence is
            used as its document id.
        es: Elasticsearch client providing ``index``.
        index_name: Name of the target index.
        word2vec_model: Word-vector model passed to ``get_sentence_vector``.
        bert_model: Encoder passed to ``get_bert_embedding``.
        tokenizer: Tokenizer passed to ``get_bert_embedding``.
    """
    for doc_id, sentence in enumerate(sentences):
        normalized = re.sub(r'[^\w\s]', '', sentence.lower())
        tokens = normalized.split()
        doc = {
            "sentence": sentence,
            "tokens": tokens,
            "word2vec_vector": get_sentence_vector(tokens, word2vec_model).tolist(),
            "bert_vector": get_bert_embedding(normalized, tokenizer, bert_model).tolist(),
        }
        es.index(index=index_name, id=doc_id, body=doc)
    print(f"Indexed {len(sentences)} sentences")

# Remove every document currently in the index, then index the repository
# again so the index content matches `repository_sentences`.
es.delete_by_query(index=index_name, body={"query": {"match_all": {}}})
index_sentences(repository_sentences, es, index_name, word2vec_model, bert_model, tokenizer)

# Updated Plagiarism Detection
def _cosine_similarity(vec_a, vec_b, eps=1e-8):
    """Return the cosine similarity of two 1-D vectors.

    ``eps`` is added to the denominator so that an all-zero vector produces
    0.0 rather than a division by zero.
    """
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + eps)


def classify_plagiarism(input_sentence, es, index_name, word2vec_model, bert_model, tokenizer,
                        top_k=5):
    """Classify ``input_sentence`` against the sentences stored in Elasticsearch.

    The input is lower-cased and stripped of every character that is neither
    a word character nor whitespace. Elasticsearch ranks the stored documents
    by a weighted blend of Word2Vec (0.4) and BERT (0.6) cosine similarity.
    The returned hits are re-scored locally, filtered (Word2Vec similarity
    at least 0.6 and word overlap at least 0.2), and the hit with the highest
    blended similarity is classified.

    Args:
        input_sentence: Sentence to check.
        es: Elasticsearch client providing ``search``.
        index_name: Name of the index to search.
        word2vec_model: Word-vector model passed to ``get_sentence_vector``.
        bert_model: Encoder passed to ``get_bert_embedding``.
        tokenizer: Tokenizer passed to ``get_bert_embedding``.
        top_k: Number of hits requested from Elasticsearch (default 5, the
            previously hard-coded value).

    Returns:
        A 4-tuple ``(predicted_class, matched_sentence, features,
        similarity_percentage)``. ``predicted_class`` is one of
        ``"Cut-Paste"``, ``"Light Paraphrasing"``, ``"Heavy Paraphrasing"``
        or ``"No Match"``. ``features`` is the array
        ``[w2v_sim, bert_sim, word_overlap, length_diff]`` of the best hit.
        When nothing qualifies the result is ``("No Match", None, None, 0.0)``.
    """
    no_match = ("No Match", None, None, 0.0)

    input_sentence = re.sub(r'[^\w\s]', '', input_sentence.lower())
    input_tokens = input_sentence.split()
    # An input made only of punctuation/whitespace has no tokens; without this
    # guard the word-overlap ratio below would divide by zero.
    if not input_tokens:
        return no_match
    input_vocab = set(input_tokens)

    input_w2v = get_sentence_vector(input_tokens, word2vec_model)
    # With an all-zero Word2Vec vector every hit would score w2v_sim == 0 and
    # fail the 0.6 threshold below, so the outcome is already known. Returning
    # here also avoids sending a zero-magnitude vector to cosineSimilarity.
    if not np.any(input_w2v):
        return no_match
    input_bert = get_bert_embedding(input_sentence, tokenizer, bert_model)

    query = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    # script_score must not produce a negative score. The
                    # weighted cosine blend lies in [-1, 1], so 1.0 is added;
                    # this shifts every score equally and keeps the ranking.
                    "source": (
                        "cosineSimilarity(params.query_w2v, 'word2vec_vector') * 0.4 + "
                        "cosineSimilarity(params.query_bert, 'bert_vector') * 0.6 + 1.0"
                    ),
                    "params": {
                        "query_w2v": input_w2v.tolist(),
                        "query_bert": input_bert.tolist()
                    }
                }
            }
        },
        "size": top_k
    }
    response = es.search(index=index_name, body=query)

    features = []
    valid_hits = []
    for hit in response["hits"]["hits"]:
        source = hit["_source"]
        repo_tok = source["tokens"]

        # Recompute the plain cosine similarities locally; the Elasticsearch
        # score is only used to pick the candidate hits.
        w2v_sim = _cosine_similarity(input_w2v, np.asarray(source["word2vec_vector"]))
        bert_sim = _cosine_similarity(input_bert, np.asarray(source["bert_vector"]))
        word_overlap = len(input_vocab & set(repo_tok)) / len(input_vocab)
        length_diff = abs(len(input_tokens) - len(repo_tok))

        if w2v_sim < 0.6 or word_overlap < 0.2:
            continue
        features.append([w2v_sim, bert_sim, word_overlap, length_diff])
        valid_hits.append(hit)

    if not features:
        return no_match

    features = np.array(features)
    combined_sim = features[:, 0] * 0.4 + features[:, 1] * 0.6
    best_match_idx = np.argmax(combined_sim)
    w2v_sim, bert_sim, word_overlap, length_diff = features[best_match_idx]

    # NOTE(review): a hit with word_overlap >= 0.6 but a similarity below 0.85
    # falls through to "No Match", while the same similarities with a lower
    # overlap are "Heavy Paraphrasing". Confirm this gap is intended.
    if w2v_sim > 0.99 and bert_sim > 0.99 and word_overlap > 0.95 and length_diff == 0:
        predicted_class = "Cut-Paste"
    elif w2v_sim >= 0.85 and bert_sim >= 0.85 and word_overlap >= 0.6:
        predicted_class = "Light Paraphrasing"
    elif w2v_sim >= 0.6 and bert_sim >= 0.6 and word_overlap < 0.6:
        predicted_class = "Heavy Paraphrasing"
    else:
        predicted_class = "No Match"

    similarity_percentage = (w2v_sim * 0.4 + bert_sim * 0.4 + word_overlap * 0.2) * 100
    matched_sentence = valid_hits[best_match_idx]["_source"]["sentence"]
    return predicted_class, matched_sentence, features[best_match_idx], similarity_percentage

# UI
# Single-line text box that holds the sentence to be checked.
input_text = widgets.Text(
    value='',
    placeholder='Enter a sentence to check for plagiarism',
    description='Input:',
    layout={'width': '500px'}
)
# Button that triggers classification once a click handler is attached.
classify_button = widgets.Button(
    description='Classify',
    button_style='success',
    tooltip='Click to classify the sentence',
)
# Output widget used as the capture target for the handler's printed results.
output_area = widgets.Output()

def on_classify_button_clicked(b):
    """Classify the sentence in ``input_text`` and print the result.

    Registered as the click handler of ``classify_button``. All output is
    written inside ``output_area``, which is cleared first.

    Args:
        b: Argument supplied by ``Button.on_click``; unused.
    """
    with output_area:
        clear_output()
        input_sentence = input_text.value.strip()
        if input_sentence:
            outcome = classify_plagiarism(
                input_sentence, es, index_name, word2vec_model, bert_model, tokenizer
            )
            predicted_class, matched_sentence, features, similarity_percentage = outcome

            # Placeholders shown when there is no match / no feature row.
            shown_match = matched_sentence if matched_sentence else 'None'
            shown_features = 'N/A' if features is None else features

            print(f"Input Sentence: {input_sentence}")
            print(f"Matched Sentence: {shown_match}")
            print(f"Predicted Plagiarism Type: {predicted_class}")
            print(f"Similarity Percentage: {similarity_percentage:.2f}%")
            print(f"Features (Word2Vec Sim, BERT Sim, Word Overlap, Length Diff): {shown_features}")
        else:
            print("Please enter a sentence.")

# Attach the handler so each button click runs a classification.
classify_button.on_click(on_classify_button_clicked)
print("Plagiarism Detection Tool")
# Render the text box, the button and the result area in the notebook.
display(input_text, classify_button, output_area)

# Pre-fill the text box with a sample sentence; classification still needs a
# button click.
input_text.value = "artificial intelligence is modifying industries by automating jobs and enhancing performance"

ModuleNotFoundError: No module named 'elasticsearch'