In [10]:
# Install required libraries
!pip install scikit-learn pandas numpy nltk transformers torch datasets

# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification




In [11]:
import pandas as pd

# Read the tab-separated file of sentence pairs (no header row, so column
# names are supplied here), drop every row that has a missing field, and
# store the label as an integer. Lines pandas cannot parse are skipped.
df = (
    pd.read_csv(
        "dataset.txt",
        delimiter="\t",
        header=None,
        names=["sentence1", "sentence2", "label"],
        encoding="utf-8",
        on_bad_lines="skip",
    )
    .dropna()
    .astype({"label": int})
)

# Preview the first rows of the cleaned frame
print(df.head())


                                           sentence1  \
1  A person on a horse jumps over a broken down a...   
2              Children smiling and waving at camera   
3              Children smiling and waving at camera   
4  A boy is jumping on skateboard in the middle o...   
5  A boy is jumping on skateboard in the middle o...   

                             sentence2  label  
1    A person is outdoors, on a horse.      1  
2           There are children present      1  
3                The kids are frowning      0  
4    The boy skates down the sidewalk.      0  
5  The boy does a skateboarding trick.      1  


In [13]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus so that it can be read below
nltk.download("stopwords")

# English stop words, held in a set; clean_text filters tokens against it
stop_words = {word for word in stopwords.words("english")}

def clean_text(text):
    """Normalise a piece of text for bag-of-words style processing.

    The input is converted to ``str`` and lower-cased, every non-word
    character is replaced by a space, and tokens found in the module-level
    ``stop_words`` set are removed.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    lowered = str(text).lower()
    # Punctuation and other non-word characters become token separators
    separated = re.sub(r"\W", " ", lowered)
    kept_tokens = (token for token in separated.split() if token not in stop_words)
    return " ".join(kept_tokens)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Hold out 20% of the sentence pairs for evaluation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, ["sentence1", "sentence2"]],
    df.loc[:, "label"],
    random_state=42,
    test_size=0.2,
)


In [16]:
# Join the two sentences of every pair into one space-separated string,
# for the training split first and then the test split.
X_train_combined, X_test_combined = (
    split["sentence1"] + " " + split["sentence2"] for split in (X_train, X_test)
)

In [17]:
# Turn the combined text into TF-IDF vectors. The vocabulary (capped at
# 5000 features) is learned from the training split only and then reused
# unchanged for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train_combined),
    vectorizer.transform(X_test_combined),
)


In [18]:
# Fit a logistic-regression classifier (default hyperparameters) on the
# TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X=X_train_vec, y=y_train)


In [19]:
# Score the classifier on the held-out split
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Overall accuracy first, then the per-class precision/recall/F1 table
print("\n🔹 Model Accuracy:", accuracy)
print("\n🔹 Classification Report:\n", report)


🔹 Model Accuracy: 0.6550207364584643

🔹 Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.65      0.65      3979
           1       0.65      0.66      0.66      3978

    accuracy                           0.66      7957
   macro avg       0.66      0.66      0.66      7957
weighted avg       0.66      0.66      0.66      7957



In [21]:
# Predict on New Sentence Pairs
def predict_plagiarism(text1, text2):
    """Classify a sentence pair with the fitted TF-IDF + logistic-regression model.

    The pair is turned into model input exactly the way the training
    features were built: the two raw sentences joined by a single space.
    No stop-word removal is applied here, because the vectorizer and the
    model were fitted on text that still contained stop words; stripping
    them only at prediction time would feed the model inputs unlike
    anything it was trained on.

    Relies on the module-level ``vectorizer`` and ``model`` objects.

    Args:
        text1: First sentence (converted with ``str()``).
        text2: Second sentence (converted with ``str()``).

    Returns:
        "Plagiarized" if the model predicts label 1, otherwise "Original".
    """
    # Same construction as the training text: sentence1 + " " + sentence2
    combined_text = str(text1) + " " + str(text2)
    text_vec = vectorizer.transform([combined_text])
    prediction = model.predict(text_vec)
    return "Plagiarized" if prediction[0] == 1 else "Original"

# Demo sentence pairs, each stored as (print label, first sentence, second sentence).
# The trailing comments record the outcome the author expected for each case.
demo_cases = [
    # Example Usage 0
    ("\nPrediction0:", "A person is riding a horse in a field.", "Someone is outdoors riding a horse."),
    # Test Case 1: Similar meaning - Expected: Plagiarized (or similar)
    ("\nPrediction1:", "The dog is running in the park.", "A canine is sprinting through the playground."),
    # Test Case 2: Completely different - Expected: Original (not similar)
    ("\nPrediction2:", "The sky is blue with a few clouds.", "I love eating pizza with extra cheese."),
    # Test Case 3: Paraphrased sentence - Expected: Plagiarized (or similar)
    ("\nPrediction3:", "She is reading a fascinating book about space.", "A woman is engaged in an interesting space-related book."),
    # Test Case 4: Opposite meanings - Expected: Original (different meaning)
    ("\nPrediction4:", "The cat is sleeping peacefully.", "The cat is running around the house."),
    # Test Case 5: Short identical phrases - Expected: Plagiarized (identical)
    ("\nPrediction5:", "He won the race.", "He won the race."),
]

# Run every pair through the classifier and print its verdict
for caption, sentence1, sentence2 in demo_cases:
    print(caption, predict_plagiarism(sentence1, sentence2))





Prediction0: Plagiarized

Prediction1: Plagiarized

Prediction2: Original

Prediction3: Original

Prediction4: Original

Prediction5: Plagiarized
