In [None]:
import pickle
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# NLTK data needed by this notebook:
#   'popular'   - the standard bundle of commonly used corpora and models
#   'stopwords' - the English stop-word list read by preprocess_text
#   'punkt_tab' - the tokenizer tables that word_tokenize loads at call time
# The previous request for 'word_tokenize' named a function rather than a data
# package, so it fetched nothing and the tokenizer data was still missing.
nltk.download('popular')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

In [None]:
# Path of the CSV that holds the text pairs and their labels.
DATASET_PATH = "/content/dataset(1).csv"
data = pd.read_csv(DATASET_PATH)

In [None]:
# Built once: translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Filled on first use by _english_stop_words().
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = set(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise a single text value before vectorisation or scoring.

    Steps, in order: convert to ``str``, delete ASCII punctuation, lowercase,
    split on whitespace, drop English stop words, and re-join with single
    spaces.

    Parameters
    ----------
    text : object
        The value to clean. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when the input is missing.
    """
    # Missing cells arrive as None or float NaN. Passing them through str()
    # would yield the literal tokens "none" / "nan", which would then be
    # treated as real words downstream.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).translate(_PUNCTUATION_TABLE).lower()
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
# Run both text columns through the same cleaning routine.
for column in ('source_text', 'plagiarized_text'):
    data[column] = data[column].apply(preprocess_text)

In [None]:
# TF-IDF Vectorization
# Each row's document is its source text and plagiarized text joined by a space.
y = data['label']
paired_text = data['source_text'] + " " + data['plagiarized_text']
tfidf_vectorizer = TfidfVectorizer()
x = tfidf_vectorizer.fit_transform(paired_text)


In [None]:
# Train-Test Split
# 20% of the rows are held out; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic Regression Model
# Fit on the training split, then predict the held-out split.
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Report accuracy first, then the per-class report and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)
for summary in (classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred)):
    print(summary)

Logistic Regression Accuracy: 0.8243243243243243
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        35
           1       0.86      0.79      0.83        39

    accuracy                           0.82        74
   macro avg       0.83      0.83      0.82        74
weighted avg       0.83      0.82      0.82        74

[[30  5]
 [ 8 31]]


In [None]:
# BLEU Score Calculation
def calculate_bleu(reference_text, candidate_text):
    """Return the smoothed sentence-level BLEU score of a candidate text.

    Both texts are tokenised with ``word_tokenize`` and the candidate is scored
    against the single reference using NLTK's default n-gram weights.

    Parameters
    ----------
    reference_text : str
        Text the candidate is compared against.
    candidate_text : str
        Text being scored.

    Returns
    -------
    float
        The BLEU score; ``0.0`` when either text has no tokens.
    """
    # Function-scope import so this cell works without editing the import cell.
    from nltk.translate.bleu_score import SmoothingFunction

    reference_tokens = word_tokenize(reference_text)
    candidate_tokens = word_tokenize(candidate_text)

    # Nothing to compare: report zero overlap instead of scoring empty input.
    if not reference_tokens or not candidate_tokens:
        return 0.0

    # Without smoothing, a single n-gram order with zero matches drives the
    # whole score to (numerically) zero and NLTK emits a warning per row.
    # That is the normal case for short texts like these, so smooth the
    # zero counts instead.
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

In [None]:
# Tokenizer tables that word_tokenize (called inside calculate_bleu) loads.
# nltk is already imported by the first cell, so no install or re-import is
# needed here, and one download call is enough.
nltk.download('punkt_tab')

# Score every (source, plagiarized) pair and store the result as a new column.
data['bleu_score'] = data.apply(
    lambda row: calculate_bleu(row['source_text'], row['plagiarized_text']),
    axis=1,
)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Persist the fitted classifier and vectorizer. The context managers guarantee
# each file is flushed and closed before anything tries to read it back.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [None]:
# Reload the saved vectorizer and classifier, closing each file after reading.
# NOTE: unpickling executes code embedded in the file, so only load pickle
# files that this notebook (or another trusted source) produced.
with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
def detect(input_text):
    """Classify one text with the fitted TF-IDF vectorizer and model.

    Parameters
    ----------
    input_text : str
        Raw text to classify.

    Returns
    -------
    str
        ``"Plagiarism detected"`` when the model predicts label 1,
        otherwise ``"No plagiarism"``.
    """
    # The training columns went through preprocess_text before the vectorizer
    # was fitted, so the input gets the same normalisation here; otherwise
    # punctuation and stop words would be handled differently at inference.
    cleaned_text = preprocess_text(input_text)
    # NOTE(review): the vectorizer was fitted on source + plagiarized text
    # joined together, while this scores a single text on its own. Confirm
    # that this is the intended usage.
    vectorized_text = tfidf_vectorizer.transform([cleaned_text])
    result = model.predict(vectorized_text)
    return "Plagiarism detected" if result[0] == 1 else "No plagiarism"

In [None]:
# Example: classify one already-normalised sentence.
input_text = "playing musical instruments enhances creativity"
detect(input_text)

'No plagiarism'

In [None]:
# Display the frame, which by this point also carries the bleu_score column.
data

Unnamed: 0.1,Unnamed: 0,source_text,plagiarized_text,label,bleu_score
0,0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1,1.425661e-231
1,1,moon orbits earth approximately 273 days,natural satellite takes around 273 days comple...,1,5.477489e-155
2,2,water composed two hydrogen atoms one oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1,9.170599e-155
3,3,history rome dates back 753 bc,rome long history traced back 753 bc,1,5.705337e-78
4,4,pluto considered ninth planet solar system,past pluto classified ninth planet suns planet...,1,7.711524e-155
...,...,...,...,...,...
365,397,playing musical instruments enhances creativity,creativity enhanced playing musical instruments,0,7.380245e-78
366,398,studying history helps understanding present,understanding present aided studying history,0,1.186280e-154
367,399,listening classical music improve focus,focus improved listening classical music,0,7.380245e-78
368,400,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0,1.186280e-154
