## Importing the libraries

In [33]:
import spacy
import pandas as pd

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## Loading the dataset

In [15]:
actual_df = pd.read_csv('dataset.csv')

print(actual_df.columns, '\n')

df = actual_df.iloc[:, 1:]

df

Index(['Unnamed: 0', 'source_text', 'plagiarized_text', 'label'], dtype='object') 



Unnamed: 0,source_text,plagiarized_text,label
0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1
...,...,...,...
365,Playing musical instruments enhances creativity.,Creativity is enhanced by playing musical inst...,0
366,Studying history helps in understanding the pr...,Understanding the present is aided by studying...,0
367,Listening to classical music can improve focus.,Focus is improved by listening to classical mu...,0
368,Practicing yoga enhances physical flexibility.,Physical flexibility is enhanced by practicing...,0


## DataFrame information

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   source_text       370 non-null    object
 1   plagiarized_text  370 non-null    object
 2   label             370 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 8.8+ KB


## NLP techniques

### Removing stop words and punctuations

In [17]:
df['source_text'] = df['source_text'].str.lower()
df['plagiarized_text'] = df['plagiarized_text'].str.lower()

print('Source Text:',df['source_text'][0], '\n')
print('Plagiarized Text:', df['plagiarized_text'][0], '\n')

nlp = spacy.load('en_core_web_sm')

df['source_text'] = df['source_text'].map(lambda text: ' '.join([token.text for token in nlp(text) if not token.is_stop and not token.is_punct]))

df['plagiarized_text'] = df['plagiarized_text'].map(lambda text: ' '.join([token.text for token in nlp(text) if not token.is_stop and not token.is_punct]))

print('Updated Source Text:',df['source_text'][0], '\n')
print('Updated Plagiarized Text:', df['plagiarized_text'][0], '\n')

df

Source Text: researchers have discovered a new species of butterfly in the amazon rainforest. 

Plagiarized Text: scientists have found a previously unknown butterfly species in the amazon jungle. 

Updated Source Text: researchers discovered new species butterfly amazon rainforest 

Updated Plagiarized Text: scientists found previously unknown butterfly species amazon jungle 



Unnamed: 0,source_text,plagiarized_text,label
0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1
1,moon orbits earth approximately 27.3 days,natural satellite takes 27.3 days complete orb...,1
2,water composed hydrogen atoms oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1
3,history rome dates 753 bc,rome long history traced 753 bc,1
4,pluto considered ninth planet solar system,past pluto classified ninth planet sun planeta...,1
...,...,...,...
365,playing musical instruments enhances creativity,creativity enhanced playing musical instruments,0
366,studying history helps understanding present,understanding present aided studying history,0
367,listening classical music improve focus,focus improved listening classical music,0
368,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0


## Data preparation

In [25]:
total_text = df['source_text'] + " " + df['plagiarized_text']

print(total_text[0])

vector = TfidfVectorizer()

X = vector.fit_transform(total_text)
y = df['label']

# X_df = pd.DataFrame(X.toarray(), columns = vector.get_feature_names_out())
# X_df

researchers discovered new species butterfly amazon rainforest scientists found previously unknown butterfly species amazon jungle


## Splitting the dataset

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

## Model creation

In [37]:
model = SVC(kernel = 'linear', C = 1, gamma = 'auto', probability = True)

## Model training

In [38]:
model.fit(X_train, y_train)

## Model prediction

In [39]:
y_predicted = model.predict(X_test)

y_predicted

array([1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1], dtype=int64)

## Model metrics

In [55]:
accuracy = accuracy_score(y_predicted, y_test)

print('Accuracy Score: ', accuracy)

print('Classification Report \n', classification_report(y_predicted, y_test))

Accuracy Score:  0.9009009009009009
Classification Report 
               precision    recall  f1-score   support

           0       0.89      0.88      0.88        48
           1       0.91      0.92      0.91        63

    accuracy                           0.90       111
   macro avg       0.90      0.90      0.90       111
weighted avg       0.90      0.90      0.90       111



## Prediction

In [98]:
text = input('Enter text:')

data = ' '.join([token.text for token in nlp(text) if not token.is_stop and not token.is_punct])

tfidf_text = vector.transform([data])

result = model.predict_proba(tfidf_text)

if result[0][0] < result[0][1]:
    print(f'Plagiarism Detected with accuracy of {(result[0][1]*100):.2f}%')
else:
    print(f'Plagiarism not detected with accuracy of {(result[0][0]*100):.2f} %')

Enter text: "Many species of flora and fauna can be found in rainforests around the world."


Plagiarism Detected with accuracy of 78.64%
