## Loading the dataset

In [1]:
import pandas as pd

df = pd.read_csv('dataset.csv')

df = df.iloc[:, 1:]

df

Unnamed: 0,source_text,plagiarized_text,label
0,Researchers have discovered a new species of b...,Scientists have found a previously unknown but...,1
1,The moon orbits the Earth in approximately 27....,Our natural satellite takes around 27.3 days t...,1
2,Water is composed of two hydrogen atoms and on...,H2O consists of 2 hydrogen atoms and 1 oxygen ...,1
3,The history of Rome dates back to 753 BC.,Rome has a long history that can be traced bac...,1
4,Pluto was once considered the ninth planet in ...,"In the past, Pluto was classified as the ninth...",1
...,...,...,...
365,Playing musical instruments enhances creativity.,Creativity is enhanced by playing musical inst...,0
366,Studying history helps in understanding the pr...,Understanding the present is aided by studying...,0
367,Listening to classical music can improve focus.,Focus is improved by listening to classical mu...,0
368,Practicing yoga enhances physical flexibility.,Physical flexibility is enhanced by practicing...,0


## Dataframe information

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370 entries, 0 to 369
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   source_text       370 non-null    object
 1   plagiarized_text  370 non-null    object
 2   label             370 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 8.8+ KB


## Dataframe missing values

In [3]:
df.isna().sum()

source_text         0
plagiarized_text    0
label               0
dtype: int64

## Output categories

In [4]:
df['label'].value_counts()

label
0    187
1    183
Name: count, dtype: int64

## Cleaning the dataset

In [5]:
df['source_text'] = df['source_text'].str.lower()
df['plagiarized_text'] = df['plagiarized_text'].str.lower()

df

Unnamed: 0,source_text,plagiarized_text,label
0,researchers have discovered a new species of b...,scientists have found a previously unknown but...,1
1,the moon orbits the earth in approximately 27....,our natural satellite takes around 27.3 days t...,1
2,water is composed of two hydrogen atoms and on...,h2o consists of 2 hydrogen atoms and 1 oxygen ...,1
3,the history of rome dates back to 753 bc.,rome has a long history that can be traced bac...,1
4,pluto was once considered the ninth planet in ...,"in the past, pluto was classified as the ninth...",1
...,...,...,...
365,playing musical instruments enhances creativity.,creativity is enhanced by playing musical inst...,0
366,studying history helps in understanding the pr...,understanding the present is aided by studying...,0
367,listening to classical music can improve focus.,focus is improved by listening to classical mu...,0
368,practicing yoga enhances physical flexibility.,physical flexibility is enhanced by practicing...,0


## Nlp techniques

### source_text

In [6]:
import spacy

nlp = spacy.load('en_core_web_sm')

# print(df['source_text'].dtypes, '\n')

# for i in df['source_text'][:10]:
#     print(i[:10], type(i))
# print()

df['source_text'] = df['source_text'].apply(lambda x: nlp(x))

# for i in df['source_text'][:10]:
#     print(i[:1], type(i))

for text in df['source_text'][:10]:
    print(text)
    final = [
        token
        for token in text 
        if not token.is_stop and not token.is_punct
    ]
    print(final)
    final = ' '.join([token.text for token in final])
    print(final)
    print()
    
df['source_text'] = df['source_text'].map(lambda text: ' '.join([token.text for token in text if not token.is_stop and not token.is_punct]))

researchers have discovered a new species of butterfly in the amazon rainforest.
[researchers, discovered, new, species, butterfly, amazon, rainforest]
researchers discovered new species butterfly amazon rainforest

the moon orbits the earth in approximately 27.3 days.
[moon, orbits, earth, approximately, 27.3, days]
moon orbits earth approximately 27.3 days

water is composed of two hydrogen atoms and one oxygen atom.
[water, composed, hydrogen, atoms, oxygen, atom]
water composed hydrogen atoms oxygen atom

the history of rome dates back to 753 bc.
[history, rome, dates, 753, bc]
history rome dates 753 bc

pluto was once considered the ninth planet in our solar system.
[pluto, considered, ninth, planet, solar, system]
pluto considered ninth planet solar system

this is a unique and original sentence.
[unique, original, sentence]
unique original sentence

artificial intelligence is reshaping industries.
[artificial, intelligence, reshaping, industries]
artificial intelligence reshapin

### plagiarized text

In [7]:
df['plagiarized_text'] = df['plagiarized_text'].apply(lambda x: nlp(x))

for text in df['plagiarized_text'][:10]:
    print(text)
    final = [
        token
        for token in text 
        if not token.is_stop and not token.is_punct
    ]
    print(final)
    final = ' '.join([token.text for token in final])
    print(final)
    print()


df['plagiarized_text'] = df['plagiarized_text'].map(lambda text: ' '.join([token.text for token in text if not token.is_stop and not token.is_punct]))

scientists have found a previously unknown butterfly species in the amazon jungle.
[scientists, found, previously, unknown, butterfly, species, amazon, jungle]
scientists found previously unknown butterfly species amazon jungle

our natural satellite takes around 27.3 days to complete one orbit around our planet.
[natural, satellite, takes, 27.3, days, complete, orbit, planet]
natural satellite takes 27.3 days complete orbit planet

h2o consists of 2 hydrogen atoms and 1 oxygen atom.
[h2o, consists, 2, hydrogen, atoms, 1, oxygen, atom]
h2o consists 2 hydrogen atoms 1 oxygen atom

rome has a long history that can be traced back to 753 bc.
[rome, long, history, traced, 753, bc]
rome long history traced 753 bc

in the past, pluto was classified as the ninth planet in our sun's planetary system.
[past, pluto, classified, ninth, planet, sun, planetary, system]
past pluto classified ninth planet sun planetary system

this sentence is unique and original.
[sentence, unique, original]
sentence

## Check dataframe again

In [8]:
df

Unnamed: 0,source_text,plagiarized_text,label
0,researchers discovered new species butterfly a...,scientists found previously unknown butterfly ...,1
1,moon orbits earth approximately 27.3 days,natural satellite takes 27.3 days complete orb...,1
2,water composed hydrogen atoms oxygen atom,h2o consists 2 hydrogen atoms 1 oxygen atom,1
3,history rome dates 753 bc,rome long history traced 753 bc,1
4,pluto considered ninth planet solar system,past pluto classified ninth planet sun planeta...,1
...,...,...,...
365,playing musical instruments enhances creativity,creativity enhanced playing musical instruments,0
366,studying history helps understanding present,understanding present aided studying history,0
367,listening classical music improve focus,focus improved listening classical music,0
368,practicing yoga enhances physical flexibility,physical flexibility enhanced practicing yoga,0


## Sample example of Countvectorizer 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

data = ["cat eats fish", "dog eats fish", "fish swims in water"]

vectorizer = CountVectorizer() 
e = vectorizer.fit_transform(data) # returns sparse matrix

data_df= pd.DataFrame(e.toarray(), columns = vectorizer.get_feature_names_out())

data_df

Unnamed: 0,cat,dog,eats,fish,in,swims,water
0,1,0,1,1,0,0,0
1,0,1,1,1,0,0,0
2,0,0,0,1,1,1,1


## CountVectorizer of source_text

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

q = cv.fit_transform(df['source_text'])

q_df = pd.DataFrame(q.toarray(), columns = cv.get_feature_names_out())

q_df

Unnamed: 0,000,032,080,10,100,12,13,1889,1921,1986,...,works,workshops,world,writing,wrote,year,years,yoga,york,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## CountVectorizer of plagiarized_text

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

q = cv.fit_transform(df['plagiarized_text'])

q_df = pd.DataFrame(q.toarray(), columns = cv.get_feature_names_out())

q_df

Unnamed: 0,000,032,080,10,100,12,13,1889,1921,1986,...,works,workshops,world,writing,written,year,years,yoga,york,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
368,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Sample example of TD_IDF

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

data = ["cat eats fish", "dog eats fish", "fish swims in water"]

vectorizer = TfidfVectorizer()
e = vectorizer.fit_transform(data)

data_df= pd.DataFrame(e.toarray(), columns = vectorizer.get_feature_names_out())

data_df



Unnamed: 0,cat,dog,eats,fish,in,swims,water
0,0.720333,0.0,0.547832,0.425441,0.0,0.0,0.0
1,0.0,0.720333,0.547832,0.425441,0.0,0.0,0.0
2,0.0,0.0,0.0,0.322745,0.546454,0.546454,0.546454


## TF-IDF of source_text

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer()

d  = vector.fit_transform(df['source_text'])

source_df = pd.DataFrame(d.toarray(), columns = vector.get_feature_names_out())

source_df

Unnamed: 0,000,032,080,10,100,12,13,1889,1921,1986,...,works,workshops,world,writing,wrote,year,years,yoga,york,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.478279,0.0,0.0


## TF_IDF of plagiarized text

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vector = TfidfVectorizer()

f = vector.fit_transform(df['plagiarized_text'])

plagiarized_df = pd.DataFrame(f.toarray(), columns = vector.get_feature_names_out())

plagiarized_df

Unnamed: 0,000,032,080,10,100,12,13,1889,1921,1986,...,works,workshops,world,writing,written,year,years,yoga,york,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.484473,0.0,0.0


## Extra columns from source_df and plagiarized_df

In [15]:
extra_columns = [i for i in plagiarized_df.columns if i not in source_df.columns]

print(extra_columns, '\n',len(extra_columns))

['abbreviated', 'achieved', 'active', 'advancement', 'affected', 'affecting', 'ai', 'aided', 'alleviated', 'allowed', 'arise', 'avoided', 'benefits', 'birthplace', 'blocked', 'boosted', 'broadened', 'building', 'capability', 'captured', 'carried', 'cause', 'causes', 'central', 'changing', 'characteristic', 'characterizes', 'clarified', 'classified', 'come', 'common', 'complete', 'completion', 'connected', 'conserved', 'contained', 'converted', 'cover', 'created', 'crucially', 'deciphered', 'deflected', 'describes', 'discharged', 'displays', 'distinguished', 'driven', 'effects', 'efficiently', 'egg', 'elevated', 'encouraged', 'engaging', 'enhanced', 'enriched', 'expanded', 'expressed', 'faced', 'facilitated', 'flying', 'follows', 'fostered', 'found', 'freezing', 'gained', 'gifted', 'given', 'h2o', 'help', 'high', 'hinges', 'improved', 'include', 'increase', 'increased', 'inspired', 'invention', 'jungle', 'landscape', 'largely', 'laying', 'lean', 'lies', 'lifted', 'live', 'lowers', 'main

## Data Preparation

In [16]:
total_text = df['source_text'] + " " + df['plagiarized_text']
print('source text:', df['source_text'][0], '\n')
print('plagiarized text: ',df['plagiarized_text'][0], '\n')

print(total_text[0], '\n')

X = vector.fit_transform(total_text)

X_df = pd.DataFrame(X.toarray(), columns = vector.get_feature_names_out())
y = df['label']

X_df

source text: researchers discovered new species butterfly amazon rainforest 

plagiarized text:  scientists found previously unknown butterfly species amazon jungle 

researchers discovered new species butterfly amazon rainforest scientists found previously unknown butterfly species amazon jungle 



Unnamed: 0,000,032,080,10,100,12,13,1889,1921,1986,...,workshops,world,writing,written,wrote,year,years,yoga,york,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.499217,0.0,0.0


## Splitting the dataset

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test , y_train , y_test = train_test_split(X, y, test_size = 0.3)

# LogisticRegression

## Model creation

In [18]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

## Model training

In [19]:
model.fit(X_train, y_train)

## Model prediction

In [20]:
y_predicted = model.predict(X_test)

y_predicted

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0], dtype=int64)

## Model metrics

In [21]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', accuracy_score(y_predicted, y_test))
print('Classification report','\n', classification_report(y_predicted, y_test))

Accuracy: 0.9009009009009009
Classification report 
               precision    recall  f1-score   support

           0       0.89      0.91      0.90        56
           1       0.91      0.89      0.90        55

    accuracy                           0.90       111
   macro avg       0.90      0.90      0.90       111
weighted avg       0.90      0.90      0.90       111



## Prediction

In [22]:
text = 'Over 60 million square miles are covered by the Pacific Ocean.'

data = []

for token in nlp(text):
    if not token.is_stop and not token.is_punct:
        data.append(token.text)

data = ' '.join(data)
S
tfidf_text = vector.transform([data])

tfidf_df = pd.DataFrame(tfidf_text.toarray(), columns = vector.get_feature_names_out())

print('\n', tfidf_df)

model.predict(tfidf_text)


    000  032  080   10  100   12   13  1889  1921  1986  ...  workshops  world  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0  ...        0.0    0.0   

   writing  written  wrote  year  years  yoga  york  zero  
0      0.0      0.0    0.0   0.0    0.0   0.0   0.0   0.0  

[1 rows x 1061 columns]


array([1], dtype=int64)

# RandomForest

## Model creation

In [23]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 100)

## Model training

In [24]:
model.fit(X_train, y_train)

## Model prediction

In [25]:
y_predicted = model.predict(X_test)

y_predicted

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0], dtype=int64)

## Model metrics

In [26]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', accuracy_score(y_predicted, y_test))
print('Classification report: \n', classification_report(y_predicted, y_test))

Accuracy: 0.8198198198198198
Classification report: 
               precision    recall  f1-score   support

           0       0.95      0.76      0.84        71
           1       0.69      0.93      0.79        40

    accuracy                           0.82       111
   macro avg       0.82      0.84      0.82       111
weighted avg       0.85      0.82      0.82       111



# SVM

## Model creation

In [27]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

d = {
    'kernel' : ['poly', 'linear', 'rbf'],
    'C' : [0.1,1, 10,20,30,40,50],
    'gamma' : ['auto', 'scale']
}

gs = GridSearchCV(SVC(), d, cv = 5, return_train_score = False)

gs.fit(X_train, y_train)

df = pd.DataFrame(gs.cv_results_)

results = df[['param_C', 'param_kernel', 'param_gamma', 'mean_test_score']]

results.sort_values(by = 'mean_test_score', ascending = False)

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score
10,1.0,linear,scale,0.919005
13,10.0,linear,auto,0.919005
7,1.0,linear,auto,0.919005
16,10.0,linear,scale,0.919005
40,50.0,linear,scale,0.915158
19,20.0,linear,auto,0.915158
37,50.0,linear,auto,0.915158
25,30.0,linear,auto,0.915158
34,40.0,linear,scale,0.915158
31,40.0,linear,auto,0.915158


## Model training

In [28]:
print(gs.best_params_, '\n')

from sklearn.svm import SVC

model = SVC(kernel = 'linear', gamma = 'auto', C = 1, probability = True)

model.fit(X_train, y_train)

{'C': 1, 'gamma': 'auto', 'kernel': 'linear'} 



## Model prediction

In [29]:
y_predicted = model.predict(X_test)

y_predicted

array([0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0], dtype=int64)

## Model metrics

In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

print('Accuracy:', accuracy_score(y_predicted, y_test))
print('Classification report: \n', classification_report(y_predicted, y_test))

Accuracy: 0.8918918918918919
Classification report: 
               precision    recall  f1-score   support

           0       0.91      0.88      0.90        59
           1       0.87      0.90      0.89        52

    accuracy                           0.89       111
   macro avg       0.89      0.89      0.89       111
weighted avg       0.89      0.89      0.89       111



## Prediction

In [31]:
text = "This sentence is unique and original." 

data = []

for token in nlp(text):
    if not token.is_stop and not token.is_punct:
        data.append(token.text)

data = ' '.join(data)

tfidf_text = vector.transform([data])

tfidf_df = pd.DataFrame(tfidf_text.toarray(), columns = vector.get_feature_names_out())

print('\n', tfidf_df)

result = model.predict_proba(tfidf_text)

plag_proba = result[0][1] * 100

print("Plagiariam detected: ", plag_proba)


    000  032  080   10  100   12   13  1889  1921  1986  ...  workshops  world  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   0.0   0.0   0.0  ...        0.0    0.0   

   writing  written  wrote  year  years  yoga  york  zero  
0      0.0      0.0    0.0   0.0    0.0   0.0   0.0   0.0  

[1 rows x 1061 columns]
Plagiariam detected:  0.1752755536366593
