In [1]:
#fetch modules
import os
import numpy
import time
import string
import warnings
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import vstack as vstack_sparse_matrices

In [2]:
warnings.filterwarnings('ignore')

In [3]:
#utility functions

def get_tokens(txt):
    tokens=word_tokenize(txt)
    return tokens

def stemming(tokens, stemmer):
    stemm =[]
    for token in tokens:
        stemm.append(stemmer.stem(token))
    return stemm

def preprocessing(txt):
    tokens = get_tokens(txt)
    stemmer = PorterStemmer()
    stems = stemming(tokens, stemmer)
    return stems

In [4]:
#files for experiment
print(os.listdir('.\\files'))

['.ipynb_checkpoints', 'files1.txt', 'files2.txt', 'files3.txt', 'files4.txt']


In [5]:
#training model
def train_Model():
    fsList =[]
    k = 0
    path =".\\files"
    vocabulary={}
    
    for subdir, dirs, files in os.walk(path):
        for file in files:
            file_path = subdir+os.path.sep+file
            f=open(file_path, 'r')
            small_case = f.read().lower()
            skip_punct = small_case.translate(string.punctuation)
            vocabulary[file] = skip_punct
            fsList.append(file)
    
    tf_idf = TfidfVectorizer(tokenizer=preprocessing, stop_words = 'english')
    corpus = vocabulary.values()
    
    tf_s = tf_idf.fit_transform(corpus)
    
    return tf_idf, tf_s, fsList    

In [6]:
tf_idf, tf_s, files= train_Model()
print(tf_s)

  (0, 45)	0.11398107370532325
  (0, 26)	0.0727526428822134
  (0, 62)	0.0727526428822134
  (0, 159)	0.2695919631306832
  (0, 10)	0.11398107370532325
  (0, 28)	0.11398107370532325
  (0, 38)	0.11398107370532325
  (0, 60)	0.11398107370532325
  (0, 150)	0.2279621474106465
  (0, 112)	0.11398107370532325
  (0, 3)	0.11398107370532325
  (0, 143)	0.11398107370532325
  (0, 33)	0.11398107370532325
  (0, 39)	0.11398107370532325
  (0, 6)	0.08986398771022773
  (0, 161)	0.11398107370532325
  (0, 71)	0.11398107370532325
  (0, 36)	0.11398107370532325
  (0, 118)	0.11398107370532325
  (0, 107)	0.11398107370532325
  (0, 146)	0.11398107370532325
  (0, 131)	0.11398107370532325
  (0, 23)	0.11398107370532325
  (0, 63)	0.11398107370532325
  (0, 162)	0.11398107370532325
  :	:
  (3, 96)	0.12412783640985797
  (3, 86)	0.12412783640985797
  (3, 69)	0.3723835092295739
  (3, 147)	0.24825567281971594
  (3, 89)	0.12412783640985797
  (3, 136)	0.24825567281971594
  (3, 99)	0.12412783640985797
  (3, 79)	0.12412783640985797

In [7]:
#verifying the input file
f=open(".\\input.txt")
files.append("input.txt")

start_ts = time.time()
txt=f.read().lower().translate(string.punctuation)
resp=tf_idf.transform([txt])
tf_s_combo = vstack_sparse_matrices([tf_s,resp])
m=(tf_s_combo*tf_s_combo.T).A
number_of_rows = len(m)
for f in range(0, len(m[0])-1):
    if (m[number_of_rows-1,f]) >0.8:
        print(f"File is plagarized with {files[f]} by {m[number_of_rows-1][f]* 100 }")

end_ts = time.time()
print("time taken ", end_ts-start_ts)

File is plagarized with files3.txt by 100.00000000000007
time taken  0.013982772827148438
