In [4]:
import findspark

findspark.init()

In [15]:
import difflib
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

text_file = open("source", "r")
source = text_file.read()
text_file.close()


def precise_analysis(file):
    return difflib.SequenceMatcher(None, source, file).ratio()

def quick_analysis(file):
    return difflib.SequenceMatcher(None, source, file).quick_ratio()
    
spark = SparkSession.builder.getOrCreate()

In [16]:
# to read parquet file
df = spark.read.parquet('dataset.parquet').limit(10)

df.show()

+------+------+--------------------+--------------------+--------------------+
|UserId|FileId|          Repository|            FileName|         FileContent|
+------+------+--------------------+--------------------+--------------------+
|      |      |10se1ucgo-Disable...|              dwt.py|import logging
im...|
|      |      |10se1ucgo-Disable...|        dwt_about.py|import datetime
i...|
|      |      |10se1ucgo-Disable...|         dwt_util.py|import logging
im...|
|      |      |2020PB-police-bru...|     data_builder.py|import csv
import...|
|      |      |2020PB-police-bru...|    data_rewriter.py|import random
imp...|
|      |      |2020PB-police-bru...|test_data_builder.py|import pytest
fro...|
|      |      |2020PB-police-bru...|test_process_md_t...|import pytest
fro...|
|      |      |2020PB-police-bru...|test_text_formatt...|import pytest
fro...|
|      |      |2020PB-police-bru...|         __init__.py|                    |
|      |      |2020PB-police-bru...|   text_formatte

In [17]:
quick_analysis_udf = udf(quick_analysis)

In [18]:
df = df.withColumn("diff_quick", quick_analysis_udf(df["FileContent"]))
df = df.filter(df["diff_quick"] > 0.5)
df.show(10)

+------+------+--------------------+---------------+--------------------+------------------+
|UserId|FileId|          Repository|       FileName|         FileContent|        diff_quick|
+------+------+--------------------+---------------+--------------------+------------------+
|      |      |10se1ucgo-Disable...|         dwt.py|import logging
im...|0.9995836700707761|
|      |      |10se1ucgo-Disable...|   dwt_about.py|import datetime
i...|0.5776732604786278|
|      |      |10se1ucgo-Disable...|    dwt_util.py|import logging
im...|0.6948773710281503|
|      |      |2020PB-police-bru...|data_builder.py|import csv
import...|0.6523483149616247|
+------+------+--------------------+---------------+--------------------+------------------+



In [19]:
precise_analysis_udf = udf(precise_analysis)

In [20]:
df = df.withColumn("diff_exact", precise_analysis_udf(df["FileContent"]))
df = df.filter(df["diff_exact"] > 0.5)
df.show(10)

+------+------+--------------------+--------+--------------------+------------------+------------------+
|UserId|FileId|          Repository|FileName|         FileContent|        diff_quick|        diff_exact|
+------+------+--------------------+--------+--------------------+------------------+------------------+
|      |      |10se1ucgo-Disable...|  dwt.py|import logging
im...|0.9995836700707761|0.7847574265912375|
+------+------+--------------------+--------+--------------------+------------------+------------------+



In [23]:
def get_diff(file):
    return str(difflib.SequenceMatcher(None, source, file).get_matching_blocks())

get_diff_udf = udf(get_diff)

In [24]:
df = df.withColumn("matching_blocks", get_diff_udf(df["FileContent"]))

df.show(10)

+------+------+--------------------+--------+--------------------+------------------+------------------+--------------------+
|UserId|FileId|          Repository|FileName|         FileContent|        diff_quick|        diff_exact|     matching_blocks|
+------+------+--------------------+--------+--------------------+------------------+------------------+--------------------+
|      |      |10se1ucgo-Disable...|  dwt.py|import logging
im...|0.9995836700707761|0.7847574265912375|[Match(a=8, b=23,...|
+------+------+--------------------+--------+--------------------+------------------+------------------+--------------------+



In [38]:
from collections import namedtuple

Match = namedtuple("Match", "a b size")

print("Detected pragiatrism:")
     
verbose = True

for row in df.collect():
    matching_blocks = eval(row["matching_blocks"])
    file_repo = "{}/{}".format(row["Repository"], row["FileName"])
    for match in matching_blocks:
        print("Sequence in {} from byte {} = sequence from {} in source file, length = {}".format(file_repo, match.a, match.b, match.size))
        if verbose:
            print("\x1b[31m\"{}\"\x1b[0m".format(source[match.a:match.a + match.size]))

Detected pragiatrism:
Sequence in 10se1ucgo-DisableWinTracking.tar.gz/dwt.py from byte 8 = sequence from 23 in source file, length = 1
[31m"y"[0m
Sequence in 10se1ucgo-DisableWinTracking.tar.gz/dwt.py from byte 334 = sequence from 93 in source file, length = 2
[31m"ty"[0m
Sequence in 10se1ucgo-DisableWinTracking.tar.gz/dwt.py from byte 1971 = sequence from 111 in source file, length = 448
[31m"l
from six import u
import wx
from wx.lib.itemspicker import ItemsPicker, IP_SORT_SELECTED, IP_SORT_CHOICES, IP_REMOVE_FROM_CHOICES
import dwt_about
import dwt_util
class RedirectText(io.StringIO):
    def __init__(self, console, old_stdout):
        super(RedirectText, self).__init__()
        self.out = console
        self.old_out = old_stdout
    def write(self, string):
        self.old_out.write(string)
        self.out.WriteText(string)
"[0m
Sequence in 10se1ucgo-DisableWinTracking.tar.gz/dwt.py from byte 2419 = sequence from 1822 in source file, length = 5479
[31m"
class MainFrame(