In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


final


In [None]:
pip install tabulate




In [None]:
import os
import ast
import difflib
import io
import sys
from tabulate import tabulate

def get_file_paths(directory, extension=".py"):
    """Recursively list files below ``directory`` whose names end with ``extension``.

    Args:
        directory: Root folder handed to ``os.walk``.
        extension: Filename suffix to keep (defaults to ``".py"``).

    Returns:
        A list of paths, each built by joining the walked folder with the
        matching filename, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith(extension)
    ]

def count_lines(file_path):
    """Return how many lines the UTF-8 text file at ``file_path`` contains."""
    with open(file_path, "r", encoding="utf-8") as handle:
        # Iterating the handle yields one item per line, same as readlines().
        return sum(1 for _ in handle)

def has_duplicate_lines(file_path):
    """Find the lines of a file that repeat an earlier non-blank line.

    Lines are compared after stripping surrounding whitespace, and blank
    lines are never reported.

    Args:
        file_path: Path of the UTF-8 text file to scan.

    Returns:
        A set of 1-based line numbers; each one marks a line whose stripped
        text already appeared earlier in the same file. The set is empty when
        nothing repeats.
    """
    seen_text = set()
    repeated_at = set()

    with open(file_path, "r", encoding="utf-8") as handle:
        for position, raw_line in enumerate(handle, start=1):
            text = raw_line.strip()
            # First sighting (or a blank line): remember it and move on.
            if not text or text not in seen_text:
                seen_text.add(text)
                continue
            repeated_at.add(position)

    return repeated_at

def detect_duplicates_in_folder(folder_path):
    """Collect repeated-line locations for every file found under ``folder_path``.

    Each file returned by ``get_file_paths`` is scanned on its own with
    ``has_duplicate_lines``.

    Returns:
        A set of ``(file_path, line_number)`` tuples, one per repeated line.
    """
    return {
        (path, line_number)
        for path in get_file_paths(folder_path)
        for line_number in has_duplicate_lines(path)
    }

def extract_functions(file_path):
    """Parse a Python file and describe every ``def`` it contains.

    The whole syntax tree is walked, so nested functions and methods are
    included. Only ``ast.FunctionDef`` nodes are collected.

    Args:
        file_path: Path of the UTF-8 source file to parse.

    Returns:
        A list of ``(function_name, ast_dump_string)`` tuples in ``ast.walk``
        order. If the file does not parse, a message is printed and an empty
        list is returned.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        source_text = handle.read()

    try:
        syntax_tree = ast.parse(source_text)
    except SyntaxError as e:
        print(f"Error parsing {file_path}: {e}")
        return []

    return [
        (node.name, ast.dump(node))
        for node in ast.walk(syntax_tree)
        if isinstance(node, ast.FunctionDef)
    ]

def function_similarity(func1, func2):
    """Return the ``difflib.SequenceMatcher`` ratio of two sequences.

    Args:
        func1: First sequence to compare.
        func2: Second sequence to compare.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the two inputs.
    """
    return difflib.SequenceMatcher(None, func1, func2).ratio()

def capture_function_output(file_path, function_name):
    """Execute a source file and call one of its top-level functions with no arguments.

    While the file and the function run, ``sys.stdout`` is pointed at a
    throw-away buffer so anything they print is discarded. The previous
    ``sys.stdout`` is always put back before this function returns, whether
    the call succeeds, fails, or raises something other than ``Exception``.

    Args:
        file_path: Path of the UTF-8 Python source file to execute.
        function_name: Name looked up in the namespace produced by executing
            the file.

    Returns:
        ``str()`` of the function's return value, or ``""`` when the file
        fails to execute, the name is missing or not callable, or the call
        raises (for example because the function needs arguments).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        code = file.read()

    namespace = {}
    error_message = None
    result = ""

    # SECURITY: exec() runs the file's code with full privileges. Only point
    # this at folders whose contents you trust.
    old_stdout = sys.stdout
    sys.stdout = io.StringIO()
    try:
        try:
            exec(compile(code, file_path, 'exec'), namespace)
        except Exception as e:
            error_message = f"Error executing {file_path}: {e}"
        else:
            function = namespace.get(function_name)
            if callable(function):
                try:
                    result = str(function())
                except Exception as e:
                    error_message = f"Error executing function {function_name} in {file_path}: {e}"
    finally:
        # Restore on every exit path; otherwise all later prints would be
        # written into the discarded buffer.
        sys.stdout = old_stdout

    # Report after restoring stdout so the message is actually visible.
    if error_message is not None:
        print(error_message)
    return result

def detect_duplicate_functions(folder_path, similarity_threshold=0.8):
    """Flag functions whose AST dumps are similar and whose call results match.

    Every function extracted from the files under ``folder_path`` is compared
    with all functions seen before it. When the similarity of the two AST
    dumps exceeds ``similarity_threshold``, both functions are executed via
    ``capture_function_output`` and reported only if the two results are equal.

    Args:
        folder_path: Folder scanned with ``get_file_paths``.
        similarity_threshold: Ratio that must be exceeded for two functions to
            be considered candidates (defaults to ``0.8``).

    Returns:
        A set of ``"<file_path>:<function_name>"`` strings naming every
        function that was matched with at least one other function.
    """
    file_paths = get_file_paths(folder_path)
    # signature -> (file path, function name, AST dump). Path and name are kept
    # separately so they never have to be recovered by splitting the signature
    # on ":", which breaks for paths that themselves contain a colon.
    all_functions = {}
    duplicate_functions = set()

    for file_path in file_paths:
        for function_name, function_ast in extract_functions(file_path):
            signature = f"{file_path}:{function_name}"
            for existing_signature, existing in all_functions.items():
                existing_path, existing_name, existing_ast = existing
                if function_similarity(function_ast, existing_ast) > similarity_threshold:
                    output1 = capture_function_output(file_path, function_name)
                    output2 = capture_function_output(existing_path, existing_name)
                    if output1 == output2:
                        duplicate_functions.add(signature)
                        duplicate_functions.add(existing_signature)
            # Register only after comparing, so a function is never compared
            # with itself.
            all_functions[signature] = (file_path, function_name, function_ast)

    return duplicate_functions

def tokenize_code(file_path):
    """Read a UTF-8 file and split its text into whitespace-separated tokens.

    This is a deliberately simple tokenizer: it is ``str.split()`` on the
    whole file, not a language-aware lexer.

    Returns:
        A list of the non-whitespace chunks in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return handle.read().split()

def compare_code(file1_path, file2_path):
    """Return the token-level similarity ratio of two source files.

    Both files are tokenized with ``tokenize_code`` and the resulting token
    lists are compared with ``difflib.SequenceMatcher``.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the two token lists.
    """
    first_tokens = tokenize_code(file1_path)
    second_tokens = tokenize_code(file2_path)
    return difflib.SequenceMatcher(None, first_tokens, second_tokens).ratio()

def detect_duplicates(directory, similarity_threshold=0.8):
    """Compare every pair of files under ``directory`` and keep the similar ones.

    Args:
        directory: Folder scanned with ``get_file_paths``.
        similarity_threshold: Ratio a pair must exceed to be reported
            (defaults to ``0.8``).

    Returns:
        A list of ``(file1_path, file2_path, similarity_ratio)`` tuples, one
        for each unordered pair whose ``compare_code`` ratio is above the
        threshold.
    """
    paths = get_file_paths(directory)
    matches = []

    for position, first_path in enumerate(paths):
        # Pair each file only with the files that come after it, so every
        # unordered pair is compared exactly once.
        for second_path in paths[position + 1:]:
            ratio = compare_code(first_path, second_path)
            if ratio > similarity_threshold:
                matches.append((first_path, second_path, ratio))

    return matches

def print_results(title, data):
    """Print a titled report section.

    Args:
        title: Heading printed with a blank line before and after it.
        data: Rows passed to ``tabulate`` with ``headers="keys"``. When it is
            empty or otherwise falsy, ``"No items found."`` is printed instead
            of a table.
    """
    print(f"\n{title}\n")
    if not data:
        print("No items found.")
        return
    print(tabulate(data, headers="keys", tablefmt="fancy_grid"))

# Set your folder path
folder_path = "/content/drive/MyDrive/duplicate_code_detection"

# 1) Whole-file comparison: report every pair of files whose token similarity
#    exceeds the default threshold of detect_duplicates.
duplicate_pairs = detect_duplicates(folder_path)
if duplicate_pairs:
    print_results(
        "Potential duplicate code pairs found:",
        [
            {"File 1": file1, "File 2": file2, "Similarity": similarity}
            for file1, file2, similarity in duplicate_pairs
        ],
    )
else:
    print_results("No potential duplicate code pairs found.", [])

# 2) Line-level check: report each (file, line number) that repeats an earlier
#    non-blank line. The result is a set, so the rows are sorted by path and
#    line number to give the same table on every run.
all_duplicates = detect_duplicates_in_folder(folder_path)
if all_duplicates:
    print_results(
        "Duplicate non-blank lines found across files:",
        [
            {"File Path": file_path, "Line Number": line_number}
            for file_path, line_number in sorted(all_duplicates)
        ],
    )
else:
    print_results("No duplicate non-blank lines found across files.", [])

# 3) Function-level check: report the functions flagged by
#    detect_duplicate_functions. The set of signatures is sorted for the same
#    reason as above.
duplicate_functions = detect_duplicate_functions(folder_path)
if duplicate_functions:
    print_results(
        "Possible duplicate functions found:",
        [
            {"Function Signature": function_signature}
            for function_signature in sorted(duplicate_functions)
        ],
    )
else:
    print_results("No possible duplicate functions found.", [])


Potential duplicate code pairs found:

╒═════════════════════════════════════════════════════════════════════════════╤═════════════════════════════════════════════════════════════════════════════════╤══════════════╕
│ File 1                                                                      │ File 2                                                                          │   Similarity │
╞═════════════════════════════════════════════════════════════════════════════╪═════════════════════════════════════════════════════════════════════════════════╪══════════════╡
│ /content/drive/MyDrive/duplicate_code_detection/duplicate_code_detection.py │ /content/drive/MyDrive/duplicate_code_detection/duplicate_code_detection (1).py │            1 │
╘═════════════════════════════════════════════════════════════════════════════╧═════════════════════════════════════════════════════════════════════════════════╧══════════════╛

Duplicate non-blank lines found across files:

╒══════════════════════════