In [36]:
# ======================================================
# FINAL INTELLIGENT ERROR ANALYZER (HYBRID SYSTEM)
# ======================================================

import kagglehub
import pandas as pd
import numpy as np
import os
import re
import subprocess
import tempfile

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ======================================================
# 1Ô∏è‚É£ DATA PREPARATION
# ======================================================

path = kagglehub.dataset_download("stackoverflow/pythonquestions")
questions_path = os.path.join(path, "Questions.csv")

df = pd.read_csv(
    questions_path,
    encoding="latin1",
    low_memory=False
)

df = df[["Body"]].copy()

def remove_html(text):
    return re.sub(r"<.*?>", "", str(text))

df["clean_body"] = df["Body"].apply(remove_html)

ERROR_TYPES = [
    "SyntaxError","TypeError","NameError","IndexError",
    "IndentationError","ZeroDivisionError","KeyError",
    "AttributeError","ValueError","ImportError",
    "ModuleNotFoundError","FileNotFoundError",
    "UnboundLocalError"
]

pattern = "|".join(ERROR_TYPES)
mask = df["clean_body"].str.contains(pattern, na=False)
df_errors = df[mask].copy()

def extract_error(text):
    for err in ERROR_TYPES:
        match = re.search(rf"{err}:.*", text)
        if match:
            return match.group()
    return None

df_errors["error_message"] = df_errors["clean_body"].apply(extract_error)
df_errors = df_errors.dropna(subset=["error_message"])

def get_error_label(msg):
    for err in ERROR_TYPES:
        if err in msg:
            return err
    return None

df_errors["error_label"] = df_errors["error_message"].apply(get_error_label)
df_final = df_errors[["error_message","error_label"]].dropna()

print("Dataset size:", len(df_final))

# ======================================================
# 2Ô∏è‚É£ TRAIN ERROR TYPE MODEL
# ======================================================

X = df_final["error_message"]
y_error = df_final["error_label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y_error,
    test_size=0.2,
    random_state=42,
    stratify=y_error
)

error_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vec = error_vectorizer.fit_transform(X_train)
X_test_vec = error_vectorizer.transform(X_test)

error_model = LogisticRegression(max_iter=1000)
error_model.fit(X_train_vec, y_train)

print("Error Model Accuracy:",
      accuracy_score(y_test, error_model.predict(X_test_vec)))

# ======================================================
# 3Ô∏è‚É£ EXECUTION ENGINE
# ======================================================

def execute_code(code):
    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
        f.write(code)
        filename = f.name

    result = subprocess.run(
        ["python", filename],
        capture_output=True,
        text=True
    )

    return result.stderr.strip()

# ======================================================
# 4Ô∏è‚É£ PREDICTION FUNCTION
# ======================================================

def predict_error_type(error_text):
    vec = error_vectorizer.transform([error_text])
    return error_model.predict(vec)[0]

# ======================================================
# 5Ô∏è‚É£ INTELLIGENT FIX ENGINE
# ======================================================

def generate_fix(error_type, error_message):

    # 1Ô∏è‚É£ Built-in typo suggestion
    suggestion_match = re.search(r"Did you mean: '(.+?)'\?", error_message)
    if suggestion_match:
        suggestion = suggestion_match.group(1)
        return f"It looks like a typo. Did you mean '{suggestion}'?"

    # 2Ô∏è‚É£ SyntaxError
    if error_type == "SyntaxError":
        if "unterminated string literal" in error_message:
            return "Add a closing quotation mark."
        if "was never closed" in error_message:
            return "Close the missing bracket or parenthesis."
        if "expected ':'" in error_message:
            return "Add ':' at the end of the statement."
        return "Check syntax carefully."

    # 3Ô∏è‚É£ IndentationError
    if error_type == "IndentationError":
        if "expected an indented block" in error_message:
            return "Indent the next line after the loop or function."
        if "unexpected indent" in error_message:
            return "Remove extra indentation."
        return "Fix indentation using consistent 4 spaces."

    # 4Ô∏è‚É£ NameError
    if error_type == "NameError":
        match = re.search(r"name '(.+?)' is not defined", error_message)
        if match:
            variable = match.group(1)
            return f"The variable '{variable}' is not defined. Define it before using."
        return "Make sure all variables are defined."

    # 5Ô∏è‚É£ UnboundLocalError
    if error_type == "UnboundLocalError":
        return "The variable is referenced before assignment inside the function."

    # 6Ô∏è‚É£ TypeError
    if error_type == "TypeError":
        if "concatenate str" in error_message:
            return "Convert integer using str() before concatenation."
        if "unsupported operand type" in error_message:
            return "Ensure operands are compatible types."
        if "positional argument" in error_message:
            return "Check number of arguments passed to the function."
        return "Check data types used in operation."

    # 7Ô∏è‚É£ ZeroDivisionError
    if error_type == "ZeroDivisionError":
        return "You are dividing by zero. Add a condition before division."

    # 8Ô∏è‚É£ IndexError
    if error_type == "IndexError":
        return "List index is out of range. Check list length."

    # 9Ô∏è‚É£ KeyError
    if error_type == "KeyError":
        return "The dictionary key does not exist. Check key before accessing."

    # üîü AttributeError
    if error_type == "AttributeError":
        return "The object does not have this attribute. Check method name."

    # 1Ô∏è‚É£1Ô∏è‚É£ ValueError
    if error_type == "ValueError":
        return "Invalid value passed to function. Check input format."

    # 1Ô∏è‚É£2Ô∏è‚É£ ModuleNotFoundError
    if error_type == "ModuleNotFoundError":
        match = re.search(r"No module named '(.+?)'", error_message)
        if match:
            module = match.group(1)
            return f"The module '{module}' is not installed. Install using pip."
        return "Install required module using pip."

    # 1Ô∏è‚É£3Ô∏è‚É£ FileNotFoundError
    if error_type == "FileNotFoundError":
        return "The file path is incorrect or file does not exist."

    return "Review your code carefully."

# ======================================================
# 6Ô∏è‚É£ FULL ANALYSIS FUNCTION
# ======================================================

def analyze_code(code):
    error_log = execute_code(code)

    if not error_log:
        return {
            "status": "success",
            "message": "Code executed successfully."
        }

    error_type = predict_error_type(error_log)
    fix = generate_fix(error_type, error_log)

    return {
        "error_log": error_log,
        "predicted_type": error_type,
        "suggested_fix": fix
    }

# ======================================================
# 7Ô∏è‚É£ TEST CASES
# ======================================================

print("\n--- TEST 1 ---")
print(analyze_code('prin("a")'))

print("\n--- TEST 2 ---")
print(analyze_code('5/0'))

print("\n--- TEST 3 ---")
print(analyze_code('a = "5" + 5'))

print("\n--- TEST 4 ---")
print(analyze_code('print(x)'))

print("\n--- TEST 5 ---")
print(analyze_code('for cat in categories:'))

print("\n--- TEST 6 ---")
print(analyze_code('import numppy'))

print("\n--- TEST 7 ---")
print(analyze_code('open(\"missing.txt\")'))


Using Colab cache for faster access to the 'pythonquestions' dataset.
Dataset size: 68368
Error Model Accuracy: 0.9982448442299254

--- TEST 1 ---
{'error_log': 'Traceback (most recent call last):\n  File "/tmp/tmpr70rgybt.py", line 1, in <module>\n    prin("a")\n    ^^^^\nNameError: name \'prin\' is not defined. Did you mean: \'print\'?', 'predicted_type': 'NameError', 'suggested_fix': "It looks like a typo. Did you mean 'print'?"}

--- TEST 2 ---
{'error_log': 'Traceback (most recent call last):\n  File "/tmp/tmpemm5s4ft.py", line 1, in <module>\n    5/0\n    ~^~\nZeroDivisionError: division by zero', 'predicted_type': 'ZeroDivisionError', 'suggested_fix': 'You are dividing by zero. Add a condition before division.'}

--- TEST 3 ---
{'error_log': 'Traceback (most recent call last):\n  File "/tmp/tmpjwhi0aj8.py", line 1, in <module>\n    a = "5" + 5\n        ~~~~^~~\nTypeError: can only concatenate str (not "int") to str', 'predicted_type': 'TypeError', 'suggested_fix': 'Convert integ

In [37]:
import pickle
import os

os.makedirs("models", exist_ok=True)

pickle.dump(error_model, open("models/error_classifier.pkl","wb"))
pickle.dump(error_vectorizer, open("models/error_vectorizer.pkl","wb"))

print("Models saved successfully.")


Models saved successfully.
