In [1]:
import os
import ast

def extract_ast_from_file(file_path):
    with open(file_path, 'r',encoding='utf-8') as f:
        source_code = f.read()
    return ast.parse(source_code)

# Example: Traverse the dataset folder
dataset_path = "E:\drdo-internship\suresh sir\dataset-2"
authors_data = {}

for author_folder in os.listdir(dataset_path):
    author_folder_path = os.path.join(dataset_path, author_folder)
    if os.path.isdir(author_folder_path):  # Check if it's a directory
        authors_data[author_folder] = []
        for file_name in os.listdir(author_folder_path):
            if file_name.endswith(".py"):  # Process only Python files
                file_path = os.path.join(author_folder_path, file_name)
                ast_tree = extract_ast_from_file(file_path)
                authors_data[author_folder].append({
                    "file_name": file_name,
                    "ast_tree": ast_tree
                })


  dataset_path = "E:\drdo-internship\suresh sir\dataset-2"


In [2]:
import ast

def extract_features_from_ast(ast_tree):
    features = {
        "num_functions": sum(isinstance(node, ast.FunctionDef) for node in ast.walk(ast_tree)),
        "num_classes": sum(isinstance(node, ast.ClassDef) for node in ast.walk(ast_tree)),
        "num_assignments": sum(isinstance(node, ast.Assign) for node in ast.walk(ast_tree)),
        "num_loops": sum(isinstance(node, (ast.For, ast.While)) for node in ast.walk(ast_tree)),
        "num_conditionals": sum(isinstance(node, ast.If) for node in ast.walk(ast_tree)),
        
        # New features
        "num_imports": sum(isinstance(node, ast.Import) for node in ast.walk(ast_tree)),
        "num_import_from": sum(isinstance(node, ast.ImportFrom) for node in ast.walk(ast_tree)),
        
        # Maximum depth of AST tree
        "max_depth": get_max_depth(ast_tree),
        
        # Count the number of lambda functions
        "num_lambda": sum(isinstance(node, ast.Lambda) for node in ast.walk(ast_tree)),
        
        # Count the number of docstrings in functions and classes
        "num_docstrings": sum(isinstance(node, ast.FunctionDef) and ast.get_docstring(node) is not None for node in ast.walk(ast_tree)) +
                          sum(isinstance(node, ast.ClassDef) and ast.get_docstring(node) is not None for node in ast.walk(ast_tree)),
        
        # Count the number of try/except blocks
        "num_try_blocks": sum(isinstance(node, ast.Try) for node in ast.walk(ast_tree)),
        
        # Count return statements
        "num_return_statements": sum(isinstance(node, ast.Return) for node in ast.walk(ast_tree)),
    }
    return features

def get_max_depth(node, depth=1):
    """Recursively compute the maximum depth of the AST."""
    if isinstance(node, ast.AST):
        return max(get_max_depth(child, depth + 1) for child in ast.iter_child_nodes(node)) if len(list(ast.iter_child_nodes(node))) > 0 else depth
    return depth

# Example: Extract features for each file
for author in authors_data:
    for file_data in authors_data[author]:
        ast_tree = file_data["ast_tree"]
        features = extract_features_from_ast(ast_tree)
        file_data["features"] = features


In [3]:
def extract_paths_from_ast(ast_tree, parent=None):
    if isinstance(ast_tree, ast.AST):
        if parent is not None:
            yield f"{parent} -> {ast_tree.__class__.__name__}"
        for child in ast.iter_child_nodes(ast_tree):
            yield from extract_paths_from_ast(child, ast_tree.__class__.__name__)

# Example: Extract paths for each file
for author in authors_data:
    for file_data in authors_data[author]:
        ast_tree = file_data["ast_tree"]
        ast_paths = list(extract_paths_from_ast(ast_tree))
        file_data["ast_paths"] = ast_paths


In [6]:
from collections import Counter

# New: Extract features from AST paths
def extract_features_from_ast_paths(ast_paths):
    path_lengths = [len(path.split(" -> ")) for path in ast_paths]
    path_counter = Counter(ast_paths)

    features = {
        "avg_path_length": sum(path_lengths) / len(path_lengths) if path_lengths else 0,
        "max_path_length": max(path_lengths) if path_lengths else 0,
        "min_path_length": min(path_lengths) if path_lengths else 0,
        "unique_paths": len(set(ast_paths)),
        "most_common_path": path_counter.most_common(1)[0][0] if path_counter else None,
        "most_common_path_count": path_counter.most_common(1)[0][1] if path_counter else 0,
    }

    return features


path_features = extract_features_from_ast_paths(ast_paths)
file_data["features"].update(path_features)

In [9]:
print(file_data)

{'file_name': 'unique.py', 'ast_tree': <ast.Module object at 0x00000260FE3FFFD0>, 'features': {'num_functions': 4, 'num_classes': 0, 'num_assignments': 9, 'num_loops': 2, 'num_conditionals': 2, 'num_imports': 0, 'num_import_from': 0, 'max_depth': 9, 'num_lambda': 0, 'num_docstrings': 0, 'num_try_blocks': 0, 'num_return_statements': 6, 'avg_path_length': 2.0, 'max_path_length': 2, 'min_path_length': 2, 'unique_paths': 36, 'most_common_path': 'Name -> Load', 'most_common_path_count': 44}, 'ast_paths': ['Module -> Expr', 'Expr -> Constant', 'Module -> Assign', 'Assign -> Name', 'Name -> Store', 'Assign -> Constant', 'Module -> Assign', 'Assign -> Name', 'Name -> Store', 'Assign -> Constant', 'Module -> FunctionDef', 'FunctionDef -> arguments', 'arguments -> arg', 'FunctionDef -> Assign', 'Assign -> Name', 'Name -> Store', 'Assign -> Call', 'Call -> Attribute', 'Attribute -> Name', 'Name -> Load', 'Attribute -> Load', 'Call -> Constant', 'Call -> Constant', 'FunctionDef -> Return', 'Return

In [10]:
import ast

def prepare_paths_for_code2seq(ast_paths, max_paths=200):
    """
    Prepares AST paths for Code2Seq in the required <start_token, path, end_token> format.

    Args:
        ast_paths (list): List of AST paths in the form "Node1 -> Node2 -> Node3".
        max_paths (int): Maximum number of paths to include per sample.

    Returns:
        list: Formatted paths for Code2Seq in the format <start_token, path, end_token>.
    """
    formatted_paths = []
    for path in ast_paths:
        # Split the path into nodes
        nodes = path.split(" -> ")
        if len(nodes) < 2:
            continue  # Skip invalid paths

        # Extract start, path, and end tokens
        start_token = tokenize_node(nodes[0])
        end_token = tokenize_node(nodes[-1])
        middle_path = '|'.join(tokenize_node(node) for node in nodes[1:-1])  # Middle part of the path

        # Format: <start_token, path, end_token>
        formatted_paths.append(f"{start_token},{middle_path},{end_token}")

    # Limit the number of paths to max_paths
    if len(formatted_paths) > max_paths:
        formatted_paths = formatted_paths[:max_paths]

    return formatted_paths

def tokenize_node(node_name):
    """
    Tokenizes a node name into sub-tokens. Example: "FunctionDef" -> "Function Def".

    Args:
        node_name (str): The name of the AST node.

    Returns:
        str: Tokenized node name.
    """
    tokens = []
    current_token = ""
    for char in node_name:
        if char.isupper() and current_token:
            tokens.append(current_token)
            current_token = char
        else:
            current_token += char
    if current_token:
        tokens.append(current_token)
    return '_'.join(tokens).lower()

# Example: Prepare Code2Seq input for each file
code2seq_data = []
for author in authors_data:
    for file_data in authors_data[author]:
        ast_paths = file_data["ast_paths"]
        formatted_paths = prepare_paths_for_code2seq(ast_paths)

        # Example label for Code2Seq (e.g., filename or author)
        label = file_data.get("label", author)

        # Add to Code2Seq dataset
        code2seq_data.append({
            "label": label,
            "paths": formatted_paths
        })

# Save to file in Code2Seq format
with open("code2seq_input.txt", "w", encoding="utf-8") as f:
    for sample in code2seq_data:
        label = sample["label"]
        paths = " ".join(sample["paths"])
        f.write(f"{label} {paths}\n")


In [11]:
print(code2seq_data)

[{'label': 'Alice Gana', 'paths': ['module,,import_from', 'import_from,,alias', 'module,,import_from', 'import_from,,alias', 'module,,import_from', 'import_from,,alias', 'module,,import', 'import,,alias', 'module,,assign', 'assign,,name', 'name,,store', 'assign,,call', 'call,,name', 'name,,load', 'module,,expr', 'expr,,call', 'call,,attribute', 'attribute,,name', 'name,,load', 'attribute,,load', 'call,,constant', 'module,,assign', 'assign,,name', 'name,,store', 'assign,,constant', 'module,,assign', 'assign,,name', 'name,,store', 'assign,,call', 'call,,attribute', 'attribute,,name', 'name,,load', 'attribute,,load', 'call,,name', 'name,,load', 'module,,expr', 'expr,,call', 'call,,attribute', 'attribute,,name', 'name,,load', 'attribute,,load', 'call,,constant', 'module,,expr', 'expr,,call', 'call,,attribute', 'attribute,,name', 'name,,load', 'attribute,,load', 'call,,constant', 'module,,assign', 'assign,,name', 'name,,store', 'assign,,call', 'call,,attribute', 'attribute,,name', 'name,,lo

In [12]:
from collections import Counter

def build_vocabulary(data):
    token_counter = Counter()
    for sample in data:
        for path in sample["paths"]:
            tokens = path.replace('|', ',').split(',')  # Split tokens
            token_counter.update(tokens)

    vocab = {token: idx + 1 for idx, token in enumerate(token_counter.keys())}  # Start indexing from 1
    vocab["<PAD>"] = 0  # Padding token
    return vocab

vocab = build_vocabulary(code2seq_data)
print("Vocabulary:", vocab)


Vocabulary: {'module': 1, '': 2, 'import_from': 3, 'alias': 4, 'import': 5, 'assign': 6, 'name': 7, 'store': 8, 'call': 9, 'load': 10, 'expr': 11, 'attribute': 12, 'constant': 13, 'list': 14, 'function_def': 15, 'arguments': 16, 'bin_op': 17, 'add': 18, 'global': 19, 'sub': 20, 'while': 21, 'compare': 22, 'eq': 23, 'div': 24, 'return': 25, 'tuple': 26, 'arg': 27, 'mult': 28, 'not_eq': 29, 'for': 30, 'joined_str': 31, 'formatted_value': 32, 'if': 33, 'unary_op': 34, 'not': 35, 'bool_op': 36, 'or': 37, 'subscript': 38, 'slice': 39, 'u_sub': 40, 'gt_e': 41, 'aug_assign': 42, 'mod': 43, 'gt': 44, 'lt': 45, 'keyword': 46, 'dict': 47, 'break': 48, 'class_def': 49, 'in': 50, 'pow': 51, 'try': 52, 'except_handler': 53, 'lt_e': 54, 'list_comp': 55, 'comprehension': 56, 'and': 57, 'assert': 58, 'generator_exp': 59, 'bit_and': 60, 'with': 61, 'withitem': 62, 'is': 63, 'lambda': 64, 'bit_or': 65, 'raise': 66, 'not_in': 67, 'is_not': 68, 'pass': 69, 'continue': 70, 'starred': 71, 'if_exp': 72, 'yie

In [13]:
def encode_paths(paths, vocab, max_path_length=10):
    encoded_paths = []
    for path in paths:
        tokens = path.split(',')  # Split the path into tokens
        token_indices = [vocab.get(token, vocab["<PAD>"]) for token in tokens]
        if len(token_indices) < max_path_length:
            token_indices += [vocab["<PAD>"]] * (max_path_length - len(token_indices))  # Pad
        else:
            token_indices = token_indices[:max_path_length]  # Truncate
        encoded_paths.append(token_indices)
    return encoded_paths


In [14]:
def encode_labels(data):
    labels = {sample["label"] for sample in data}
    label_to_index = {label: idx for idx, label in enumerate(labels)}
    for sample in data:
        sample["encoded_label"] = label_to_index[sample["label"]]
    return label_to_index

label_to_index = encode_labels(code2seq_data)
print("Label to Index Mapping:", label_to_index)


Label to Index Mapping: {'AMAN KHARWAL': 0, 'Mushinako': 1, 'Vincent Russo': 2, 'Kamyu': 3, 'Kenneth Reitz Archive': 4, 'Imad': 5, 'Nikhil Mahajan': 6, 'phitronio': 7, 'Samuel': 8, 'Jaredliw': 9, 'Armin Ronacher': 10, 'Donne Martin': 11, 'Garvit Bansal': 12, 'Sebastian Castañeda': 13, 'Tuhin Mitra': 14, 'Mahamudul Hasan Mithhu': 15, 'Subham Das': 16, 'Tushar Nankani': 17, 'David Beazley': 18, 'Alice Gana': 19}


In [15]:
def prepare_dataset(data, vocab, max_paths=200, max_path_length=10):
    for sample in data:
        encoded_paths = encode_paths(sample["paths"], vocab, max_path_length)
        if len(encoded_paths) < max_paths:
            empty_path = [vocab["<PAD>"]] * max_path_length
            encoded_paths += [empty_path] * (max_paths - len(encoded_paths))  # Pad paths
        else:
            encoded_paths = encoded_paths[:max_paths]  # Truncate paths
        sample["final_encoded_paths"] = encoded_paths
    return data

max_paths = 200
max_path_length = 10
code2seq_data = prepare_dataset(code2seq_data, vocab, max_paths, max_path_length)


In [16]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(code2seq_data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")


Train: 280, Val: 60, Test: 60


In [17]:
import torch

def convert_to_tensors(data):
    inputs, labels = [], []
    for sample in data:
        inputs.append(sample["final_encoded_paths"])  # Encoded paths
        labels.append(sample["encoded_label"])  # Encoded label
    inputs_tensor = torch.tensor(inputs, dtype=torch.long)  # Long for embeddings
    labels_tensor = torch.tensor(labels, dtype=torch.long)  # Long for classification
    return inputs_tensor, labels_tensor

train_inputs, train_labels = convert_to_tensors(train_data)
val_inputs, val_labels = convert_to_tensors(val_data)
test_inputs, test_labels = convert_to_tensors(test_data)


In [18]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 32

train_dataset = TensorDataset(train_inputs, train_labels)
val_dataset = TensorDataset(val_inputs, val_labels)
test_dataset = TensorDataset(test_inputs, test_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Code2SeqFeatureExtractor(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes):
        super(Code2SeqFeatureExtractor, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        
        # GRU layer
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
        
        # Feature extraction layers
        self.lexical_fc = nn.Linear(hidden_size, 128)  # Lexical feature layer
        self.syntactic_fc = nn.Linear(hidden_size, 128)  # Syntactic feature layer
        self.semantic_fc = nn.Linear(hidden_size, 128)  # Semantic feature layer
        self.layout_fc = nn.Linear(hidden_size, 128)  # Layout feature layer
        
        # Final classification layer
        self.fc = nn.Linear(128 * 4, num_classes)  # Concatenate features
        
    def forward(self, x):
        batch_size, num_paths, path_length = x.size()
        
        # Flatten input for embedding
        x = x.view(batch_size * num_paths, path_length)
        
        # Embedding layer
        embedded = self.embedding(x)  # Shape: (batch_size * num_paths, path_length, embed_size)
        
        # GRU layer
        _, hidden = self.gru(embedded)  # hidden: (1, batch_size * num_paths, hidden_size)
        hidden = hidden.squeeze(0)  # Shape: (batch_size * num_paths, hidden_size)
        
        # Extract features
        lexical_features = F.relu(self.lexical_fc(hidden))
        syntactic_features = F.relu(self.syntactic_fc(hidden))
        semantic_features = F.relu(self.semantic_fc(hidden))
        layout_features = F.relu(self.layout_fc(hidden))
        
        # Concatenate features
        combined_features = torch.cat([lexical_features, syntactic_features, semantic_features, layout_features], dim=-1)
        
        # Reshape combined features back to batch size
        combined_features = combined_features.view(batch_size, num_paths, -1)
        
        # Aggregate path-level features (mean pooling across all paths)
        aggregated_features = combined_features.mean(dim=1)  # Shape: (batch_size, feature_dim)
        
        # Classification
        output = self.fc(aggregated_features)  # Shape: (batch_size, num_classes)
        
        return output, combined_features  # Return output and extracted features


In [20]:
# Hyperparameters
vocab_size = len(vocab)
embed_size = 128
hidden_size = 256
num_classes = len(label_to_index)
num_epochs = 10
learning_rate = 0.001

# Initialize model, loss function, and optimizer
model = Code2SeqFeatureExtractor(vocab_size, embed_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs, _ = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

# Validation loop
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch_inputs, batch_labels in val_loader:
        outputs, _ = model(batch_inputs)
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
    print(f"Validation Accuracy: {100 * correct / total:.2f}%")


Epoch 1/10, Loss: 3.0362
Epoch 2/10, Loss: 2.9904
Epoch 3/10, Loss: 3.0411
Epoch 4/10, Loss: 2.9926
Epoch 5/10, Loss: 2.9694
Epoch 6/10, Loss: 2.8884
Epoch 7/10, Loss: 2.6362
Epoch 8/10, Loss: 2.6918
Epoch 9/10, Loss: 2.5412
Epoch 10/10, Loss: 2.7177
Validation Accuracy: 11.67%


In [21]:
# Feature extraction example
model.eval()
extracted_features = []
labels = []
with torch.no_grad():
    for batch_inputs, batch_labels in train_loader:
        _, features = model(batch_inputs)  # Extract features
        extracted_features.append(features.numpy())
        labels.append(batch_labels.numpy())

# Convert to arrays
import numpy as np
extracted_features = np.concatenate(extracted_features, axis=0)
labels = np.concatenate(labels, axis=0)


In [22]:
print(extracted_features)

[[[0.5822367  0.         1.8872985  ... 0.         0.         0.        ]
  [0.5578647  0.         1.9862777  ... 0.         0.         0.        ]
  [0.66438174 0.         1.9724022  ... 0.         0.         0.        ]
  ...
  [0.42959622 0.         1.5324093  ... 0.         0.         0.        ]
  [0.42959622 0.         1.5324093  ... 0.         0.         0.        ]
  [0.42959622 0.         1.5324093  ... 0.         0.         0.        ]]

 [[0.         0.         1.2875683  ... 0.         0.         0.8496124 ]
  [0.         0.         0.8620445  ... 0.         0.         0.8673807 ]
  [1.6387086  0.6687832  0.         ... 0.48728248 0.01482338 0.        ]
  ...
  [0.42959622 0.         1.5324093  ... 0.         0.         0.        ]
  [0.42959622 0.         1.5324093  ... 0.         0.         0.        ]
  [0.42959622 0.         1.5324093  ... 0.         0.         0.        ]]

 [[0.         0.04726515 0.         ... 0.04758558 0.         2.2993746 ]
  [0.         0.119408

In [24]:
import numpy as np

# Flatten each feature array
flattened_features = np.array([features.flatten() for features in extracted_features])

print(flattened_features.shape)  # This should be (n_samples, n_features)


(280, 102400)


In [26]:
import numpy as np

# Calculate the maximum number of paths dynamically
MAX_PATHS = int(np.percentile([len(paths) for paths in extracted_features], 95))
print("MAX_PATHS:", MAX_PATHS)


MAX_PATHS: 200


In [27]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences to the maximum number of paths across samples
padded_features = pad_sequences(
    extracted_features, maxlen=MAX_PATHS, padding='post', dtype='float32'
)

# Reshape into 2D for normalization
reshaped_features = padded_features.reshape(len(padded_features), -1)


In [28]:
# Normalize reshaped features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(reshaped_features)


In [29]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    normalized_features, labels, test_size=0.2, random_state=42
)


In [30]:
from sklearn.svm import SVC

# Initialize the SVM model
svm_model = SVC(kernel='linear', C=1.0, random_state=42)

# Train the SVM model
svm_model.fit(X_train, y_train)

print("SVM model trained successfully!")


SVM model trained successfully!


In [31]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Test Accuracy: 32.14%
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.67      1.00      0.80         2
           2       0.00      0.00      0.00         5
           3       0.67      0.57      0.62         7
           4       0.25      0.67      0.36         3
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         4
           7       0.50      0.67      0.57         3
           8       0.00      0.00      0.00         0
           9       0.33      1.00      0.50         2
          10       0.00      0.00      0.00         3
          11       1.00      1.00      1.00         3
          12       0.00      0.00      0.00         2
          13       0.25      0.33      0.29         3
          14       0.00      0.00      0.00         3
          15       0.33      0.33      0.33         3
          16       0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    normalized_features, labels, test_size=0.2, random_state=42
)

# Encode the labels into one-hot format
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Convert labels to one-hot encoding
one_hot_encoder = OneHotEncoder(sparse_output=False)
y_train_one_hot = one_hot_encoder.fit_transform(y_train_encoded.reshape(-1, 1))
y_test_one_hot = one_hot_encoder.transform(y_test_encoded.reshape(-1, 1))

# Define the DNN model
model = Sequential([
    Dense(256, activation='relu', input_dim=X_train.shape[1]),
    Dropout(0.3),  # Dropout to prevent overfitting
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y_train_one_hot.shape[1], activation='softmax')  # Output layer
])

# Compile the model
model.compile(
    optimizer='adam',  # Adam optimizer for efficient training
    loss='categorical_crossentropy',  # Loss for multi-class classification
    metrics=['accuracy']  # Track accuracy during training
)

# Train the model
history = model.fit(
    X_train, y_train_one_hot,
    validation_split=0.2,  # Reserve part of training data for validation
    epochs=20,  # Number of epochs
    batch_size=32,  # Mini-batch size
    verbose=1  # Display training progress
)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test_one_hot, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Ensure the classes are strings
class_names = [str(cls) for cls in label_encoder.classes_]

# Generate predictions
y_pred = np.argmax(model.predict(X_test), axis=1)

# Print classification report with proper class names
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred, target_names=class_names))



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 811ms/step - accuracy: 0.0903 - loss: 22.7280 - val_accuracy: 0.1778 - val_loss: 20.4098
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 420ms/step - accuracy: 0.1848 - loss: 38.4686 - val_accuracy: 0.1778 - val_loss: 24.7184
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 375ms/step - accuracy: 0.1499 - loss: 38.8890 - val_accuracy: 0.1556 - val_loss: 20.2214
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 376ms/step - accuracy: 0.2415 - loss: 31.5683 - val_accuracy: 0.1778 - val_loss: 19.6953
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 391ms/step - accuracy: 0.1914 - loss: 33.1132 - val_accuracy: 0.0889 - val_loss: 17.3436
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 375ms/step - accuracy: 0.2604 - loss: 24.8885 - val_accuracy: 0.1556 - val_loss: 18.1263
Epoch 7/20
[1m6/6[0m [32m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
# Pad the sequences (tokenized paths)
padded_features = pad_sequences(extracted_features, maxlen=MAX_PATH_LENGTH, padding='post', dtype='float32')

# Ensure padded_features is 2D (batch_size, sequence_length)
print(f"Shape of padded_features: {padded_features.shape}")  # Check the shape

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split data into train and test
X_train, X_test, y_train, y_test = train_test_split(padded_features, encoded_labels, test_size=0.2, random_state=42)

# Build LSTM Model
model = Sequential()

# Embedding Layer
model.add(Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_PATH_LENGTH))

# LSTM Layer (will accept data with shape: [batch_size, sequence_length, embedding_dim])
model.add(LSTM(64, return_sequences=False))  # 64 units, no need for return_sequences as we only need final output

# Dropout for regularization
model.add(Dropout(0.2))

# Fully Connected Layer
model.add(Dense(64, activation='relu'))

# Output Layer (Softmax for multi-class classification)
model.add(Dense(NUM_CLASSES, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f'Test Accuracy: {test_acc * 100:.2f}%')

# Generate predictions and classification report
y_pred = np.argmax(model.predict(X_test), axis=1)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Shape of padded_features: (280, 100, 512)
Epoch 1/10




ValueError: Input 0 of layer "lstm_5" is incompatible with the layer: expected ndim=3, found ndim=4. Full shape received: (32, 100, 512, 50)

In [42]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


In [43]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)


In [44]:
X_train, X_test, y_train, y_test = train_test_split(extracted_features, y_encoded, test_size=0.2, random_state=42)


In [45]:
lda = LinearDiscriminantAnalysis()


In [47]:
X_train_2d = X_train.reshape(X_train.shape[0], -1)  # Flatten each sample into a vector
X_test_2d = X_test.reshape(X_test.shape[0], -1)


In [48]:
lda.fit(X_train_2d, y_train)


In [50]:
y_pred = lda.predict(X_test_2d)

target_names = [str(cls) for cls in label_encoder.classes_]
print(classification_report(y_test, y_pred, target_names=target_names))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.67      1.00      0.80         2
           2       1.00      0.20      0.33         5
           3       0.40      0.29      0.33         7
           4       0.50      0.33      0.40         3
           5       0.00      0.00      0.00         2
           6       1.00      0.25      0.40         4
           7       0.67      0.67      0.67         3
           8       0.00      0.00      0.00         0
           9       1.00      1.00      1.00         2
          10       1.00      0.33      0.50         3
          11       0.75      1.00      0.86         3
          12       0.00      0.00      0.00         2
          13       0.17      0.33      0.22         3
          14       0.20      0.33      0.25         3
          15       0.29      0.67      0.40         3
          16       0.33      0.33      0.33         3
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [53]:
jupyter nbconvert --to script Untitled-Copy1.ipynb


SyntaxError: invalid syntax (3185636809.py, line 1)