In [1]:
import re
import os
import pandas as pd
import numpy as np

# Define the feature extraction function (as shown earlier)
def extract_features(code_segment):
    lines = code_segment.split("\n")
    num_lines = len(lines)
    
    total_chars = 0
    total_words = 0
    comment_lines = 0
    identifier_lengths = []
    inline_spaces_tabs = 0
    trailing_spaces_tabs = 0
    indent_spaces_tabs = 0
    underscore_count = 0

    identifier_regex = re.compile(r'\b\w+\b')
    comment_regex = re.compile(r'#|/\*|\*/')
    
    for line in lines:
        stripped_line = line.strip()
        total_chars += len(line)
        total_words += len(stripped_line.split())
        
        if comment_regex.search(stripped_line):
            comment_lines += 1

        identifiers = identifier_regex.findall(stripped_line)
        identifier_lengths.extend([len(ident) for ident in identifiers])
        underscore_count += sum(ident.count('_') for ident in identifiers)

        inline_spaces_tabs += len(re.findall(r'[ \t]', stripped_line))
        trailing_spaces_tabs += len(re.findall(r'[ \t]+$', line))
        indent_spaces_tabs += len(re.findall(r'^[ \t]+', line))

    avg_line_length = total_chars / num_lines if num_lines > 0 else 0
    avg_line_words = total_words / num_lines if num_lines > 0 else 0
    avg_identifier_length = np.mean(identifier_lengths) if identifier_lengths else 0
    comments_frequency = comment_lines / num_lines if num_lines > 0 else 0

    return {
        "line_length": avg_line_length,
        "line_words": avg_line_words,
        "comments_frequency": comments_frequency,
        "identifier_length": avg_identifier_length,
        "inline_spaces_tabs": inline_spaces_tabs,
        "trailing_spaces_tabs": trailing_spaces_tabs,
        "indent_spaces_tabs": indent_spaces_tabs,
        "underscores": underscore_count,
    }

# Traverse the dataset directory and extract features
def process_dataset(dataset_path):
    data = []
    for author in os.listdir(dataset_path):
        author_path = os.path.join(dataset_path, author)
        if os.path.isdir(author_path):  # Check if it's a directory
            for file_name in os.listdir(author_path):
                file_path = os.path.join(author_path, file_name)
                if file_name.endswith('.py'):  # Only process Python files
                    try:
                        with open(file_path, "r", encoding="utf-8") as file:
                            code = file.read()
                            features = extract_features(code)
                            features["Author"] = author  # Add the author label
                            data.append(features)
                    except Exception as e:
                        print(f"Error processing {file_path}: {e}")
    return pd.DataFrame(data)

# Define dataset path
dataset_path = r"E:\drdo-internship\akash-paper4\data\authorship_attribution"

# Process the dataset
features_df = process_dataset(dataset_path)

# Save the processed features to a CSV file
features_df.to_csv("processed_dataset.csv", index=False)
print("Feature extraction completed. Saved to 'processed_dataset.csv'.")


Feature extraction completed. Saved to 'processed_dataset.csv'.


In [2]:
# Load the processed dataset
df = pd.read_csv("processed_dataset.csv")

# Separate features and labels
X = df.drop(columns=["Author"])  # Features
y = df["Author"]  # Labels

# Convert labels to one-hot encoding
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
y_encoded = encoder.fit_transform(y.values.reshape(-1, 1)).toarray()


from sklearn.model_selection import train_test_split

# Split data into training (67%) and testing (33%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.15, random_state=42)


In [3]:
import numpy as np

# Convert y_train and y_test from one-hot encoding to class indices
y_train_classes = np.argmax(y_train, axis=1)
y_test_classes = np.argmax(y_test, axis=1)


In [48]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the processed dataset
df = pd.read_csv("processed_dataset.csv")

# Separate features and labels
X = df.drop(columns=["Author"])  # Features
y = df["Author"]  # Labels

# Convert labels to one-hot encoding
encoder = OneHotEncoder()
y_encoded = encoder.fit_transform(y.values.reshape(-1, 1)).toarray()

# Split data into training (67%) and testing (33%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.15, random_state=42)

# Define the Deep Neural Network (DNN) model
model = Sequential()

# Input layer and first fully connected layer
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))  # Input dimension from the number of features
model.add(Dropout(0.5))  # Dropout for regularization

# Hidden layers with ReLU activation and Dropout layers
for _ in range(5):  # 7 additional fully connected layers
    model.add(Dense(64, activation='tanh'))
    model.add(Dropout(0.3))

# Output layer with softmax activation for multi-class classification
model.add(Dense(y_train.shape[1], activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Predicting the probabilities for the test set (to be used in the ensemble)
y_pred_prob = model.predict(X_test)

# Optionally, convert predictions to class labels
y_pred_classes = np.argmax(y_pred_prob, axis=1)

# For a base classifier, you would use `y_pred_prob` in your ensemble system


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.0102 - loss: 4.3929 - val_accuracy: 0.0190 - val_loss: 4.2773
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0145 - loss: 4.3548 - val_accuracy: 0.0190 - val_loss: 4.2837
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0154 - loss: 4.4239 - val_accuracy: 0.0095 - val_loss: 4.2848
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0203 - loss: 4.3926 - val_accuracy: 0.0190 - val_loss: 4.3251
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0148 - loss: 4.3510 - val_accuracy: 0.0000e+00 - val_loss: 4.3338
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0318 - loss: 4.3503 - val_accuracy: 0.0000e+00 - val_loss: 4.3631
Epoch 7/10
[1m19/19[0m [32m━━━━━━━━━━━━

In [49]:
from sklearn.metrics import accuracy_score

# Predict using each base classifier
y_pred_clf1 = model.predict(X_test)

y_pred_clf1 = np.argmax(y_pred_clf1, axis=1)  # For probabilistic outputs

# Calculate accuracy for each base classifier
acc_clf1 = accuracy_score(y_test_classes, y_pred_clf1)

print(f"Accuracy of DNN: {acc_clf1:.2f}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
Accuracy of DNN: 0.00


In [22]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest (CART-based) model
rf_cart = RandomForestClassifier(criterion='gini', n_estimators=100)
rf_cart.fit(X_train, y_train_classes)


In [None]:
y_pred_clf2 = rf_cart.predict(X_test)

acc_clf2 = accuracy_score(y_test_classes, y_pred_clf2)

print(f"Accuracy of rf-crt: {acc_clf2:.2f}")

In [23]:
# Train the Random Forest (C4.5-based) model
rf_c45 = RandomForestClassifier(criterion='entropy', n_estimators=100)
rf_c45.fit(X_train, y_train_classes)


In [None]:
y_pred_clf3 = rf_c45.predict(X_test)

acc_clf3 = accuracy_score(y_test_classes, y_pred_clf3)

print(f"Accuracy of rf-c45: {acc_clf3:.2f}")

In [24]:
from sklearn.svm import SVC

# Initialize and train the C-SVM model
cs_svm = SVC(probability=True)  # Enable probability estimates
cs_svm.fit(X_train, y_train_classes)


In [None]:
y_pred_clf4 = cs_svm.predict(X_test)

acc_clf4 = accuracy_score(y_test_classes, y_pred_clf4)

print(f"Accuracy of cs-svm: {acc_clf4:.2f}")

In [25]:
from sklearn.svm import NuSVC

# Initialize and train the ν-SVM model
nu_svm = NuSVC(kernel='rbf', nu=0.1, probability=True)  # Enable probability estimates
nu_svm.fit(X_train, y_train_classes)


In [26]:

y_pred_clf5 = nu_svm.predict(X_test)

acc_clf5 = accuracy_score(y_test_classes, y_pred_clf5)

# Print accuracies

print(f"Accuracy of nu-sv,: {acc_clf5:.2f}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Accuracy of DNN: 0.01
Accuracy of rf-crt: 0.30
Accuracy of rf-c45: 0.26
Accuracy of cs-svm: 0.01
Accuracy of nu-sv,: 0.16


In [9]:
# Get probability predictions from each base classifier
prob_rf_cart = rf_cart.predict_proba(X_train)  # Random Forest (CART-based)
prob_rf_c45 = rf_c45.predict_proba(X_train)  # Random Forest (C4.5-based)
prob_cs_svm = cs_svm.predict_proba(X_train)  # C-SVM
prob_nu_svm = nu_svm.predict_proba(X_train)  # ν-SVM

# Get probability predictions from the Deep Neural Network (DNN)
prob_dnn = model.predict(X_train)  # DNN


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [10]:
# Combine these probabilities into a new feature set (meta-features)
meta_features = np.hstack([prob_rf_cart, prob_rf_c45, prob_cs_svm, prob_nu_svm, prob_dnn])

# Get probability predictions from the base classifiers on the test set
prob_rf_cart_test = rf_cart.predict_proba(X_test)
prob_rf_c45_test = rf_c45.predict_proba(X_test)
prob_cs_svm_test = cs_svm.predict_proba(X_test)
prob_nu_svm_test = nu_svm.predict_proba(X_test)

# Get probability predictions from the Deep Neural Network (DNN) on the test set
prob_dnn_test = model.predict(X_test)

# Combine predictions from all classifiers for the meta-classifier input
meta_features_test = np.hstack([prob_rf_cart_test, prob_rf_c45_test, prob_cs_svm_test, prob_nu_svm_test, prob_dnn_test])
'''
# Make predictions using the meta-classifier
y_pred = meta_model.predict(meta_features_test)


meta_features_test = np.hstack([prob_rf_cart_test, prob_rf_c45_test, prob_cs_svm_test, prob_nu_svm_test, prob_dnn_test])'''


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step


'\n# Make predictions using the meta-classifier\ny_pred = meta_model.predict(meta_features_test)\n\n\nmeta_features_test = np.hstack([prob_rf_cart_test, prob_rf_c45_test, prob_cs_svm_test, prob_nu_svm_test, prob_dnn_test])'

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
import numpy as np

# Assuming you have already trained your base classifiers and obtained the probabilities (meta-features)
# Example: meta_features is a numpy array of shape (num_samples, num_meta_features)
# Example: meta_features.shape = (231, 5 * num_classes) if 5 classifiers with 'num_classes' outputs each

# Define the Meta-Classifying DNN model
meta_model = Sequential()

# Input layer: expects the number of meta-features as input
meta_model.add(Dense(128, input_dim=meta_features.shape[1], activation='relu'))

# Add 8 fully connected layers with ReLU activation and Dropout layers
for _ in range(8):
    meta_model.add(Dense(128, activation='relu'))
    meta_model.add(Dropout(0.5))

# Add two fully connected layers before the final output layer
meta_model.add(Dense(128, activation='relu'))
meta_model.add(Dense(128, activation='relu'))

# Add dropout layer after the fully connected layers
meta_model.add(Dropout(0.5))

# Add fully connected layers with dropout as per the architecture description
meta_model.add(Dense(128, activation='relu'))
meta_model.add(Dropout(0.5))

# Output layer with softmax activation for multi-class classification
meta_model.add(Dense(y_train.shape[1], activation='softmax'))

# Compile the model using Stochastic Gradient Descent (SGD)
meta_model.compile(loss='categorical_crossentropy', optimizer=SGD(), metrics=['accuracy'])

# Train the meta-classifier
meta_model.fit(meta_features, y_train, epochs=10, batch_size=32, validation_data=(meta_features_test, y_test))

# Predict the final output (class label) for the test set
final_output = meta_model.predict(meta_features_test)

# Convert final output probabilities to class labels
final_class_labels = np.argmax(final_output, axis=1)

# The final output (final_class_labels) is the prediction of the author for each code sample


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.0105 - loss: 4.2506 - val_accuracy: 0.0000e+00 - val_loss: 4.2489
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0150 - loss: 4.2533 - val_accuracy: 0.0000e+00 - val_loss: 4.2493
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0196 - loss: 4.2490 - val_accuracy: 0.0000e+00 - val_loss: 4.2496
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0125 - loss: 4.2493 - val_accuracy: 0.0000e+00 - val_loss: 4.2500
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.0216 - loss: 4.2515 - val_accuracy: 0.0000e+00 - val_loss: 4.2502
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0076 - loss: 4.2494 - val_accuracy: 0.0000e+00 - val_loss: 4.2506
Epoch 7/10


In [18]:
meta_features_test

array([[0.01      , 0.        , 0.14      , ..., 0.01411631, 0.01490927,
        0.01420087],
       [0.        , 0.        , 0.11      , ..., 0.01411593, 0.01490857,
        0.01420033],
       [0.        , 0.        , 0.        , ..., 0.01411718, 0.01490861,
        0.01420151],
       ...,
       [0.        , 0.16      , 0.        , ..., 0.01412011, 0.01490871,
        0.01420427],
       [0.        , 0.05      , 0.        , ..., 0.01411599, 0.01490857,
        0.01420038],
       [0.01      , 0.        , 0.        , ..., 0.01411906, 0.01490868,
        0.01420329]])

In [12]:
# Evaluate the meta-classifier on the test set
loss, accuracy = meta_model.evaluate(meta_features_test, y_test)

# Print the accuracy
print(f"Accuracy of the meta-classifier: {accuracy * 100:.2f}%")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.0000e+00 - loss: 4.2514
Accuracy of the meta-classifier: 0.00%
