In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectFromModel

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, KFold

In [None]:
# Read the pipe-delimited malware dataset and preview its first ten rows
df = pd.read_csv('MalwareData.csv', delimiter='|')
df.head(n=10)

In [None]:
# Remove the 'Name' and 'md5' columns (presumably per-sample identifiers, not
# predictive features -- confirm against the dataset description).
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
df = df.drop(columns=['Name', 'md5'], errors='ignore')

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Report the number of distinct values per column, to spot any column that
# holds only a single value
for col in df.columns:
    n_unique = len(df[col].unique())
    print(col + ": {} unique values".format(n_unique))

In [None]:
#dict = {332:"x86", 34404 : "x64", 512:"IA64"}
#df['machine_pe'] = ""
#for index, row in df.iterrows():
#    val = row['Machine']
#    df.at[index, 'machine_pe'] = dict[val]
#df = df.drop(columns = ['Machine'])
#df = pd.get_dummies(df, columns = ['machine_pe'], dtype='int')
#df

In [None]:
# Target variable: the 'legitimate' column
y = df['legitimate']

# Feature matrix: every column except the target
X = df.drop(columns=['legitimate'])

# Standardize the features.
# NOTE(review): the scaler is fitted on all rows here, before run_model makes
# its train/test split, so held-out rows contribute to the scaling statistics.
# Consider fitting the scaler on the training portion only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
def run_model(model_type, model, X, y, test_size=0.20, random_state=42, num_folds=5):
    """Fit `model` on a hold-out split, report its accuracy, then cross-validate it.

    Steps performed:
      1. Split X / y into train and test parts.
      2. Fit `model` on the training part and print the wall-clock fit time.
      3. Predict on the test part; print the accuracy and plot the confusion matrix.
      4. Run shuffled K-fold cross-validation on the full X / y and print the
         per-fold scores, their mean and their standard deviation.

    Parameters
    ----------
    model_type : str
        Human-readable name printed as the report header.
    model : estimator
        Classifier exposing fit / predict and, after fitting, `classes_`.
        It is fitted in place on the training part.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.
    test_size : float, default 0.20
        Fraction of rows held out for the test part.
    random_state : int or None, default 42
        Seed for both the train/test split and the K-fold shuffling, so that
        repeated runs print the same numbers. Pass None for an unseeded split.
    num_folds : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    print(model_type, "classifier:")

    # Seed the split: without a seed the accuracy and confusion matrix change
    # on every run, which makes the models in this notebook impossible to compare.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the classifier and time only the fit itself
    start_time = time.time()
    model.fit(X_train, y_train)
    time_taken = time.time() - start_time
    print(f"Time taken to run the code: {time_taken} seconds")

    # Hold-out evaluation
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot()
    plt.show()

    # K-fold cross-validation on the full data set
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    cv_scores = cross_val_score(model, X, y, cv=kf)

    print("Cross-validation scores:", cv_scores)
    print("Mean CV accuracy:", cv_scores.mean())
    print("Standard deviation of CV accuracy:", cv_scores.std())

In [None]:
# Draw the pairwise feature correlations of X as a heatmap on a large canvas
f, ax = plt.subplots(figsize=(32, 32))
correlation_matrix = X.corr()
sns.heatmap(data=correlation_matrix, cmap='coolwarm', linewidth=.5, ax=ax)
# Title the heatmap axes and render the figure
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Flag every feature whose absolute correlation with any OTHER feature exceeds
# the threshold.
CORR_THRESHOLD = 0.6

# Mask self-correlation by position (the diagonal) instead of testing the value
# against 1: a diagonal entry can come out as 0.9999... and would then flag the
# column, while a genuinely duplicated column (correlation exactly 1) would be
# zeroed out and missed.
abs_corr = correlation_matrix.abs().to_numpy(copy=True)
np.fill_diagonal(abs_corr, 0.0)

# DataFrame.max skips NaN entries (e.g. correlations with a constant column),
# unlike the built-in max(), whose result depends on where the NaN sits.
max_abs_corr = pd.DataFrame(abs_corr, columns=correlation_matrix.columns).max()

# NOTE(review): this flags BOTH members of a correlated pair; if the intent is
# to keep one representative per pair, scan only the upper triangle instead.
cols_to_drop = [col for col in X.columns if max_abs_corr[col] > CORR_THRESHOLD]
print(cols_to_drop)
print(len(cols_to_drop))

In [None]:
# Drop the features flagged as highly correlated, then standardize what is left.
X_lr = X.drop(columns=cols_to_drop)
# Use a dedicated scaler: refitting the shared `scaler` here would silently
# overwrite the statistics it learned from the full feature matrix X.
scaler_lr = StandardScaler()
X_scaled_lr = scaler_lr.fit_transform(X_lr)

In [None]:
# Logistic regression on the standardized features left after dropping cols_to_drop
lr_model = LogisticRegression(max_iter=1000, random_state=42)
run_model(model_type='Logistic Regression', model=lr_model, X=X_scaled_lr, y=y)

In [None]:
# Univariate selection: keep the 10 features that score best under f_classif
selector_uv = SelectKBest(f_classif, k=10)
X_uv = selector_uv.fit_transform(X_scaled, y)
print(X_uv.shape)
# Take the feature names from X (the frame X_scaled was computed from) rather
# than df.columns[:-1], which is only correct when 'legitimate' happens to be
# the last column of df.
print(selector_uv.get_feature_names_out(input_features=X.columns))
# Evaluate a decision tree on the selected features (note: despite the 'rf_'
# prefix this is a DecisionTreeClassifier, not a random forest)
rf_uv = DecisionTreeClassifier(random_state=42)
run_model('Decision Tree', rf_uv, X_uv, y)

In [None]:
# Recursive feature elimination down to 10 features, driven by a decision tree
model_rfe = DecisionTreeClassifier(random_state=42)
selector_rfe = RFE(estimator=model_rfe, n_features_to_select=10).fit(X_scaled, y)

In [None]:
# Names of the features RFE kept. The names come from X (the frame X_scaled was
# computed from); df.columns[:-1] would only match if 'legitimate' were the
# last column of df.
selector_rfe.get_feature_names_out(input_features=X.columns)

In [None]:
# Keep only the RFE-selected columns and evaluate a decision tree on them
X_rfe = selector_rfe.transform(X_scaled)
print(X_rfe.shape)
dt_rfe = DecisionTreeClassifier(random_state=42)
run_model(model_type='Decision Tree', model=dt_rfe, X=X_rfe, y=y)

In [None]:
# Tree-based selection: fit an extra-trees ensemble on all features, then wrap
# the already-fitted estimator in SelectFromModel to reduce the feature matrix
clf = ExtraTreesClassifier(max_features=15, random_state=42)
clf.fit(X_scaled, y)
model = SelectFromModel(estimator=clf, prefit=True)
X_tb = model.transform(X_scaled)
print(X_tb.shape)

In [None]:
# See which columns are selected: list the top `cols` features by importance,
# where `cols` is the number of columns SelectFromModel kept in X_tb.
cols = X_tb.shape[1]
# Read the importances once and label them with X.columns (the frame the model
# was trained from), the same labels the bar chart below uses. Indexing
# df.columns would mislabel features unless 'legitimate' is df's last column.
importances = clf.feature_importances_
feature_names = X.columns
inds = np.argsort(importances)[::-1][:cols]
for i in range(cols):
    print(feature_names[inds[i]], importances[inds[i]])

# Bar chart of the importance of every feature
plt.figure(figsize=(12, 12))
plt.bar(feature_names, importances)
plt.xticks(rotation=90)
plt.xlabel('Feature Labels')
plt.ylabel('Feature Importances')
plt.title('Comparison of different Feature Importances')
plt.show()

In [None]:
# Evaluate a decision tree on the features kept by the tree-based selector
dt_tb = DecisionTreeClassifier(random_state=42)
run_model(model_type='Decision Tree', model=dt_tb, X=X_tb, y=y)

In [None]:
# Train test split
#X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.20)

In [None]:
#models = {}
#models['Logistic Regression'] = LogisticRegression(random_state=42)
#models['Decision Trees'] = DecisionTreeClassifier(random_state=42)
#models['Random Forest'] = RandomForestClassifier(random_state=42)
#models['K-Nearest Neighbor'] = KNeighborsClassifier(n_neighbors = 5)
#models['SVM'] = svm.SVC(kernel='linear')
#models['Gaussian Naive Bayes'] = GaussianNB()