In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [None]:
class ModelWrapper:
    """Column selection + standard scaling + XGBoost classifier in one picklable object.

    The wrapper keeps the ``StandardScaler`` and the ``XGBClassifier`` together
    with the feature columns they operate on, so the same column subset and
    scaling are applied at training and at prediction time.
    """

    def __init__(self, selected_features, n_components=0.95, random_state=42):
        """Create an unfitted scaler and XGBoost classifier.

        Args:
            selected_features: Column labels selected from every DataFrame
                passed to ``fit``, ``transform``, ``predict`` and ``evaluate``.
            n_components: Unused. This pipeline has no PCA step; the parameter
                is retained only so existing callers keep working.
            random_state: Seed forwarded to ``XGBClassifier`` so the row/column
                subsampling configured below is controlled by the caller.
        """
        self.random_state = random_state
        self.scaler = StandardScaler()
        # Hyperparameters are hand-picked/assumed values; treat them as tunable.
        self.model = xgb.XGBClassifier(
            n_estimators=100,
            max_depth=7,
            learning_rate=0.2,
            subsample=0.8,
            colsample_bytree=0.8,
            # Previously the seed was stored but never handed to the model.
            random_state=random_state,
        )
        # A plain list makes ``X[self.selected_features]`` an unambiguous
        # multi-column selection regardless of the container passed in
        # (numpy array, tuple, pandas Index, ...).
        self.selected_features = list(selected_features)

    def fit(self, X_train, y_train):
        """Fit the scaler on the selected training columns, then train the classifier.

        Args:
            X_train: DataFrame containing at least ``selected_features``.
            y_train: Target labels passed unchanged to ``XGBClassifier.fit``.
        """
        print("\n=== Starting Model Training ===")

        # Scaling statistics are learned from the training data only.
        X_train_scaled = self.scaler.fit_transform(X_train[self.selected_features])

        print("\n=== Training Model ===")
        self.model.fit(X_train_scaled, y_train)

    def transform(self, X):
        """Select the feature columns of ``X`` and scale them with the fitted scaler."""
        return self.scaler.transform(X[self.selected_features])

    def predict(self, X):
        """Return the classifier's predictions for the rows of DataFrame ``X``."""
        return self.model.predict(self.transform(X))

    def evaluate(self, X_test, y_test):
        """Print and return the accuracy of the classifier on ``X_test``/``y_test``."""
        preds = self.predict(X_test)
        acc = accuracy_score(y_test, preds)

        # The message used to say "Random Forest"; the model is XGBoost.
        print(f"\nXGBoost Accuracy: {acc:.4f}")
        return acc

    def save(self, filename="model.pkl"):
        """Pickle the whole wrapper (scaler, classifier, feature list) to ``filename``."""
        with open(filename, "wb") as f:
            pickle.dump(self, f)
        print(f"\n✅ Model saved as '{filename}'")

    @staticmethod
    def load(filename="model.pkl"):
        """Load a wrapper previously written by ``save``.

        SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
        file. Only load pickles that you created yourself or fully trust.
        """
        with open(filename, "rb") as f:
            model = pickle.load(f)
        print(f"\n✅ Model loaded from '{filename}'")
        return model


In [None]:
# Read the merged feature table, take 'Type' as the target, and keep every
# column except the target and the listed non-feature columns as model inputs.
df = pd.read_csv('/kaggle/working/merged_selected_features.csv')
y = df['Type']
X = df.drop(
    columns=['Type', 'Unnamed: 0', 'SHA256', 'Unnamed: 0.1'],
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training the model

In [None]:
# Column labels the wrapper selects from each DataFrame before scaling.
sel_feature_list = np.load("/kaggle/input/sel-featurelist/selected_features.npy")

# NOTE: despite the `rf_` prefix, ModelWrapper builds an xgb.XGBClassifier.
rf_model = ModelWrapper(selected_features=sel_feature_list)
rf_model.fit(X_train=X_train, y_train=y_train)
rf_model.evaluate(X_test=X_test, y_test=y_test)

In [None]:
# Pickle the fitted wrapper to a path relative to the current working directory.
rf_model.save("xgb_model.pkl")

# Loading the saved model

In [None]:
# Restore the pickled wrapper; unpickling executes code, so only load files you trust.
model = ModelWrapper.load(filename="/kaggle/working/xgb_model.pkl")

In [None]:
# Read the test CSV that predictions will be generated for.
test_df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/malware-detection/test.csv",
)

In [None]:
# NOTE: this rebinds `X`, which earlier held the training features, to the
# test-set columns; re-running the train/test split cell after this one would
# no longer see the training features.
X = test_df.drop("SHA256", axis=1)



In [None]:
# Predict on the test features, then place the SHA256 column and the
# predictions side by side; ignore_index=True relabels the columns as 0 and 1.
predicted_labels = model.predict(X)
test_pred = pd.concat(
    [test_df['SHA256'], pd.DataFrame(predicted_labels)],
    axis=1,
    ignore_index=True,
)

In [None]:
# Name the two columns, then write the predictions without the positional
# index so the CSV holds exactly the 'SHA256' and 'pred' columns. Writing the
# index would add an unnamed leading column that shows up as 'Unnamed: 0' when
# the file is read back with pandas.
test_pred.columns = ['SHA256', 'pred']
test_pred.to_csv("test_result.csv", index=False)