In [1]:
import pandas as pd

# Load the dataset
file_path = "feature_vectors_syscallsbinders_frequency_5_Cat.csv"
df = pd.read_csv(file_path)

# Basic dataset inspection
info = df.info()
head = df.head()
missing_values = df.isnull().sum()
duplicate_rows = df.duplicated().sum()

info, head, missing_values, duplicate_rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11598 entries, 0 to 11597
Columns: 471 entries, ACCESS_PERSONAL_INFO___ to Class
dtypes: int64(471)
memory usage: 41.7 MB


(None,
    ACCESS_PERSONAL_INFO___  ALTER_PHONE_STATE___  ANTI_DEBUG_____  \
 0                        1                     0                0   
 1                        3                     0                0   
 2                        2                     0                0   
 3                        1                     0                0   
 4                        3                     0                0   
 
    CREATE_FOLDER_____  CREATE_PROCESS`_____  CREATE_THREAD_____  \
 0                   3                     0                  14   
 1                   6                     0                  42   
 2                   4                     0                  23   
 3                   4                     0                  27   
 4                  11                     0                  18   
 
    DEVICE_ACCESS_____  EXECUTE_____  FS_ACCESS____  FS_ACCESS()____  ...  \
 0                   2             0              3                0  ...   
 1     

In [3]:
df_cleaned = df.drop_duplicates()


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
import joblib

# =============================================
# 1. Load and Preprocess Data
# =============================================
def load_and_preprocess_data(filepath):
    df = pd.read_csv(filepath)

    # Remove duplicates and missing values
    df = df.drop_duplicates().dropna()

    # Class labels (convert to 0-based indexing)
    y = df['Class'] - 1  # <- fix here: convert [1–5] to [0–4]
    X = df.drop(columns=['Class'])

    # Drop sparse features (>95% zeros)
    sparse_features = [col for col in X.columns if (X[col] == 0).mean() > 0.95]
    X = X.drop(columns=sparse_features)
    print(f"Removed {len(sparse_features)} sparse features")

    return X, y, sparse_features

# =============================================
# 2. Evaluate the Model
# =============================================
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))

# =============================================
# 3. Train Model
# =============================================
def train_xgboost_model(X, y):
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )

    print("Class distribution in y_train before SMOTE:", Counter(y_train))

    # SMOTE supports multi-class
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    print("Class distribution after SMOTE:", Counter(y_train))

    # XGBoost classifier
    xgb = XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(y)),
        eval_metric='mlogloss',
        random_state=42,
        use_label_encoder=False,
        n_jobs=-1
    )

    params = {
        'n_estimators': [100],
        'max_depth': [5],
        'learning_rate': [0.1]
    }

    grid = GridSearchCV(
        estimator=xgb,
        param_grid=params,
        cv=StratifiedKFold(n_splits=3),
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    print("Starting grid search...")
    grid.fit(X_train, y_train)

    best_model = grid.best_estimator_
    print("Best Params:", grid.best_params_)

    evaluate_model(best_model, X_test, y_test)

    return best_model, scaler

# =============================================
# 4. Risk Scorer Class
# =============================================
class RiskScorer:
    def __init__(self, model_path='xgb_risk_model.pkl', scaler_path='risk_scaler.pkl', sparse_path='sparse_features.pkl'):
        self.model = joblib.load(model_path)
        self.scaler = joblib.load(scaler_path)
        self.sparse_features = joblib.load(sparse_path)

    def predict_risk(self, features_dict):
        df = pd.DataFrame([features_dict])

        for col in self.sparse_features:
            df[col] = 0

        df.drop(columns=self.sparse_features, errors='ignore', inplace=True)
        scaled = self.scaler.transform(df)
        predicted_class = self.model.predict(scaled)[0]

        return predicted_class + 1  # back to original label (1–5)

# =============================================
# 5. Run Everything
# =============================================
if __name__ == "__main__":
    print("Loading and preprocessing data...")
    X, y, sparse_features = load_and_preprocess_data('feature_vectors_syscallsbinders_frequency_5_Cat.csv')

    print("\nTraining model...")
    model, scaler = train_xgboost_model(X, y)

    # Save model artifacts
    joblib.dump(model, 'xgb_risk_model.pkl')
    joblib.dump(scaler, 'risk_scaler.pkl')
    joblib.dump(sparse_features, 'sparse_features.pkl')
    print("\n✅ Model and scaler saved successfully!")

    # Sample test
    scorer = RiskScorer()
    sample_input = X.iloc[0].to_dict()
    risk_class = scorer.predict_risk(sample_input)
    print(f"\nPredicted Risk Class: {risk_class}")


Loading and preprocessing data...
Removed 295 sparse features

Training model...
Class distribution in y_train before SMOTE: Counter({2: 3122, 3: 2027, 1: 1635, 4: 1434, 0: 1002})
Class distribution after SMOTE: Counter({3: 3122, 4: 3122, 2: 3122, 1: 3122, 0: 3122})
Starting grid search...
Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.



Best Params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.93      0.90       251
           1       0.96      0.92      0.94       409
           2       0.97      0.98      0.98       781
           3       0.95      0.92      0.93       507
           4       0.93      0.93      0.93       358

    accuracy                           0.95      2306
   macro avg       0.93      0.94      0.94      2306
weighted avg       0.95      0.95      0.95      2306

Confusion Matrix:
 [[234   3   3   6   5]
 [  8 376   8   7  10]
 [  2   6 769   4   0]
 [ 20   4   4 467  12]
 [  7   3   5   9 334]]
Accuracy Score: 0.9453599306157849

✅ Model and scaler saved successfully!

Predicted Risk Class: 1


  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[col] = 0
  df[c

In [7]:
import pandas as pd


# Step 1: Load dataset and drop label column
df = pd.read_csv('feature_vectors_syscallsbinders_frequency_5_Cat.csv')
df = df.drop_duplicates().dropna()
X = df.drop(columns=['Class'])

# Step 2: Create a RiskScorer instance
scorer = RiskScorer()

# Step 3: Take one sample from actual data (e.g., first row)
sample_features = X.iloc[0].to_dict()

# Step 4: Predict
predicted_risk_class = scorer.predict_risk(sample_features)
print("Predicted Risk Class:", predicted_risk_class)


Predicted Risk Class: 1


In [29]:
import xgboost as xgb
import pandas as pd
import numpy as np

# X, y should be your APK feature matrix and target risk class
# Example: features = ['uses_camera', 'uses_sms', ...]
model = xgb.XGBClassifier()
model.fit(X, y)

# Save the trained model
model.save_model("risk_model.json")  # JSON is preferred for portability
print("✅ XGBoost model saved to risk_model.json")


✅ XGBoost model saved to risk_model.json
