In [None]:
#importing all the necessary and allowed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Introduction to the Problem
The goal of this model is to predict a system’s probability of getting infected by various families of malware, based on different properties of that system. The telemetry data containing these properties and the system infections was generated by threat reports collected by system's antivirus software.

# Data Loading
We will now load the dataset and inspect it to make sure that everything loaded correctly, e.g. that all the expected columns are present.

In [None]:
# Root directory of the competition data; every CSV below lives under it.
file_path = '/kaggle/input/System-Threat-Forecaster'

# Build each path from file_path so the data location is defined in one place.
train = pd.read_csv(f'{file_path}/train.csv')
test = pd.read_csv(f'{file_path}/test.csv')
sub = pd.read_csv(f'{file_path}/sample_submission.csv')

# Show the column index as a quick check that the training file loaded.
train.columns

# Exploratory Data Analysis

Feature types and number of missing values. We will impute the missing values when we build the preprocessing pipelines.

In [None]:
# Print the dtype and non-null count of every column in the training frame.
train.info()

In [None]:
# Percentage of missing entries per column, largest first.
train.isna().mean().mul(100).sort_values(ascending=False)

Since there are very few missing values, we will handle them by imputing rather than dropping, which lets us preserve rows.

# Feature Engineering

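Here:
- 'DateAS': malware signature dates
- 'DateOS': timestamps for OSVersion, which give the time that the OS was last updated

The new feature, `SystemAge`, is the number of days from the last OS update (`DateOS`) to the malware signature date (`DateAS`), i.e. the age of the system since it was updated.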

In [None]:
# Derive SystemAge (whole days from DateOS to DateAS) on both frames, then
# discard the two raw date columns so only the engineered feature remains.
for frame in (train, test):
    signature_date = pd.to_datetime(frame['DateAS'])
    os_date = pd.to_datetime(frame['DateOS'])
    frame['SystemAge'] = (signature_date - os_date).dt.days
    frame.drop(columns=['DateAS', 'DateOS'], inplace=True)


In [None]:
# Violin plot of the engineered SystemAge feature.
fig, ax = plt.subplots()
sns.violinplot(data=train, x='SystemAge', ax=ax)
ax.set_xlabel('SystemAge')
ax.set_title('Violin Plot of SystemAge')
plt.show()


Such a distribution indicates that popular chassis types probably received a new update at specific times quite close to the date that the data was collected.

Note that a negative age indicates that the malware signature date is earlier than the date the OS was updated, although we do not find many instances of this occurring (approximately 1.5%).

In [None]:
# Fraction of training rows whose SystemAge is negative. The comparison is made
# on the column itself: value_counts() holds frequencies, which are never
# negative, and .count() on that mask only counted the distinct values.
# Missing ages (if any) compare as False and remain in the denominator.
(train['SystemAge'] < 0).mean()

In [None]:
# Bar chart of how many training rows fall into each target class.
target_counts = train['target'].value_counts()
ax = target_counts.plot(kind='bar')
ax.set_title("Distribution of Target")
plt.show()


The dataset is fairly balanced, so it doesn't require SMOTE.

In [None]:
# Box plot of the processor core counts in the training data.
fig, ax = plt.subplots()
sns.boxplot(x=train['ProcessorCoreCount'], ax=ax)
ax.set_title("ProcessorCoreCount Boxplot")
plt.show()


Most machines have 2 or 4 cores (which is typical for consumer devices); quite a few have 12, and some have up to 60.

In [None]:
# Row counts per IsSystemProtected value, split by target class.
fig, ax = plt.subplots()
sns.countplot(data=train, x='IsSystemProtected', hue='target', ax=ax)
ax.set_title("IsSystemProtected vs Target")
plt.show()


Most machines have some sort of protection software installed. Unusually, protected systems have a slightly higher rate of getting infected.


In [None]:
# Row counts per HasOpticalDiskDrive value, split by target class.
fig, ax = plt.subplots()
sns.countplot(data=train, x='HasOpticalDiskDrive', hue='target', ax=ax)
ax.set_title("HasOpticalDiskDrive vs Target")
plt.show()


Most machines don't have an optical disk drive.

In [None]:
# Distribution of installed RAM for each target class.
fig, ax = plt.subplots()
sns.boxplot(data=train, x='target', y='TotalPhysicalRAMMB', ax=ax)
ax.set_title("TotalPhysicalRAMMB vs Target")
plt.show()


In [None]:
# Restrict the count plot to the ten most frequent engine versions.
top_versions = train['EngineVersion'].value_counts().nlargest(10).index
top_version_rows = train[train['EngineVersion'].isin(top_versions)]

fig, ax = plt.subplots()
sns.countplot(data=top_version_rows, x='EngineVersion', ax=ax)
ax.set_title("Top 10 EngineVersion")
plt.xticks(rotation=45)  # version strings are long; tilt them so they stay readable
plt.show()


Most of the engine versions are `1.1.15200.1` and `1.1.15100.1`.

In [None]:
# Restrict the count plot to the ten most frequent country IDs.
top_10_countries = train['CountryID'].value_counts().nlargest(10).index
top_country_rows = train[train['CountryID'].isin(top_10_countries)]

fig, ax = plt.subplots()
sns.countplot(data=top_country_rows, x='CountryID', ax=ax)
ax.set_title("Top 10 CountryID Distribution")
plt.show()


In [None]:
# Row count per chassis type, most frequent first.
train['ChassisType'].value_counts()
# Observation: most machines are Notebooks.

# Preprocessing

In [None]:
# Remove columns judged uninformative from both frames. Per the original note
# these hold a single unique value; MachineID is presumably a per-machine
# identifier rather than a constant column — TODO confirm.
cols_to_drop = ['IsBetaUser', 'AutoSampleSubmissionEnabled', 'IsFlightsDisabled', 'MachineID']
for frame in (train, test):
    frame.drop(columns=cols_to_drop, inplace=True)


In [None]:
# Checking that the columns were dropped: display the remaining column index.
train.columns

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
#replaces all categories but top n (based on frequency) to 'Other' 
class TopNEncoder(BaseEstimator, TransformerMixin):
    """Keep the ``n`` most frequent categories of one column and relabel the rest.

    Every value that is not among the ``n`` most frequent values seen during
    ``fit`` is replaced by the string 'Other'.

    Parameters
    ----------
    n : int
        Number of most frequent categories to keep.

    Attributes
    ----------
    top_n_ : pandas.Index or None
        Categories retained by ``fit``; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n):
        self.n = n
        self.top_n_ = None

    def fit(self, X, y=None):
        """Learn the ``n`` most frequent categories of the single input column.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        column = self._convert_to_series(X)
        self.top_n_ = column.value_counts().nlargest(self.n).index
        return self

    def transform(self, X):
        """Return an (n_samples, 1) array with non-retained categories set to 'Other'."""
        column = self._convert_to_series(X)
        retained = column.isin(self.top_n_)
        return column.where(retained, other='Other').values.reshape(-1, 1)

    def _convert_to_series(self, X):
        """Convert ``X`` to a one-dimensional pandas Series.

        Accepts a Series, a single-column DataFrame, or anything NumPy can turn
        into an array of shape (n_samples,) or (n_samples, 1), e.g. a list.

        Raises
        ------
        ValueError
            If ``X`` holds more than one column or is not one-dimensional
            after flattening a single-column 2-D input.
        """
        if isinstance(X, pd.Series):
            return X
        if isinstance(X, pd.DataFrame):
            if X.shape[1] != 1:
                raise ValueError("TopNEncoder expects a single column DataFrame.")
            return X.iloc[:, 0]  # the single column as a Series

        # Lists, tuples and ndarrays all go through NumPy so that unsupported
        # shapes fail here with a clear message instead of later in fit/transform.
        values = np.asarray(X)
        if values.ndim == 2 and values.shape[1] == 1:
            values = values.ravel()
        if values.ndim != 1:
            raise ValueError("TopNEncoder expects a single column of values.")
        return pd.Series(values)

# Pipeline for a single high-cardinality categorical column:
#   1. fill missing values with the most frequent value,
#   2. keep the 10 most frequent categories and map the rest to 'Other',
#   3. one-hot encode the result (first category dropped, unknown ones ignored).
top_n_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('top_n', TopNEncoder(n=10)),  # n is tunable, e.g. n=15
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

In [None]:
from sklearn.preprocessing import StandardScaler
# Numeric columns: median imputation followed by standardisation.
numeric_median_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Low-cardinality categorical columns: mode imputation, then one-hot encoding
# with the first level dropped and unseen levels ignored at transform time.
cat_mode_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Binary columns use the same two steps as cat_mode_pipeline; it stays a
# separate object so the two groups can be tuned independently.
binary_cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])


In [None]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each category of one column by its relative frequency seen in ``fit``.

    Attributes
    ----------
    freq_map_ : dict
        Maps each category seen during ``fit`` to ``count / n_samples``;
        empty until ``fit`` has been called.
    """

    def __init__(self):
        self.freq_map_ = {}

    def fit(self, X, y=None):
        """Learn the relative frequency of every category in the input column.

        ``y`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        column = self._convert_to_series(X)
        # Divide by the full length so the frequencies are fractions of all rows.
        self.freq_map_ = (column.value_counts() / len(column)).to_dict()
        return self

    def transform(self, X):
        """Return an (n_samples, 1) array of frequencies; unseen categories get 0."""
        column = self._convert_to_series(X)
        return column.map(self.freq_map_).fillna(0).values.reshape(-1, 1)

    def _convert_to_series(self, X):
        """Convert ``X`` to a one-dimensional pandas Series.

        Accepts a Series, a single-column DataFrame, or anything NumPy can turn
        into an array of shape (n_samples,) or (n_samples, 1), e.g. a list.

        Raises
        ------
        ValueError
            If ``X`` holds more than one column or is not one-dimensional
            after flattening a single-column 2-D input.
        """
        if isinstance(X, pd.Series):
            return X
        if isinstance(X, pd.DataFrame):
            if X.shape[1] != 1:
                raise ValueError("FrequencyEncoder expects a single column.")
            return X.iloc[:, 0]

        # Lists, tuples and ndarrays all go through NumPy so that unsupported
        # shapes fail here with a clear message instead of later in fit/transform.
        values = np.asarray(X)
        if values.ndim == 2 and values.shape[1] == 1:
            values = values.ravel()
        if values.ndim != 1:
            raise ValueError("FrequencyEncoder expects a single column.")
        return pd.Series(values)

# Pipeline for a single ID-like column: missing values become the constant
# 'Unknown', then every category is replaced by its training-set frequency.
id_freq_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('freq_encoder', FrequencyEncoder())
])


In [None]:
def _as_str_keep_missing(series):
    """Cast the non-missing values of ``series`` to str and leave missing ones as NaN.

    A plain ``astype(str)`` turns NaN into the literal string 'nan', which the
    imputers in the downstream pipelines would then treat as a real category.
    """
    return series.astype(str).where(series.notna())


# Per-column (name, pipeline, [column]) entries, filled in below and appended to
# the ColumnTransformer after the group-level pipelines.
transformers = []

# Numeric columns: median imputation + scaling.
numeric_cols_median = [
    'NumAntivirusProductsInstalled',
    'SystemAge',
    'OSBuildRevisionOnly',
    'PrimaryDisplayResolutionVertical',
    'SystemVolumeCapacityMB',
    'PrimaryDisplayResolutionHorizontal',
    'OSBuildNumber',
    'PrimaryDisplayDiagonalInches',
    'OSBuildNumberOnly',
    'PrimaryDiskCapacityMB',
    'ProcessorCoreCount',
    'TotalPhysicalRAMMB',
    'InternalBatteryNumberOfCharges',
    'NumAntivirusProductsEnabled',
]

# Placeholder: no columns are treated as ordinal at the moment.
ordinal = [


]

# Two-valued columns: mode imputation + one-hot encoding.
binary_cols = [
    'IsAlwaysOnAlwaysConnectedCapable',
    'IsTouchEnabled',
    'IsPassiveModeEnabled',
    'IsVirtualDevice',
    'IsPenCapable',
    'SMode',
    'IsSecureBootEnabled',
    'IsPortableOS',
    'HasTpm',
    'FirewallEnabled',
    'HasOpticalDiskDrive',
    'IsGamer',
    'IsSystemProtected',
    'DeviceFamily',
    'ProductName'  #only has two values
]


# ID-like columns: frequency encoded, one pipeline entry per column.
id_type = [
    'FirmwareManufacturerID',
    'OEMNameID',
    'OEMModelID',
    'FirmwareVersionID',
    'LocaleEnglishNameID',
    'IEVersionID',
    'ProcessorModelID',
    'AntivirusConfigID',
    'OSInstallLanguageID',
    'OSUILocaleID'
]

# Treat the IDs as categorical labels. Missing values are kept as NaN so that
# the 'Unknown' imputer in id_freq_pipeline actually has something to fill.
# NOTE(review): if a column is float in one frame and int in the other, str()
# gives '12.0' vs '12' and the categories will not match — confirm the dtypes.
for col in id_type:
    train[col] = _as_str_keep_missing(train[col])
    test[col] = _as_str_keep_missing(test[col])
for col in id_type:
    transformers.append(
        (f"id_freq_{col}", id_freq_pipeline, [col])
    )


# Low-cardinality categorical columns: mode imputation + one-hot encoding.
cat_mode = [
    'OSProductSuite',
    'EnableLUA', # 3 unique values only
    'ProcessorManufacturerID', #has very few (4) unique values
    'RealTimeProtectionState',
    'FlightRing',
    'OSVersion',
    'OSEdition',
    'OSBranch',
    'LicenseActivationChannel',
    'OsPlatformSubRelease',
    'PowerPlatformRole',
    'Processor',
    'AutoUpdateOptionsName',
    'PrimaryDiskType',
    'OSSkuFriendlyName',
    'OSArchitecture',
    'OSInstallType',
    'MDC2FormFactor',
]

# High-cardinality columns where very few rows fall outside the top-n
# categories; everything beyond the top n is grouped into 'Other'.
top_n = [
    'CityID',
    'NumericOSVersion',
    'SignatureVersion',
    'GeoRegionID',
    'OSBuildLab',
    'EngineVersion',
    'OSGenuineState',
    'CountryID',
    'PlatformType',
    'SKUEditionName',
    'AppVersion',
    'ChassisType'
]

# Convert the top-n columns to strings, again keeping missing values as NaN so
# the most-frequent imputer in top_n_pipeline handles them.
for col in top_n:
    train[col] = _as_str_keep_missing(train[col])
    test[col] = _as_str_keep_missing(test[col])

# Add the top-n encoding pipeline for each column.
for col in top_n:
    transformers.append(
        (f"top_n_{col}", top_n_pipeline, [col])
    )


In [None]:
# Row count for each ProcessorManufacturerID value, to inspect its cardinality.
train['ProcessorManufacturerID'].value_counts()

In [None]:
from sklearn.compose import ColumnTransformer

# Pipelines applied to whole column groups. The per-column frequency and top-n
# entries collected in `transformers` are appended after them.
group_transformers = [
    ('num_median', numeric_median_pipeline, numeric_cols_median),
    ('binary_cat', binary_cat_pipeline, binary_cols),
    ('cat_mode', cat_mode_pipeline, cat_mode),
]

preprocessor = ColumnTransformer(
    transformers=group_transformers + transformers,
    remainder='drop',  # columns not named in any entry are discarded
)


## Fitting preprocessor and verifying

In [None]:
# Split the label off the training frame.
X_train = train.drop(columns=['target'])
y_train = train['target']

# The test frame may or may not carry a 'target' column; errors='ignore'
# makes the drop a no-op when it is absent.
X_test = test.drop(columns=['target'], errors='ignore')

# Learn the preprocessing on the training features only, then apply the
# fitted preprocessor unchanged to the test features.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


The warning is just a heads-up that new categories are being encountered at inference time; this is usually okay to leave as is. We can check the size of the new dataset just to be sure, though.

In [None]:
# Compare the raw training frame's shape with the preprocessed matrix's shape.
print(train.shape, X_train_processed.shape)

We have learnt how one-hot encoders work and how they blow up the size of the data, so this is expected. Let's verify this with a calculation.

In [None]:
# Recount the output width: run every fitted transformer on its own columns
# and compare the sum of the widths with the width of the combined matrix.
total_cols = 0
for name, trans, cols in preprocessor.transformers_:
    # Only entries that are real transformers with at least one column count;
    # dropped groups (e.g. the remainder) and empty groups are skipped.
    if trans != 'drop' and len(cols) != 0:
        # The transformer is already fitted, so transform() alone is enough.
        n_cols = trans.transform(X_train[cols]).shape[1]
        total_cols += n_cols
        print(f"Transformer '{name}' on columns {cols} -> {n_cols} columns")

print(f"\nSum of columns from all transformers: {total_cols}")
print(f"Columns in final output: {X_train_processed.shape[1]}")


Since the expected number of columns matches the result, we can now move forward.

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the preprocessed training matrix (not the raw X_train) for
# validation; the fixed seed keeps the split reproducible.
X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
    X_train_processed, y_train, test_size=0.2, random_state=1
)

Looking for null values

In [None]:
# Build a frame pandas can inspect column by column. If the preprocessor
# returned a SciPy sparse matrix, plain pd.DataFrame(...) is not guaranteed to
# expand it into numeric columns, which would make the NaN check vacuous;
# from_spmatrix does expand it. Dense arrays take the ordinary constructor.
if hasattr(X_train_part, "tocsr"):
    df_part = pd.DataFrame.sparse.from_spmatrix(X_train_part)
else:
    df_part = pd.DataFrame(X_train_part)

print(df_part.isnull().sum())  # NaN count in each column
print("Rows with NaN:", df_part.isnull().any(axis=1).sum())


# Model Training

## Random Forest
Since we have tabular data, we will start off with a Random Forest classifier.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

# Baseline random forest with default hyperparameters and a fixed seed.
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train_part, y_train_part)
y_pred = clf.predict(X_val_part)

# Validation metrics.
print("Accuracy:", accuracy_score(y_val_part, y_pred))
print(classification_report(y_val_part, y_pred))

# Confusion matrix as an annotated heatmap.
rf_confusion = confusion_matrix(y_val_part, y_pred)
ax = sns.heatmap(rf_confusion, annot=True, cmap='Blues', fmt='g')
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Baseline logistic regression; max_iter is raised to 2000 iterations.
lr_clf = LogisticRegression(random_state=1, max_iter=2000)
lr_clf.fit(X_train_part, y_train_part)

y_pred_lr = lr_clf.predict(X_val_part)
print("Logistic Regression Accuracy:", accuracy_score(y_val_part, y_pred_lr))

# Confusion matrix as an annotated heatmap on a 6x4 inch figure.
fig, ax = plt.subplots(figsize=(6, 4))
lr_confusion = confusion_matrix(y_val_part, y_pred_lr)
sns.heatmap(lr_confusion, annot=True, cmap='Blues', fmt='g', ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

###  Hyperparametertuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space. lbfgs does not support the l1 penalty, so only the liblinear
# and saga solvers are tried.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
}

# Estimator whose penalty, C and solver are left for the search to set.
lr_base = LogisticRegression(max_iter=2000, random_state=1)

# 5-fold cross-validated grid search on all CPU cores, scored by accuracy
# because that is the competition metric.
grid_search = GridSearchCV(
    estimator=lr_base,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_part, y_train_part)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Score the refitted best estimator on the held-out validation split.
best_lr = grid_search.best_estimator_
y_pred = best_lr.predict(X_val_part)

print("\nValidation Accuracy:", accuracy_score(y_val_part, y_pred))
print(classification_report(y_val_part, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val_part, y_pred))


# LightGBM

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# LightGBM classifier with explicitly spelled-out hyperparameters.
lgb_params = dict(
    learning_rate=0.1,
    max_depth=-1,
    n_estimators=100,
    num_leaves=31,
    random_state=42,
)
lgb_best = lgb.LGBMClassifier(**lgb_params)

# Train on the training split and predict the validation split.
lgb_best.fit(X_train_part, y_train_part)
y_pred_val = lgb_best.predict(X_val_part)

# Validation metrics.
val_acc = accuracy_score(y_val_part, y_pred_val)
print("Validation Accuracy (LightGBM best params):", val_acc)
print("Classification Report:")
print(classification_report(y_val_part, y_pred_val))
print("Confusion Matrix:")
print(confusion_matrix(y_val_part, y_pred_val))


# XGBoost

In [None]:
import xgboost as xgb

# XGBoost classifier evaluated with log-loss during training.
xgb_clf = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb_clf.fit(X_train_part, y_train_part)

# Predictions for the validation split.
y_pred_xgb = xgb_clf.predict(X_val_part)

# Validation metrics.
print("XGBoost Accuracy:", accuracy_score(y_val_part, y_pred_xgb))
print("Classification Report:")
print(classification_report(y_val_part, y_pred_xgb))

# Confusion matrix, first as text and then as a blue-shaded plot.
cm = confusion_matrix(y_val_part, y_pred_xgb)
print("Confusion Matrix:")
print(cm)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix for XGBoost")
plt.show()


### Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

# Base learners for the stack: one XGBoost and one LightGBM model.
xgb_clf = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
lgb_clf = lgb.LGBMClassifier(random_state=42)
base_learners = [('xgb', xgb_clf), ('lgb', lgb_clf)]

# A logistic regression combines the base learners' predictions, which are
# produced with 3-fold cross-validation.
meta_learner = LogisticRegression(max_iter=1000, random_state=42)
stack_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
)

# Train on the training split and predict the validation split.
stack_clf.fit(X_train_part, y_train_part)
y_pred_stack = stack_clf.predict(X_val_part)

# Validation metrics.
acc_stack = accuracy_score(y_val_part, y_pred_stack)
print("Stacking (XGB + LGB) Accuracy:", acc_stack)
print("Classification Report (Stacking):")
print(classification_report(y_val_part, y_pred_stack))

print("Confusion Matrix (Stacking):")
print(confusion_matrix(y_val_part, y_pred_stack))

# Hyperparameter tuning best Model

In [None]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# Starting point for the search: the hyperparameters of the earlier LightGBM run.
default_params = {
    'learning_rate': 0.1,
    'n_estimators': 100,
    'num_leaves': 31,
    'max_depth': -1
}
lgb_clf = lgb.LGBMClassifier(**default_params)

# Every list contains the corresponding default value, so the baseline
# configuration is one of the candidates.
param_grid = {
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500],
    'num_leaves': [31, 40, 50],
    'max_depth': [-1, 10, 20, 30]
}

# 5-fold cross-validated grid search, scored by accuracy, on all CPU cores.
grid_search = GridSearchCV(
    estimator=lgb_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# The search runs on the complete preprocessed training set.
grid_search.fit(X_train_processed, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV accuracy:", grid_search.best_score_)


# Submission

LightGBM implements gradient-boosted decision trees, which are known to [excel on structured (tabular) data](https://medium.com/geekculture/why-tree-based-models-beat-deep-learning-on-tabular-data-fcad692b1456). Hence the final model comes as no surprise.

I would like to extend my gratitude to the mentors of MLP Project (Jan term 2025) and the mentors of MLP theory course, MLF theory course and TDS at IITM (BSc Data Science)

In [None]:
# Apply the already-fitted preprocessor to the raw test frame.
X_test_processed = preprocessor.transform(test)

# GridSearchCV refits the best configuration on all the data passed to fit()
# (refit=True is the default), and the tuning cell fitted it on
# X_train_processed / y_train. The finished search is therefore reused here
# instead of repeating the entire grid search on the same data.
best_model = grid_search.best_estimator_

test_preds = best_model.predict(X_test_processed)

# NOTE(review): assumes the submission id is the row position in test.csv —
# confirm against sample_submission.csv (loaded earlier as `sub`).
submission = pd.DataFrame({
    'id': test.index,
    'target': test_preds
})

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")
print(submission.head())