In [1]:
import pandas as pd

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

### Data Loading and Preprocessing

In this cell, we load and preprocess data from several CSV files containing features of both benign and malicious packages. The goal is to combine these datasets for further analysis.

- **Benign Packages**:
  - We load two datasets of benign packages:
    1. **Popular Benign Packages**: These are commonly downloaded packages.
    2. **Target Benign Packages**: These are the legitimate packages that are targeted by the typosquatting packages in our dataset.

- **Malicious Packages**:
  - Three different datasets of malicious packages are loaded:
    1. **Backstabber Packages**: Known malicious packages provided by the backstabbers dataset [1]
    2. **MalOSS Packages**: A sample of 40 packages randomly selected, also marked as malicious [2]
    3. **Own Malicious Packages**: Malicious typosquatting packages we have collected from Sonatype, Snyk and Phylum.

- After loading, a new column `"Malicious"` is added with a value of `False` or `True` to indicate that these are benign or malicious. We also drop the `"modules_used"` column, as it has been converted into a boolean feature vector.

[1] https://dasfreak.github.io/Backstabbers-Knife-Collection/ last accessed 02.04.2024 <br>
[2] https://github.com/osssanitizer/maloss-samples last accessed 06.04.2024


In [2]:
def _load_features(csv_path, malicious, sample_size=None):
    """Read one feature CSV and label every row as benign or malicious.

    Args:
        csv_path: Location of the ``feature.csv`` file, relative to the
            notebook's working directory.
        malicious: Value stored in the new ``Malicious`` column for all rows.
        sample_size: If given, keep only this many randomly drawn rows
            (drawn with ``random_state=0`` so the selection is reproducible).

    Returns:
        A new DataFrame with the ``Malicious`` column added and the
        ``modules_used`` column removed.
    """
    frame = pd.read_csv(csv_path)
    if sample_size is not None:
        frame = frame.sample(sample_size, random_state=0)
    # Build a new frame instead of mutating in place, so re-running the cell
    # always starts from the freshly read CSV.
    return frame.drop(columns="modules_used").assign(Malicious=malicious)


# Paths use forward slashes: they resolve on Windows as well as on Linux/macOS,
# whereas the backslash form only resolves on Windows.
benign_popular = _load_features("./Packages/Benign Packages/Most Downloaded/feature.csv", malicious=False)
benign_target = _load_features("./Packages/Benign Packages/Target Packages/feature.csv", malicious=False)

malicious_backstabber = _load_features("./Packages/Malware Backstabbers/feature.csv", malicious=True)
# Only 40 MalOSS packages are kept.
malicious_maloss = _load_features("./Packages/Malware MalOSS/feature.csv", malicious=True, sample_size=40)
malicious_own = _load_features("./Packages/Malware Own/feature.csv", malicious=True)

print(f"Benign - Popular: {len(benign_popular)}")
print(f"Benign - Target: {len(benign_target)}")
print(f"Malicious - Backstabber: {len(malicious_backstabber)}")
print(f"Malicious - MalOSS: {len(malicious_maloss)}")
print(f"Malicious - Own: {len(malicious_own)}")

# 332 benign packages drawn from both benign sets; paired with the MalOSS
# sample when the MalOSS evaluation set is assembled.
benign_sample = pd.concat([benign_popular, benign_target], axis=0).sort_index(axis=1).sample(332, random_state=0)
benign_sample.head()

Benign - Popular: 980
Benign - Target: 232
Malicious - Backstabber: 1610
Malicious - MalOSS: 40
Malicious - Own: 394


Unnamed: 0,Malicious,axios,child_process,crypto,curl,dns,entry_through_script,eval,fs,has_bash_file,has_ip_or_address,https_or_http,node-fetch,node-serialize,os,package_name,path,querystring
198,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,cuid,False,False
29,False,False,False,False,False,False,False,False,True,False,True,False,False,False,True,chokidar,True,False
55,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,babel-plugin-lodash,True,False
743,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,react-virtualized,False,False
907,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,url-join,False,False


### Data Preparation for Model Training

This cell prepares multiple datasets by combining different subsets of data and splitting them into feature sets (`X`) and target labels (`Y`). These datasets will be used for training and evaluating machine learning models.

- **Dataset Creation**:
  - For each dataset, we concatenate benign and malicious samples.
  - Missing values are dropped to ensure data consistency.
  - Each dataset is then split into:
    - `X`: Contains the feature columns, excluding the `Malicious` and `package_name` columns.
    - `Y`: Contains the target labels (the `Malicious` column), indicating whether a package is benign or malicious.

- **Datasets**:
  1. **Internal Data**: Combines internal malicious and target benign samples.
  2. **Backstabber Data**: Combines backstabber malicious samples with popular benign packages.
  3. **MalOSS Data**: A combination of malicious MalOSS samples and a random sample of benign packages.
     
Each dataset is now ready for further analysis or model training, with clearly separated feature sets and target labels (`Malicious` column).


In [3]:
def _to_features_and_labels(parts):
    """Stack the given frames and split them into features and labels.

    The frames are concatenated row-wise, columns are ordered alphabetically,
    and rows containing missing values are discarded.

    Returns:
        A tuple ``(combined, X, Y)`` where ``X`` is ``combined`` without the
        ``Malicious`` and ``package_name`` columns and ``Y`` is the
        ``Malicious`` column.
    """
    combined = pd.concat(parts, axis=0).sort_index(axis=1).dropna()
    features = combined.drop(columns=["Malicious", "package_name"])
    labels = combined["Malicious"]
    return combined, features, labels


# Own typosquatting malware vs. the benign packages they target.
internal_data, internal_data_X, internal_data_Y = _to_features_and_labels(
    [malicious_own, benign_target]
)

# Backstabber malware vs. the most-downloaded benign packages.
backstabber_data, backstabber_data_X, backstabber_data_Y = _to_features_and_labels(
    [malicious_backstabber, benign_popular]
)

# MalOSS malware vs. the random benign sample.
# NOTE(review): benign_sample is drawn from benign_popular + benign_target, and
# benign_target is also part of internal_data, so this set may contain benign
# rows that the model is trained on -- confirm this overlap is intended.
maloss_data, maloss_data_X, maloss_data_Y = _to_features_and_labels(
    [malicious_maloss, benign_sample]
)

### Model Training with Random Forest Classifier

In this cell, we train a Random Forest Classifier on the internal dataset. The steps are as follows:

- **Model Initialization**:
  - A `RandomForestClassifier` is initialized with the following parameters:
    - `n_estimators=1500`: The model will use 1,500 decision trees.
    - `random_state=0`: Ensures reproducibility of results.
    - `oob_score=True`: Enables out-of-bag (OOB) score evaluation, which provides a validation score based on samples not included in the bootstrap sample.
    - `bootstrap=True`: Enables bootstrapping, where samples are drawn with replacement to build the trees.
    - `max_depth=15`: Limits the maximum depth of each tree to prevent overfitting.
    - `min_samples_split=7`: Specifies the minimum number of samples required to split a node.

- **Data Splitting**:
  - The dataset is split into training and testing sets:
    - `X_train`, `X_test`: The feature sets for training and testing.
    - `Y_train`, `Y_test`: The corresponding labels for training and testing.
  - We use an 80/20 split, with 80% of the data for training and 20% for testing (`test_size=0.20`), and `random_state=0` for reproducibility.

- **Model Training**:
  - The `RandomForestClassifier` is trained using the training data (`X_train`, `Y_train`).

The hyperparameters have been decided using grid search. The code for the grid search can be seen further down.


In [4]:
# Hold out 20% of the internal dataset for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    internal_data_X,
    internal_data_Y,
    test_size=0.20,
    random_state=0,
)

# Random forest with out-of-bag scoring enabled.
rfc = RandomForestClassifier(
    n_estimators=1500,
    max_depth=15,
    min_samples_split=7,
    bootstrap=True,
    oob_score=True,
    random_state=0,
)
rfc.fit(X_train, Y_train)

### Model Evaluation on Test Data

In this cell, we evaluate the performance of the trained Random Forest Classifier on the test data using several metrics:

- **Accuracy**:
  - We calculate the model’s accuracy on the test set using the `.score()` method, which provides the ratio of correct predictions to total predictions.

- **Predictions**:
  - The model's predictions (`y_pred`) on the test data are generated using the `.predict()` method.

- **Classification Report**:
  - A detailed classification report is printed using the `classification_report()` function. This report includes key metrics such as:
    - **Precision**: The proportion of positive identifications that are actually correct.
    - **Recall**: The proportion of actual positives that were correctly identified.
    - **F1-score**: The harmonic mean of precision and recall, giving a balanced measure of performance.
    - **Support**: The number of occurrences of each class in the test set.
    
  The metrics are displayed with five decimal places.


In [5]:
# Predict the held-out split of the internal dataset.
y_pred = rfc.predict(X_test)
# Score on the same split; stored in a variable but not printed in this cell.
accuracy = rfc.score(X_test, Y_test)

# Per-class precision / recall / F1 with five decimal places.
test_report = classification_report(Y_test, y_pred, digits=5)
print(test_report)

              precision    recall  f1-score   support

       False    0.97826   0.97826   0.97826        46
        True    0.98750   0.98750   0.98750        80

    accuracy                        0.98413       126
   macro avg    0.98288   0.98288   0.98288       126
weighted avg    0.98413   0.98413   0.98413       126



### Model Prediction on New Data (MalOSS Dataset)

In this cell, we use the trained Random Forest Classifier to make predictions on a new dataset (`maloss_data_X`) and evaluate its performance.
The number of malicious and benign packages has been chosen to increase the comparability to Sejfia and Schäfer.


In [6]:
# Score the forest on the MalOSS evaluation set without refitting it.
accuracy = rfc.score(maloss_data_X, maloss_data_Y)
y_pred = rfc.predict(maloss_data_X)

# Per-class precision / recall / F1 with five decimal places.
maloss_report = classification_report(maloss_data_Y, y_pred, digits=5)
print(maloss_report)

              precision    recall  f1-score   support

       False    0.96210   0.99398   0.97778       332
        True    0.93103   0.67500   0.78261        40

    accuracy                        0.95968       372
   macro avg    0.94657   0.83449   0.88019       372
weighted avg    0.95876   0.95968   0.95679       372



### Model Prediction on Backstabber Dataset

In this cell, we use the trained Random Forest Classifier to predict the target variable on the **Backstabber Dataset** (`backstabber_data_X`) and evaluate its performance.

In [7]:
# Score the forest on the Backstabber evaluation set without refitting it.
accuracy = rfc.score(backstabber_data_X, backstabber_data_Y)
y_pred = rfc.predict(backstabber_data_X)

# Per-class precision / recall / F1 with five decimal places.
backstabber_report = classification_report(backstabber_data_Y, y_pred, digits=5)
print(backstabber_report)

              precision    recall  f1-score   support

       False    0.85841   0.99081   0.91987       979
        True    0.99383   0.90062   0.94493      1610

    accuracy                        0.93472      2589
   macro avg    0.92612   0.94571   0.93240      2589
weighted avg    0.94262   0.93472   0.93545      2589



### Hyperparameter Tuning with GridSearchCV for Random Forest

In this cell, we use **GridSearchCV** to perform hyperparameter tuning for a Random Forest Classifier. The goal is to find the best combination of hyperparameters to optimize the model's performance.

- **Model Initialization**:
  - We initialize a `RandomForestClassifier` with the following default settings:
    - `random_state=0` for reproducibility.
    - `oob_score=True` to enable out-of-bag score estimation.
    - `bootstrap=True` for bootstrapping the data to train individual trees.

- **Hyperparameter Grid**:
  - A grid of hyperparameters is defined for tuning the model:
    - `n_estimators`: The number of trees in the forest. We try values of 1500, 2000, 2500, and 3000.
    - `max_depth`: The maximum depth of the trees, with values of 15, 20, 25, and 30.
    - `min_samples_split`: The minimum number of samples required to split an internal node, with values of 7, 10, 13, and 15.

- **GridSearchCV Setup**:
  - We create a `GridSearchCV` object with:
    - `estimator=rfc`: The Random Forest model.
    - `param_grid=param_grid`: The hyperparameter grid defined earlier.
    - `scoring='recall'`: The recall metric is used for evaluating model performance, focusing on minimizing false negatives.
    - `cv=5`: 5-fold cross-validation is used for model evaluation.

- **Model Fitting**:
  - We fit the grid search to the training data (`X_train` and `Y_train`), which runs the hyperparameter search and identifies the best model based on the recall score.

- **Best Hyperparameters**:
  - After the grid search, the best combination of hyperparameters is stored in `best_params`, and the best model is stored in `best_model`. The best hyperparameters are printed.

- **Model Evaluation**:
  - The best model is evaluated on the test data (`X_test`, `Y_test`), and the accuracy of the model is printed.


In [8]:
# Un-fitted template for the search. It gets its own name so that the forest
# already fitted under `rfc` is not replaced by an un-fitted estimator; the
# cells that evaluate `rfc` therefore stay re-runnable after this cell.
rfc_grid_base = RandomForestClassifier(random_state=0, oob_score=True, bootstrap=True)

# Hyperparameter grid to search through (4 x 4 x 4 = 64 combinations).
param_grid = {
    'n_estimators': [1500, 2000, 2500, 3000],  # Number of trees in the forest
    'max_depth': [15, 20, 25, 30],  # Maximum depth of the trees
    'min_samples_split': [7, 10, 13, 15],  # Minimum number of samples required to split an internal node
}

# 5-fold cross-validated search that ranks candidates by recall.
grid_search = GridSearchCV(estimator=rfc_grid_base, param_grid=param_grid, scoring='recall', cv=5)

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, Y_train)

# Best parameter combination and the corresponding estimator.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)

# Evaluate the best model on the held-out test set. Note that the search ranks
# by recall while this reports the estimator's `.score()` on the test split.
accuracy = best_model.score(X_test, Y_test)
print("Test Accuracy:", accuracy)

Best Hyperparameters: {'max_depth': 15, 'min_samples_split': 7, 'n_estimators': 1500}
Test Accuracy: 0.9841269841269841
