In [None]:
# Author: Xavier Tidus Hutchinson
# University: UNSW Australian Defence Force Academy, Canberra ACT, AU.
# Licence: My work is free for all to use. Other licences may apply.
#
# Special Thanks & Recognition: 
# This notebook leverages the fantastic work and research of UNB and requires their published
# CICMalDroid2020 datasets to function.
#
# You can request access to your own copies by visiting their website here: https://www.unb.ca/cic/datasets/maldroid-2020.html
#
# Please credit the authors in your work and maintain references.
#
#
# Be careful, all sense of safety is an illusion.
# It's a wild world.

In [None]:
import os
import re
import glob
import numpy as np
import matplotlib.pyplot as plt
import shutil
import random
import subprocess
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer

# Directory layout and sampling switches used by every cell below.
# Everything lives under ~/tmp: source APKs are read from "source-apks" and all
# generated artefacts are written under "ngram-working".
dir_working_path_root = os.path.expanduser("~/tmp")
dir_source_apk_path_root = os.path.join(dir_working_path_root, "source-apks")
dir_apk_working_path_root = os.path.join(dir_working_path_root, "ngram-working")

# Malicious side: extracted source APKs, the sampled working copies, and the
# per-APK decompiled output folders.
dir_malicious_source_apks = os.path.join(dir_source_apk_path_root, "malware_extracted")
dir_malicious_working_apks = os.path.join(dir_apk_working_path_root, "working", "malicious", "apks")
dir_malicious_working_decompiled = os.path.join(dir_apk_working_path_root, "working", "malicious", "decompiled")

# Benign side: same three roles as above.
dir_benign_source_apks = os.path.join(dir_source_apk_path_root, "benign_extracted")
dir_benign_working_apks = os.path.join(dir_apk_working_path_root, "working", "benign", "apks")
dir_benign_working_decompiled = os.path.join(dir_apk_working_path_root, "working", "benign", "decompiled")

# One source sub-directory per malware category.
# Note: the "sms2" category reads from the folder named "sms".
dir_malicious_source_apks_adware = os.path.join(dir_malicious_source_apks, "adware")
dir_malicious_source_apks_banking = os.path.join(dir_malicious_source_apks, "banking")
dir_malicious_source_apks_riskware = os.path.join(dir_malicious_source_apks, "riskware")
dir_malicious_source_apks_sms2 = os.path.join(dir_malicious_source_apks, "sms")

# Category switches: set one to False to leave that malware category out of the
# weighted sample (and out of the file-count report).
include_banking_in_sample = True
include_adware_in_sample = True
include_riskware_in_sample = True
include_sms2_in_sample = True

def count_files_in_dir(directory):
    """
    Count the regular files located directly inside a directory.

    Parameters:
    - directory (str): Directory to inspect. Sub-directories are neither
      counted nor descended into.

    Returns:
    - int: Number of files found, or 0 when the path does not exist.
    """
    if os.path.exists(directory):
        entries = os.listdir(directory)
        return sum(1 for entry in entries if os.path.isfile(os.path.join(directory, entry)))
    return 0

def get_weighted_malware_samples(target_sample_size):
    """
    Split a malware sample budget across the enabled malware categories.

    Each enabled category receives a share proportional to the number of files
    in its source directory, capped by the files it actually has and by the
    budget that is still unassigned. Any budget left over after rounding is
    handed out one sample at a time to categories that still have unused files.

    Parameters:
    - target_sample_size (int): Total number of malware samples wanted.

    Returns:
    - dict: Category name ('adware', 'banking', 'riskware', 'sms2') mapped to
      its allocated sample count. Empty when no enabled category has files.
    """
    # (enabled flag, category key, source directory), in allocation order.
    candidates = (
        (include_adware_in_sample, 'adware', dir_malicious_source_apks_adware),
        (include_banking_in_sample, 'banking', dir_malicious_source_apks_banking),
        (include_riskware_in_sample, 'riskware', dir_malicious_source_apks_riskware),
        (include_sms2_in_sample, 'sms2', dir_malicious_source_apks_sms2),
    )
    counts = {key: count_files_in_dir(path) for enabled, key, path in candidates if enabled}

    total_files = sum(counts.values())
    if total_files == 0:
        print("No files found in any selected malware directories.")
        return {}

    # First pass: proportional share, never more than the category holds and
    # never more than what is left of the budget.
    weighted_samples = {}
    remaining_samples = target_sample_size
    for category, available in counts.items():
        quota = 0
        if available != 0:
            proportion = available / total_files
            quota = min(int(round(proportion * target_sample_size)), available, remaining_samples)
        weighted_samples[category] = quota
        remaining_samples -= quota

    def has_spare(category):
        # True while the category still has files that were not allocated.
        return counts[category] > weighted_samples[category]

    # Second pass: round-robin the rounding shortfall over categories with spare files.
    while remaining_samples > 0 and any(has_spare(category) for category in counts):
        for category in counts:
            if remaining_samples == 0:
                break
            if has_spare(category):
                weighted_samples[category] += 1
                remaining_samples -= 1

    return weighted_samples

def get_files_in_dir(directory):
    """
    List the regular files located directly inside a directory.

    Parameters:
    - directory (str): Directory to list; it must exist.

    Returns:
    - list[str]: File names (not full paths), in the order os.listdir yields them.
    """
    names = []
    for entry in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, entry)):
            names.append(entry)
    return names

def copy_random_sample(source_dir, dest_dir, sample_size, dir_name):
    """
    Copy a random selection of files from one directory into another.

    Parameters:
    - source_dir (str): Directory to draw files from.
    - dest_dir (str): Directory to copy into; created if it does not exist.
    - sample_size (int): Number of files wanted. Reduced to the number of
      available files when it exceeds them.
    - dir_name (str): Human-readable label used in the progress messages.

    Returns:
    - None: Prints a message and returns early when the source directory is
      missing or holds no files.

    Raises:
    - ValueError: If sample_size is negative.
    """
    if not os.path.exists(source_dir):
        print(f"Source directory {source_dir} does not exist!")
        return

    # os.listdir order is arbitrary; sorting makes the draw repeatable whenever
    # the caller has seeded the `random` module.
    files = sorted(get_files_in_dir(source_dir))
    total_files = len(files)

    if total_files == 0:
        print(f"No files found in {source_dir}")
        return

    if sample_size < 0:
        raise ValueError(f"Sample size for {dir_name} cannot be negative (got {sample_size}).")

    if sample_size > total_files:
        print(f"Requested sample size ({sample_size}) exceeds available files ({total_files}) in {dir_name}. Using all available files.")
        sample_size = total_files

    # Randomly select files
    selected_files = random.sample(files, sample_size)

    # Make sure the destination exists so the first copy cannot fail half-way
    # through a run with a missing-directory error.
    os.makedirs(dest_dir, exist_ok=True)

    # Copy selected files to destination (copy2 also preserves file metadata)
    for file_name in selected_files:
        src_path = os.path.join(source_dir, file_name)
        dst_path = os.path.join(dest_dir, file_name)
        shutil.copy2(src_path, dst_path)
    print(f"Copied {len(selected_files)} files from {dir_name} to {dest_dir}")

def extract_apk(apk_path, output_dir):
    """
    Decompile a single APK by running `apktool d <apk_path> -o <output_dir>`.

    Failures are reported on stdout rather than raised, so one bad APK does
    not stop a batch loop.

    Parameters:
    - apk_path (str or Path): APK file to decompile.
    - output_dir (str or Path): Directory passed to apktool's -o option.

    Returns:
    - bool: True when apktool exited successfully, False when it failed or
      when the apktool executable could not be found.
    """
    # Build apktool command; str() keeps the argument list plain strings even
    # when the caller hands in pathlib.Path objects.
    command = ["apktool", "d", str(apk_path), "-o", str(output_dir)]

    try:
        # check=True turns a non-zero exit status into CalledProcessError
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Failed to decompile {apk_path}:\n{e.stderr}")
        return False
    except FileNotFoundError:
        print("apktool not found. Ensure it is installed and in PATH.")
        return False

    print(f"Successfully decompiled {apk_path}:\n{result.stdout}")
    if result.stderr:
        print(f"Warnings for {apk_path}:\n{result.stderr}")
    return True

def extract_smali(input_dir, output_file):
    """
    Extract Smali code from decompiled .apk files into a single text file.

    Every *.smali file found below a directory whose name starts with "smali"
    (so "smali" as well as multi-dex folders such as "smali_classes2") is
    appended to the output, each followed by a newline. Files are written in
    sorted path order so repeated runs produce the same output.

    Parameters:
    - input_dir (str): Directory containing decompiled .apk folders.
    - output_file (str): Path to save the concatenated Smali code.

    Returns:
    - None: Saves Smali code to output_file.

    Raises:
    - FileNotFoundError: If input_dir does not exist or holds no .smali files.
      In both cases output_file is left untouched.
    """
    if not os.path.exists(input_dir):
        raise FileNotFoundError(f"Directory {input_dir} does not exist.")

    # Escape the root so glob metacharacters in the directory name are taken literally.
    pattern = os.path.join(glob.escape(os.fspath(input_dir)), "**", "smali*", "**", "*.smali")
    # A path nested under two matching directories can be yielded more than
    # once by a pattern with two "**" parts; set() removes the repeats.
    smali_files = sorted(set(glob.glob(pattern, recursive=True)))
    if not smali_files:
        raise FileNotFoundError(f"No .smali files found in {input_dir}")

    # The output is opened only after the search succeeded, so a failed run
    # never leaves an empty file behind.
    with open(output_file, 'w', encoding='utf-8', errors='ignore') as outfile:
        for smali_file in smali_files:
            with open(smali_file, 'r', encoding='utf-8', errors='ignore') as infile:
                outfile.write(infile.read())
            outfile.write('\n')

    print(f"Smali code extracted to {output_file}")

def generate_ngram_data(benign_smali_file, malicious_smali_file, output_npz='ngram_data.npz', ngram_size=3):
    """
    Count word n-grams in the benign and malicious Smali dumps and store them.

    Both text files are cleaned, vectorised together with CountVectorizer so
    they share one vocabulary, and the per-class counts are written to a .npz
    archive with the keys 'ngrams', 'benign' and 'malicious'.

    Parameters:
    - benign_smali_file (str): Path to benign Smali code text file.
    - malicious_smali_file (str): Path to malicious Smali code text file.
    - output_npz (str): Path to save .npz file.
    - ngram_size (int): Size of n-grams (default: 3).

    Returns:
    - None: Saves n-gram data to output_npz.

    Raises:
    - FileNotFoundError: If either input file does not exist.
    """
    def clean_text(text):
        # Anything outside printable ASCII becomes a space, then whitespace
        # runs are collapsed to a single space.
        printable_only = re.sub(r'[^\x20-\x7E]', ' ', text)
        return re.sub(r'\s+', ' ', printable_only).strip()

    sources = (benign_smali_file, malicious_smali_file)

    # Verify both inputs up front, benign first.
    for source in sources:
        if not os.path.exists(source):
            raise FileNotFoundError(f"File {source} does not exist.")

    # One cleaned document per class: index 0 is benign, index 1 is malicious.
    documents = []
    for source in sources:
        with open(source, 'r', encoding='utf-8', errors='ignore') as f:
            documents.append(clean_text(f.read()))

    # Tokens may contain letters, digits, underscore, hyphen and dot.
    vectorizer = CountVectorizer(
        analyzer='word',
        ngram_range=(ngram_size, ngram_size),
        token_pattern=r'[a-zA-Z0-9_\-\.]+',
    )
    count_matrix = vectorizer.fit_transform(documents)

    ngrams = vectorizer.get_feature_names_out()
    benign_counts, malicious_counts = (
        count_matrix[row].toarray().flatten() for row in (0, 1)
    )

    # Debug output: vocabulary size and how many n-grams occur at all.
    combined_counts = benign_counts + malicious_counts
    print(f"Total unique n-grams: {len(ngrams)}")
    print(f"Non-zero counts: {np.sum(combined_counts > 0)}")

    np.savez(output_npz, ngrams=ngrams, benign=benign_counts, malicious=malicious_counts)
    print(f"N-gram data saved to {output_npz}")

def plot_ngram_frequency(data_path, output_path='ngram_frequency.png', top_n=50,
                        benign_color='#1f77b4', malicious_color='#ff7f0e', dpi=300,
                        ngram_size=None):
    """
    Plot N-Gram Frequency Distribution for benign vs. malicious Android apps.

    The n-grams are ranked by their combined (benign + malicious) count and the
    highest-ranked ones are drawn as paired bars. When the data holds fewer
    than top_n n-grams, all of them are plotted. The title and caption report
    the number of n-grams actually shown.

    Parameters:
    - data_path (str): Path to .npz file containing n-gram data under the keys
      'ngrams', 'benign' and 'malicious'.
    - output_path (str): Path to save the plot.
    - top_n (int): Maximum number of top n-grams to display.
    - benign_color (str): Color for benign bars.
    - malicious_color (str): Color for malicious bars.
    - dpi (int): Resolution of the saved image.
    - ngram_size (int or None): n-gram length used in the title and caption.
      When None it is taken from the number of space-separated tokens in the
      top-ranked n-gram.

    Returns:
    - None: Saves and displays the plot. Returns without plotting (after a
      warning) when there are no n-grams or every count is zero.

    Raises:
    - ValueError: If top_n is not positive.
    """
    if top_n <= 0:
        raise ValueError(f"top_n must be a positive integer (got {top_n}).")

    # SECURITY NOTE: allow_pickle=True lets a crafted file execute code while
    # loading. Only open .npz files produced by this notebook.
    # The context manager closes the underlying archive once the arrays are read.
    with np.load(data_path, allow_pickle=True) as data:
        ngrams = data['ngrams']
        benign_counts = data['benign']
        malicious_counts = data['malicious']

    # Nothing meaningful can be drawn without n-grams or with all-zero counts.
    total_counts = benign_counts + malicious_counts
    if len(ngrams) == 0 or np.all(total_counts == 0):
        print(f"Warning: Insufficient data. Found {len(ngrams)} n-grams, first total counts are {total_counts[:10]}")
        return
    if len(ngrams) < top_n:
        print(f"Note: only {len(ngrams)} n-grams available; plotting all of them instead of the top {top_n}.")

    # Sort by total frequency (descending) and keep at most top_n entries
    sort_indices = np.argsort(total_counts)[::-1][:top_n]
    ngrams = ngrams[sort_indices]
    benign_counts = benign_counts[sort_indices]
    malicious_counts = malicious_counts[sort_indices]

    shown = len(ngrams)
    if ngram_size is None:
        # assumes n-grams are space-joined tokens, as CountVectorizer word
        # n-grams are -- pass ngram_size explicitly for other data
        ngram_size = len(str(ngrams[0]).split())

    # Plot settings: two bars per n-gram, centred on the tick
    x = np.arange(shown)
    width = 0.35

    # Create figure
    fig, ax = plt.subplots(figsize=(15, 9))
    ax.bar(x - width/2, benign_counts, width, label='Benign', color=benign_color)
    ax.bar(x + width/2, malicious_counts, width, label='Malicious', color=malicious_color)

    # Customize axes
    ax.set_xlabel('N-Grams')
    ax.set_ylabel('Frequency')
    ax.set_title(f'Top {shown} {ngram_size}-Gram Frequency Distribution in Benign vs. Malicious Apps')
    ax.set_xticks(x)
    ax.set_xticklabels(ngrams, rotation=90)
    ax.legend()

    # Adjust y-axis limits to fit data with 10% headroom above the tallest bar
    ax.set_ylim(bottom=0, top=max(benign_counts.max(), malicious_counts.max()) * 1.1)

    # Add description at the bottom
    description = (
        "Xavier Hutchinson (z5626926) ZEIT8025 Research Project 2025 S1. Generated using Matplotlib 3.5.1 "
        f"in Python on macOS. Decompiled .apk files were processed to extract {ngram_size}-grams using CountVectorizer. "
        f"Frequencies of top {shown} {ngram_size}-grams in benign vs. malicious apps were plotted as a bar chart, highlighting "
        "distinct patterns (e.g., higher network call n-grams in malicious apps)."
    )
    plt.figtext(0.5, -0.1, description, wrap=True, horizontalalignment='center', fontsize=10)

    # Adjust layout to prevent clipping
    plt.tight_layout()

    # Save and display; bbox_inches='tight' keeps the caption below the axes in the image
    plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
    plt.show()
    print(f"Plot saved to {output_path}")

In [None]:
# Count the files available in each source directory (0 when a directory is missing).
fc_benign = count_files_in_dir(dir_benign_source_apks)
fc_mal_adware, fc_mal_banking, fc_mal_riskware, fc_mal_sms2 = map(
    count_files_in_dir,
    (
        dir_malicious_source_apks_adware,
        dir_malicious_source_apks_banking,
        dir_malicious_source_apks_riskware,
        dir_malicious_source_apks_sms2,
    ),
)

print(f"Number of files in benign source: {fc_benign}")

# Report only the malware categories that are switched on in the configuration.
for _enabled, _message in (
    (include_adware_in_sample, f"Number of files in malware/adware source: {fc_mal_adware}"),
    (include_banking_in_sample, f"Number of files in malware/banking source: {fc_mal_banking}"),
    (include_riskware_in_sample, f"Number of files in malware/riskware source: {fc_mal_riskware}"),
    (include_sms2_in_sample, f"Number of files in malware/sms2 source: {fc_mal_sms2}"),
):
    if _enabled:
        print(_message)

In [None]:
# Show where the sampled APKs will be written, so the paths can be checked
# before anything is copied.
print(
    "The following map of directories will be used:",
    "\t- Sample Output Directories:",
    f"\t\t- Benign APKs: {dir_benign_working_apks}",
    f"\t\t- Malware APKs: {dir_malicious_working_apks}",
    "\t\t  (Selected malware are combined beyond this step)",
    sep="\n",
)

In [None]:
# Reset the working tree so every run starts from an empty directory.
# WARNING: this permanently deletes everything under dir_apk_working_path_root,
# including samples, decompiled output and results from earlier runs.
if os.path.exists(dir_apk_working_path_root):
    shutil.rmtree(dir_apk_working_path_root)
Path(dir_apk_working_path_root).mkdir(parents=True, exist_ok=True)

# Recreate the folders that receive the sampled APKs.
Path(dir_benign_working_apks).mkdir(parents=True, exist_ok=True)
Path(dir_malicious_working_apks).mkdir(parents=True, exist_ok=True)

In [None]:
# Please select your sample size.
benign_sample_size = 25
malicious_sample_size = 50

# Optional: set to an integer to make the random APK selection repeatable
# between runs. None keeps the selection different on every run.
sample_seed = None

# benign_sample_size is the number of random benign APKs copied for analysis.
# malicious_sample_size is the total number of malware APKs copied; it is split
# across the enabled malware categories in proportion to how many files each holds.
# Example: adware has 400 files, banking 300, riskware 200, sms2 100 (1000 in total)
# and malicious_sample_size is 100:
#   Proportions: adware (40%), banking (30%), riskware (20%), sms2 (10%).
#   Initial allocation: adware (40), banking (30), riskware (20), sms2 (10).
#   If rounding causes a shortfall (e.g., 99 samples allocated), the remaining
#   sample is given to a category with available files.

# Fail fast: an invalid size must stop the run here instead of surfacing later
# as a confusing error while sampling.
for _label, _value in (
    ("benign_sample_size", benign_sample_size),
    ("malicious_sample_size", malicious_sample_size),
):
    if isinstance(_value, bool) or not isinstance(_value, int) or _value < 0:
        raise ValueError(f"Invalid input: {_label}={_value!r}. Please enter a non-negative integer.")

if sample_seed is not None:
    random.seed(sample_seed)

# Get weighted sample sizes for malicious files
weighted_samples = get_weighted_malware_samples(malicious_sample_size)
if not weighted_samples:
    print("No malicious samples to process.")

# Source directory for each malware category key used in weighted_samples
dir_mapping = {
    'adware': dir_malicious_source_apks_adware,
    'banking': dir_malicious_source_apks_banking,
    'riskware': dir_malicious_source_apks_riskware,
    'sms2': dir_malicious_source_apks_sms2
}

print("Malware weighted allocation plan:")
for category, sample_size in weighted_samples.items():
    print(f"\t{category}: {sample_size}")

In [None]:
# Sample benign files
copy_random_sample(dir_benign_source_apks, dir_benign_working_apks, benign_sample_size, "benign")

# Sample each malware category into the shared malicious working directory.
# The category is included in the label so each log line can be matched to
# the allocation plan.
for category, sample_size in weighted_samples.items():
    if sample_size > 0:
        copy_random_sample(dir_mapping[category], dir_malicious_working_apks, sample_size, f"malicious/{category}")
    else:
        print(f"Skipping {category} (no samples allocated or no files available)")

In [None]:
# Decompile every sampled APK into its own folder (named after the APK file
# stem) below the matching "decompiled" directory: malicious first, then benign.
malicious_apk_files = Path(dir_malicious_working_apks).glob("*.apk")
benign_apk_files = Path(dir_benign_working_apks).glob("*.apk")

# NOTE(review): the loop variable is deliberately still called `apk`; confirm
# that no helper reads it as a global before renaming it.
for _apk_files, _decompiled_root in (
    (malicious_apk_files, dir_malicious_working_decompiled),
    (benign_apk_files, dir_benign_working_decompiled),
):
    for apk in _apk_files:
        extract_apk(apk, f"{_decompiled_root}/{apk.stem}")


In [None]:
# Concatenate the decompiled Smali code into one text file per class:
# malicious first, then benign.
extract_smali(
    input_dir=dir_malicious_working_decompiled,
    output_file=f"{dir_apk_working_path_root}/malicioius_smali.txt",
)
extract_smali(
    input_dir=dir_benign_working_decompiled,
    output_file=f"{dir_apk_working_path_root}/benign_smali.txt",
)

In [None]:
# Build the n-gram counts from both Smali dumps and store them as an .npz archive.
generate_ngram_data(
    benign_smali_file=f"{dir_apk_working_path_root}/benign_smali.txt",
    malicious_smali_file=f"{dir_apk_working_path_root}/malicioius_smali.txt",
    output_npz=f'{dir_apk_working_path_root}/ngram_data.npz',
)

In [None]:
# Draw the benign vs. malicious n-gram frequency chart and save it next to the data.
plot_ngram_frequency(
    data_path=f'{dir_apk_working_path_root}/ngram_data.npz',
    output_path=f'{dir_apk_working_path_root}/ngram_frequency.png',
)