In [None]:
# Author: Xavier Tidus Hutchinson
# University: UNSW Australian Defence Force Academy, Canberra ACT, AU.
# Licence: My work is free for all to use. Other licences may apply.
#
# Special Thanks & Recognition: 
# This notebook leverages the fantastic work and research of UNB and requires their published
# CICMalDroid2020 Datasets to function.
#
# You can request access to your own copies by visiting their website here: https://www.unb.ca/cic/datasets/maldroid-2020.html
#
# Please credit the authors in your work and maintain references.
#
#
# Be careful, all sense of safety is an illusion.
# It's a wild world.

In [None]:
# The dataset contains quite a lot of files. The benign dataset is enormous, as are the sum total of malicious
# files. Thus, we may benefit from taking a random sampleset from each.
#
# NOTE: this cell is Python rather than %%bash on purpose. Every %%bash cell runs in its own
# short-lived shell, so plain shell assignments made in one cell are gone by the next one.
# Values placed in os.environ are inherited by every later %%bash cell under the same names
# (e.g. "$dir_benign_working_apks").
import os

dir_working_path_root = "./"

dir_source_apk_path_root = "/mnt/hgfs/vmshare/CICMalwareSamples/source-apks"
# Strip the trailing slash before joining so the result is "./source-apks", not ".//source-apks".
dir_source_apk_working_path_root = f"{dir_working_path_root.rstrip('/')}/source-apks"

dir_malicious_source_apks = f"{dir_source_apk_path_root}/malicious"
dir_benign_source_apks = f"{dir_source_apk_path_root}/benign"
# Working copies live under the working root, so emptying them never touches the source dataset tree.
dir_malicious_working_apks = f"{dir_source_apk_working_path_root}/malicious"
dir_benign_working_apks = f"{dir_source_apk_working_path_root}/benign"

sample_config = {
    "dir_working_path_root": dir_working_path_root,
    "dir_source_apk_path_root": dir_source_apk_path_root,
    "dir_source_apk_working_path_root": dir_source_apk_working_path_root,
    "dir_malicious_source_apks": dir_malicious_source_apks,
    "dir_malicious_working_apks": dir_malicious_working_apks,
    "dir_benign_source_apks": dir_benign_source_apks,
    "dir_benign_working_apks": dir_benign_working_apks,
    # Where the specific malware source apks are stored
    "dir_malicious_source_apks_adware": f"{dir_malicious_source_apks}/adware",
    "dir_malicious_source_apks_banking": f"{dir_malicious_source_apks}/banking",
    "dir_malicious_source_apks_riskware": f"{dir_malicious_source_apks}/riskware",
    "dir_malicious_source_apks_sms2": f"{dir_malicious_source_apks}/sms2",
    # Depending on your goals you may wish to include only some or all of the below.
    # 1 = True, 0 = False (environment values must be strings)
    "include_banking_in_sample": "1",
    "include_adware_in_sample": "1",
    "include_riskware_in_sample": "1",
    "include_sms2_in_sample": "1",
}

# Publish the settings to the kernel's environment for the %%bash cells that follow.
os.environ.update(sample_config)

In [None]:
%%bash
# Create the working directories if they don't already exist and, if they do exist, empty them.
#
# Safety: each path is expanded with ${var:?}, which aborts the cell when the variable is
# unset or empty. Without that guard an empty variable turns "rm -R $var/*" into "rm -R /*".
set -eu

for work_dir in \
    "${dir_source_apk_working_path_root:?is unset or empty; export it to the environment first}" \
    "${dir_benign_working_apks:?is unset or empty; export it to the environment first}" \
    "${dir_malicious_working_apks:?is unset or empty; export it to the environment first}"
do
    mkdir -p -- "$work_dir"
    # -f keeps an already-empty directory from being an error (the glob then matches nothing).
    # The * glob does not match dotfiles, so hidden entries are left in place.
    rm -rf -- "$work_dir"/*
done

In [None]:
%%bash
# Let's list the pre-extracted source samples available to us to leverage.
#
# Check every path up front: an empty variable would otherwise make find scan the
# current directory and report a misleading count.
: "${dir_malicious_source_apks_adware:?is unset or empty; export it to the environment first}"
: "${dir_malicious_source_apks_banking:?is unset or empty; export it to the environment first}"
: "${dir_malicious_source_apks_riskware:?is unset or empty; export it to the environment first}"
: "${dir_malicious_source_apks_sms2:?is unset or empty; export it to the environment first}"
: "${dir_benign_source_apks:?is unset or empty; export it to the environment first}"

# Print "<label>: <n>", where n is the number of regular files directly inside <dir>.
# Usage: count_samples <label> <dir>
count_samples() {
    echo "$1: $(find "$2" -maxdepth 1 -type f | wc -l)"
}

echo "MALICIOUS SAMPLES"
count_samples "Adware Samples" "$dir_malicious_source_apks_adware"
count_samples "Banking Samples" "$dir_malicious_source_apks_banking"
count_samples "Riskware Samples" "$dir_malicious_source_apks_riskware"
count_samples "SMS2 Samples" "$dir_malicious_source_apks_sms2"
echo "BENIGN SAMPLES"
count_samples "Benign Samples" "$dir_benign_source_apks"

In [None]:
# What is the target sampleset (expressed as units)? Default = 100
#
# Set from Python rather than %%bash: a shell assignment disappears when the %%bash cell's
# shell exits, whereas os.environ is inherited by every later %%bash cell.
import os

target_sample_set = 100
os.environ["target_sample_set"] = str(target_sample_set)
 

In [None]:
%%bash
# Decompile every malicious (Banking) APK with apktool, one output folder per APK
# under ~/apks/decompiled/malicious/<apk name>.

# With nullglob an empty input folder means zero iterations, instead of apktool being
# handed the literal string "*.apk".
shopt -s nullglob

out_root=~/apks/decompiled/malicious
mkdir -p "$out_root"

for apk in ~/apks/malicious/Banking/*.apk; do
    # -f replaces an existing output folder so the cell can be re-run.
    apktool d -f "$apk" -o "$out_root/$(basename "$apk" .apk)"
done

In [None]:
%%bash
# Decompile every benign APK with apktool, one output folder per APK
# under ~/apks/decompiled/benign/<apk name>.

# With nullglob an empty input folder means zero iterations, instead of apktool being
# handed the literal string "*.apk".
shopt -s nullglob

out_root=~/apks/decompiled/benign
mkdir -p "$out_root"

for apk in ~/apks/benign/Benign/*.apk; do
    # -f replaces an existing output folder so the cell can be re-run.
    apktool d -f "$apk" -o "$out_root/$(basename "$apk" .apk)"
done

In [None]:
%%bash
# Sanity check on the APKTool run: list the smali folder of every decompiled app,
# benign set first, then the malicious set.
for sample_set in benign malicious; do
    ls ~/apks/decompiled/"$sample_set"/*/smali
done

In [None]:
# General imports
import os
import glob

In [None]:
# define a function that we can use to extract our smali data
 
def extract_smali(indir, output_file):
    """Concatenate every .smali file found under ``indir`` into ``output_file``.

    Files are matched by the pattern ``<indir>/**/smali/**/*.smali`` (recursive), i.e.
    any ``.smali`` file located somewhere below a directory named ``smali``.

    Args:
        indir: Root directory to search. A leading ``~`` is expanded.
        output_file: Path of the text file to (over)write. A leading ``~`` is expanded.

    Each matched file's content is written followed by a newline. Files are processed in
    sorted path order so repeated runs produce the same output.
    """
    # open() and glob do not understand "~", so resolve it here.
    indir = os.path.expanduser(indir)
    output_file = os.path.expanduser(output_file)

    # Escape the root so glob metacharacters in the directory name are taken literally.
    pattern = os.path.join(glob.escape(indir), '**', 'smali', '**', '*.smali')

    with open(output_file, 'w', encoding='utf-8') as of:
        # begin our walk through the mine field
        for smali_file in sorted(glob.glob(pattern, recursive=True)):
            # errors='replace' keeps one undecodable byte from aborting the whole extraction.
            with open(smali_file, 'r', encoding='utf-8', errors='replace') as infile:
                of.write(infile.read() + '\n')

In [None]:
# Set our working paths
#
# open() and glob do not expand "~", so the home directory is resolved here once.
# The decompiled roots are the folders the apktool cells write into
# (~/apks/decompiled/<benign|malicious>/<apk name>), so no extra sub-folder is appended.
dir_benign = os.path.expanduser('~/apks/decompiled/benign')
dir_malicious = os.path.expanduser('~/apks/decompiled/malicious')
dir_benign_output = os.path.expanduser('~/apks/benign_smali.txt')
dir_malicious_output = os.path.expanduser('~/apks/malicioius_smali.txt')

In [None]:
# Run the smali extraction for each sample set: benign first, then malicious.
_extraction_jobs = (
    (dir_benign, dir_benign_output),
    (dir_malicious, dir_malicious_output),
)
for _source_dir, _target_file in _extraction_jobs:
    extract_smali(_source_dir, _target_file)

In [None]:
%%bash
# Confirm the extraction produced its text files, then preview the benign one.
apk_root=~/apks
ls "$apk_root"/*.txt
head -n 10 "$apk_root/benign_smali.txt"

In [None]:
# N-GRAM time
#
# Counts word n-grams in the benign and malicious smali dumps and saves the 50 most
# frequent ones (by combined count) to ngram_data.npz for plotting.
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# you can adjust the n weight below (my default: 3,3):
n_gram_weight_min = 3
n_gram_weight_max = 3

# Process the smali code: each dump is read whole and treated as a single document.
# expanduser() is applied because open() does not expand a leading "~".
with open(os.path.expanduser(dir_benign_output), 'r', encoding='utf-8', errors='replace') as f:
    benign_text = [f.read()]

with open(os.path.expanduser(dir_malicious_output), 'r', encoding='utf-8', errors='replace') as f:
    malicious_text = [f.read()]

# Initialise a CountVectorizer for the configured n-gram range; tokens are runs of non-whitespace.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(n_gram_weight_min, n_gram_weight_max), token_pattern=r'\S+')

# Combine texts so both documents share one vocabulary: row 0 = benign, row 1 = malicious.
combined_text = benign_text + malicious_text
X = vectorizer.fit_transform(combined_text)

# Get per-document counts as flat arrays aligned with the vocabulary.
benign_counts = X[0].toarray().flatten()
malicious_counts = X[1].toarray().flatten()

# Take the top 50 n-grams by total frequency.
total_counts = benign_counts + malicious_counts
top_indicies = np.argsort(total_counts)[::-1][:50]
feature_names = np.asarray(vectorizer.get_feature_names_out())
# Convert only the selected names to a fixed-width string array: an object array would
# be pickled by np.savez and then refuse to load with np.load's default allow_pickle=False.
top_ngrams = feature_names[top_indicies].astype(str)
top_benign_counts = benign_counts[top_indicies]
top_malicious_counts = malicious_counts[top_indicies]

# Save our data for the plotting work.
np.savez('ngram_data.npz', ngrams=top_ngrams, benign=top_benign_counts, malicious=top_malicious_counts)
 
# Now we will plot our work: a grouped bar chart of the saved n-gram counts.
import numpy as np
import matplotlib.pyplot as plt

# Load n-gram data written by the counting step.
data = np.load('ngram_data.npz')
ngrams = data['ngrams']
benign_counts = data['benign']
malicious_counts = data['malicious']

# Plot settings: one slot per n-gram, two bars of this width side by side in each slot.
x = np.arange(len(ngrams))
width = 0.35

# Create figure (explicit fig/ax so every call below targets this figure).
fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(x - width/2, benign_counts, width, label='Benign', color='#1f77b4')
ax.bar(x + width/2, malicious_counts, width, label='Malicious', color='#ff7f0e')
ax.set_xlabel('N-Grams')
ax.set_ylabel('Frequency')
ax.set_title('Top 50 3-Gram freq-dist. in Benign v. Malicious APKs')
ax.set_xticks(x)
ax.set_xticklabels(ngrams, rotation=90)
ax.legend()

# Add description text at the bottom.
# Adjacent string literals are joined with nothing in between, so each piece carries its
# own separating space (the first piece previously ran straight into "Generated").
description = (
    "[Xavier Hutchinson (z5626926) ZEIT8025 Research Project 2025 S1] "
    "Generated using Matplotlib 3.5.1 in Python on Kali Linux. Decompiled .apk files were "
    "processed to extract 3-grams using CountVectorizer. Frequencies of top 50 3-grams in "
    "benign vs. malicious apps were plotted as a bar chart, highlighting distinct patterns "
    "(e.g., higher network call n-grams in malicious apps)."
)
fig.text(0.5, 0.01, description, wrap=True, horizontalalignment='center',
         fontsize=10, bbox=dict(facecolor='white', alpha=0.8))

# Adjust layout to prevent overlap
fig.tight_layout(rect=[0, 0.1, 1, 1])  # Leave space at bottom for text

# Save and show plot
fig.savefig('./ngram_frequency.png', dpi=300, bbox_inches='tight')
plt.show()