<a href="https://colab.research.google.com/github/swati013/FeedTheSnake/blob/main/jupyter_notebooks/1.%20Dataset%20Pre-processing%20Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Preprocessing

### This notebook contains the work described in Section III(A) of the paper

In [None]:
# only need this line in jupyter
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# The datasets have 215 features, far more than pandas shows by default,
# so widen the display limits for rows, columns and total line width.
display_overrides = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in display_overrides.items():
    pd.set_option(option_name, option_value)

# Drebin dataset analysis  


In [None]:
# Load the raw Drebin-215 dataset.
# low_memory=False makes pandas infer each column's dtype from the whole file in
# one pass. With the default chunked parsing, a column that contains a
# non-numeric token in only some chunks ends up holding a mix of ints and
# strings, and the mix depends on where the chunk boundaries fall.
drebin_df = pd.read_csv(
    'https://raw.githubusercontent.com/ferdouszislam/Android-Malware-Detection-ML/main/datasets/Drebin-215/drebin-215-dataset-5560malware-9476-benign.csv',
    low_memory=False,
)

drebin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15036 entries, 0 to 15035
Columns: 216 entries, transact to class
dtypes: int64(214), object(2)
memory usage: 24.8+ MB


  drebin_df = pd.read_csv('https://raw.githubusercontent.com/ferdouszislam/Android-Malware-Detection-ML/main/datasets/Drebin-215/drebin-215-dataset-5560malware-9476-benign.csv')


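### Column-92 had a mixed dtype issue. 'READ_EXTERNAL_STORAGE' is set to numeric here; note that `info()` reports the same dtypes before and after this step, and the column that actually holds mixed values is 'TelephonyManager.getSimCountryIso', which is fixed further below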

In [None]:
# convert column values to numeric
# NOTE(review): info() reports the same dtype counts (int64(214), object(2))
# before and after this cell, so this conversion appears to be a no-op here
drebin_df['READ_EXTERNAL_STORAGE'] = pd.to_numeric(drebin_df['READ_EXTERNAL_STORAGE'])

In [None]:
drebin_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15036 entries, 0 to 15035
Columns: 216 entries, transact to class
dtypes: int64(214), object(2)
memory usage: 24.8+ MB


In [None]:
def check_missing_data(df):
    """Report every column of `df` that holds values other than 0 or 1.

    For each column, counts the rows whose value equals neither the number 0
    nor the number 1 and prints one line per offending column. String values
    such as '0' or '1' do not compare equal to the numbers and are therefore
    counted as offending rows. Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    """
    total_instances = df.shape[0]

    for column in df.columns:
        values = df[column]
        # Count directly on the boolean mask. Filtering the whole frame only to
        # read its row count would copy every column once per inspected column.
        one_zero_count = int(((values == 0) | (values == 1)).sum())
        if one_zero_count != total_instances:
            print(column, 'has', (total_instances - one_zero_count), 'rows with non 0-1 values')

In [None]:
check_missing_data(drebin_df)

TelephonyManager.getSimCountryIso has 8192 rows with non 0-1 values
class has 15036 rows with non 0-1 values


In [None]:
drebin_df['TelephonyManager.getSimCountryIso'].unique()

array(['0', '1', '?', 1, 0], dtype=object)

In [None]:
drebin_df[(drebin_df['TelephonyManager.getSimCountryIso']=='?')].shape[0]

5

### Column- TelephonyManager.getSimCountryIso has values- ['0', '1', '?', 1, 0]. Need to remove instances with '?' (only 5 instances) and convert '0' to 0 and '1' to 1

In [None]:
# drop rows with 'TelephonyManager.getSimCountryIso' = '?'
# .copy() turns the filtered result into an independent frame, so assigning to
# one of its columns afterwards does not trigger SettingWithCopyWarning
drebin_df = drebin_df[drebin_df['TelephonyManager.getSimCountryIso'] != '?'].copy()

In [None]:
drebin_df['TelephonyManager.getSimCountryIso'].unique()

array(['0', '1', 1, 0], dtype=object)

In [None]:
# convert column values to numeric: with the '?' rows gone the column holds only
# '0', '1', 1 and 0, so the string and integer entries collapse into one numeric column
drebin_df['TelephonyManager.getSimCountryIso'] = pd.to_numeric(drebin_df['TelephonyManager.getSimCountryIso'])

In [None]:
check_missing_data(drebin_df)

class has 15031 rows with non 0-1 values


In [None]:
drebin_df.shape

(15031, 216)

In [None]:
drebin_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15031 entries, 0 to 15035
Columns: 216 entries, transact to class
dtypes: int64(215), object(1)
memory usage: 24.9+ MB


In [None]:
# class distribution of the cleaned Drebin data ('B' = benign, 'S' = malware)
class_freq = drebin_df['class'].value_counts()
class_freq

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
B,9476
S,5555


  
  # Malgenome Dataset Analysis

In [None]:
# Load the raw Malgenome-215 dataset from the upstream GitHub repository.
malgenome_csv_url = 'https://raw.githubusercontent.com/ferdouszislam/Android-Malware-Detection-ML/main/datasets/malgenome-215/malgenome-215-dataset-1260malware-2539-benign.csv'
malgenome_df = pd.read_csv(malgenome_csv_url)

In [None]:
malgenome_df.shape

(3799, 216)

In [None]:
malgenome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Columns: 216 entries, transact to class
dtypes: int64(215), object(1)
memory usage: 6.3+ MB


In [None]:
check_missing_data(malgenome_df)

class has 3799 rows with non 0-1 values


### No problem with the Malgenome dataset

In [None]:
# class distribution of the Malgenome data ('B' = benign, 'S' = malware)
# note: this rebinds class_freq, which previously held the Drebin counts
class_freq = malgenome_df['class'].value_counts()
class_freq

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
B,2539
S,1260


  
  # Merge the two datasets

In [None]:
# Gather each dataset's column names into a set so the two can be compared.
drebin_columns_set, malgenome_columns_set = {*drebin_df.columns}, {*malgenome_df.columns}

In [None]:
# every column name that appears in at least one of the two datasets
all_columns = drebin_columns_set | malgenome_columns_set
len(all_columns)

223

In [None]:
# column names present in both datasets
common_columns = drebin_columns_set & malgenome_columns_set
len(common_columns)

209

In [None]:
# column names that belong to exactly one of the two datasets
not_common_columns = drebin_columns_set ^ malgenome_columns_set
len(not_common_columns)

14

### The two datasets have 209 features in common ('common_columns'). We need to drop the extra columns ('not_common_columns') from the datasets.  


In [None]:
not_common_columns

{'.system.app',
 '.system.bin',
 '/system/app',
 '/system/bin',
 'BIND_TEXT_SERVICE',
 'BROADCAST_PACKAGE_REMOVED',
 'CONTROL_LOCATION_UPDATES',
 'DELETE_CACHE_FILES',
 'HARDWARE_TEST',
 'INJECT_EVENTS',
 'READ_INPUT_STATE',
 'Runtime.loadLibrary',
 'android.intent.action.CAMERA_BUTTON',
 'android.intent.action.REBOOT'}

In [None]:
# Split the non-shared column names by the dataset that actually contains them;
# each list is what has to be removed from that dataset before merging.
drebin_drop_columns = [name for name in not_common_columns if name in drebin_df.columns]
malgenome_drop_columns = [name for name in not_common_columns if name in malgenome_df.columns]

In [None]:
drebin_drop_columns

['DELETE_CACHE_FILES',
 'CONTROL_LOCATION_UPDATES',
 'HARDWARE_TEST',
 'Runtime.loadLibrary',
 '/system/app',
 '/system/bin',
 'BIND_TEXT_SERVICE']

In [None]:
# drop 'drebin_drop_columns' from 'drebin_df'
# one drop call removes all listed columns at once; dropping inside a loop
# rebuilds the whole frame once per column
drebin_df = drebin_df.drop(columns=drebin_drop_columns)

In [None]:
malgenome_drop_columns

['.system.bin',
 'android.intent.action.REBOOT',
 'READ_INPUT_STATE',
 'android.intent.action.CAMERA_BUTTON',
 '.system.app',
 'BROADCAST_PACKAGE_REMOVED',
 'INJECT_EVENTS']

In [None]:
# drop 'malgenome_drop_columns' from 'malgenome_df'
# one drop call removes all listed columns at once; dropping inside a loop
# rebuilds the whole frame once per column
malgenome_df = malgenome_df.drop(columns=malgenome_drop_columns)

In [None]:
# Rebuild the column-name sets now that the extra columns have been dropped.
drebin_columns_set, malgenome_columns_set = {*drebin_df.columns}, {*malgenome_df.columns}

In [None]:
# the shared-column count should now equal each dataset's own column count
common_columns = drebin_columns_set & malgenome_columns_set
print(len(common_columns), len(drebin_df.columns), len(malgenome_df.columns))

209 209 209


In [None]:
# no column should be exclusive to one dataset any more
not_common_columns = drebin_columns_set ^ malgenome_columns_set
len(not_common_columns)

0

In [None]:
# merge the two dataframes by stacking the Drebin rows on top of the Malgenome rows;
# ignore_index=True discards both original indexes and renumbers the rows from 0
merged_df = pd.concat([drebin_df, malgenome_df], ignore_index=True)

In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18830 entries, 0 to 18829
Columns: 209 entries, transact to class
dtypes: int64(208), object(1)
memory usage: 30.0+ MB


In [None]:
check_missing_data(merged_df)

class has 18830 rows with non 0-1 values


  
  # Separate holdout dataset

In [None]:
from sklearn.model_selection import train_test_split

# features: every column except the label
X_all = merged_df.drop(columns='class')
# labels: the 'class' column ('B' / 'S' strings at this point)
y_all = merged_df['class']

# 80/20 split, stratified on the label; random_state is fixed so that the same
# rows land in the holdout set on every run
X, X_holdout, y, y_holdout = train_test_split(X_all, y_all, test_size=0.2, random_state=42, stratify=y_all)

# re-attach the labels to the features of each part (experimenting and holdout)
# and renumber the rows of both frames from 0
experimenting_df = pd.concat([X, y], axis=1).reset_index(drop=True)
holdout_df = pd.concat([X_holdout, y_holdout], axis=1).reset_index(drop=True)

In [None]:
experimenting_df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
B,9612
S,5452


In [None]:
holdout_df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
B,2403
S,1363


In [None]:
experimenting_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15064 entries, 0 to 15063
Columns: 209 entries, transact to class
dtypes: int64(208), object(1)
memory usage: 24.0+ MB


In [None]:
holdout_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3766 entries, 0 to 3765
Columns: 209 entries, transact to class
dtypes: int64(208), object(1)
memory usage: 6.0+ MB


  
  # Save the pre-processed datasets  

  
  ### Convert 'class' column values. 'B(benign)' -> 0, 'S(malware)' -> 1

In [None]:
# Encode the 'class' label of every frame: 'B' (benign) -> 0, 'S' (malware) -> 1.
class_encoding = {'B': 0, 'S': 1}

for labelled_df in (drebin_df, malgenome_df, merged_df, experimenting_df, holdout_df):
    # Skip frames that are already encoded: mapping 0/1 through the dict again
    # would turn every label into NaN if this cell is re-run.
    if pd.api.types.is_numeric_dtype(labelled_df['class']):
        continue
    # Labels missing from class_encoding map to NaN; the int64 cast then raises
    # instead of letting NaN labels slip into the saved datasets.
    labelled_df['class'] = labelled_df['class'].map(class_encoding).astype('int64')

In [None]:
experimenting_df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
0,9612
1,5452


In [None]:
holdout_df['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
0,2403
1,1363


In [None]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18830 entries, 0 to 18829
Columns: 209 entries, transact to class
dtypes: int64(209)
memory usage: 30.0 MB


In [None]:
from pathlib import Path

# All outputs live under this folder, relative to the notebook's working directory.
output_root = Path('../datasets/Pre-processed_Dataset')

csv_outputs = [
    # pre-processed Drebin dataset
    (output_root / 'separate_datasets' / 'Drebin.csv', drebin_df),
    # pre-processed Malgenome dataset
    (output_root / 'separate_datasets' / 'Malgenome.csv', malgenome_df),
    # pre-processed & combined dataset
    (output_root / 'separate_datasets' / 'Drebin_Malgenome_Combined.csv', merged_df),
    # Combined, train/experimenting dataset
    (output_root / 'main_dataset' / 'Drebin_Malgenome_Combined-experiment_80.csv', experimenting_df),
    # Combined, test/holdout dataset
    (output_root / 'holdout_dataset' / 'Drebin_Malgenome_Combined-holdout_20.csv', holdout_df),
]

for csv_path, frame in csv_outputs:
    # to_csv does not create missing folders and fails with OSError on a
    # non-existent directory, so make sure the target folder exists first
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    frame.to_csv(csv_path, index=False)

OSError: Cannot save file into a non-existent directory: '../datasets/Pre-processed_Dataset/separate_datasets'