In [49]:
import pandas as pd

In [51]:
df = pd.read_csv("Android_Ransomeware.csv")

In [52]:
df = df.dropna()

In [37]:
fraction_to_keep = 0.27

# Randomly sample a fraction of the data
df = df.sample(frac=fraction_to_keep, random_state=1)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392034 entries, 0 to 392033
Data columns (total 74 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Total Length of Bwd Packets  392034 non-null  int64  
 1    Fwd Packet Length Max        392034 non-null  int64  
 2    Fwd Packet Length Min        392034 non-null  int64  
 3    Fwd Packet Length Mean       392034 non-null  float64
 4    Fwd Packet Length Std        392034 non-null  float64
 5   Bwd Packet Length Max         392034 non-null  int64  
 6    Bwd Packet Length Min        392034 non-null  int64  
 7    Bwd Packet Length Mean       392034 non-null  float64
 8    Bwd Packet Length Std        392034 non-null  float64
 9   Flow Bytes/s                  392034 non-null  float64
 10   Flow Packets/s               392034 non-null  float64
 11   Flow IAT Mean                392034 non-null  float64
 12   Flow IAT Std                 392034 non-nul

In [39]:
df = df.drop(df.columns[0], axis=1)


In [54]:
df.columns

Index([' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count',
       ' SYN Flag Count', ' RST Flag Count', ' PSH Flag Count',
       ' ACK Flag Count', ' URG Flag Count', ' CWE F

In [None]:
df['Label'].unique()

array(['Benign', 'Charger', 'Jisut', 'Koler', 'Lockerpin', 'Pletor',
       'PornDroid', 'RansomBO', 'Simplocker', 'SVpeng', 'WannaLocker'],
      dtype=object)

In [None]:
df.info()

In [11]:
print(df.select_dtypes(include=['object']).columns.tolist())

['Flow ID', ' Source IP', ' Destination IP', ' Timestamp', 'Label']


In [20]:
# Encode the target variable 'Label'
from sklearn.preprocessing import LabelEncoder

# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

label_encoder = LabelEncoder()
df['Label_Encoded'] = label_encoder.fit_transform(df['Label'])

# Drop irrelevant columns (if any)
# For example, if 'Flow ID', 'Source IP', 'Destination IP', and 'Timestamp' are irrelevant for the model
df.drop(['Flow ID', ' Source IP', ' Destination IP', ' Timestamp', 'Label'], axis=1, inplace=True)

# Calculate the correlation between each feature and the target variable
correlation_with_target = df.corrwith(df['Label_Encoded']).abs().sort_values(ascending=False)

# Print the correlation values
print(correlation_with_target)

Label_Encoded                   1.000000
 Fwd IAT Max                    0.062225
 Fwd IAT Std                    0.055474
 Flow IAT Max                   0.050909
 Flow IAT Std                   0.046542
Fwd IAT Total                   0.043260
 Flow IAT Mean                  0.040639
 Fwd IAT Mean                   0.040617
 Flow Duration                  0.039741
 Fwd Packet Length Max          0.035355
 Idle Min                       0.034722
 Max Packet Length              0.034276
Idle Mean                       0.032947
 Idle Max                       0.030517
 Packet Length Variance         0.029236
 Bwd Packets/s                  0.025356
 Flow IAT Min                   0.024085
 Fwd Packet Length Std          0.020716
Fwd PSH Flags                   0.020556
 SYN Flag Count                 0.020556
 Fwd IAT Min                    0.019376
 Flow Packets/s                 0.019177
FIN Flag Count                  0.018703
Flow Bytes/s                    0.017558
 Packet Length S

In [None]:
df['Bwd PSH Flags'].info()

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFE

# Assuming you have already loaded the DataFrame 'df'
# df = pd.read_csv("your_dataset.csv")

# Encode the target variable 'Label'
label_encoder = LabelEncoder()
df['Label_Encoded'] = label_encoder.fit_transform(df['Label'])

# Split the data into features (X) and target variable (y)
X = df.drop(['Label', 'Label_Encoded'], axis=1)
y = df['Label_Encoded']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest model
rf_model = RandomForestClassifier(random_state = 42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

# SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

# Decode the predicted labels back to original string labels
rf_preds_labels = label_encoder.inverse_transform(rf_preds)
svm_preds_labels = label_encoder.inverse_transform(svm_preds)

In [43]:
# Evaluate Random Forest model
rf_accuracy = accuracy_score(y_test, rf_preds)
rf_precision = precision_score(y_test, rf_preds, average='weighted')
rf_recall = recall_score(y_test, rf_preds, average='weighted')
rf_f1 = f1_score(y_test, rf_preds, average='weighted')

# Evaluate SVM model
svm_accuracy = accuracy_score(y_test, svm_preds)
svm_precision = precision_score(y_test, svm_preds, average='weighted')
svm_recall = recall_score(y_test, svm_preds, average='weighted')
svm_f1 = f1_score(y_test, svm_preds, average='weighted')

# Print the results
print("Random Forest Model:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-score:", rf_f1)
print("\n")

print("SVM Model:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1-score:", svm_f1)

Random Forest Model:
Accuracy: 0.3155880963627775
Precision: 0.3084370277190041
Recall: 0.3155880963627775
F1-score: 0.31102275833675963


SVM Model:
Accuracy: 0.158715162966462
Precision: 0.18023196391956334
Recall: 0.158715162966462
F1-score: 0.09620536594711253
