#### Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


#### Importing the Dataset
### Source : 

In [2]:
# Read the CSV file
url = 'data_url.csv'
url_csv = pd.read_csv(url, sep=',', on_bad_lines='skip')

# Convert the data from CSV to DataFrame
url_df = pd.DataFrame(url_csv)

# Convert the DataFrame to a NumPy array
url_df = np.array(url_df)

# Shuffle the array
random.shuffle(url_df)

#### Seperating the data according to it's characteristics

In [3]:
y = [d[1] for d in url_df]                 
urls = [d[0] for d in url_df]

#### Since the urls are different from our normal text documents, we need to use a sanitization method to get the relevant data from raw urls.

In [4]:
def sanitization(web):
    web = web.lower()
    token = []
    dot_token_slash = []
    raw_slash = str(web).split('/')
    for i in raw_slash:
        # removing slash to get token
        raw1 = str(i).split('-')
        slash_token = []
        for j in range(0,len(raw1)):
            # removing dot to get the tokens
            raw2 = str(raw1[j]).split('.')
            slash_token = slash_token + raw2
        dot_token_slash = dot_token_slash + raw1 + slash_token
    # to remove same words
    token = list(set(dot_token_slash))  
    if 'com' in token:
        #remove com
        token.remove('com')
    return token

#### We will have to pass the data to our custom vectorizer function using Tf-idf approach 

In [5]:
# term-frequency and inverse-document-frequency
vectorizer = TfidfVectorizer(tokenizer=sanitization)

#### Splitting the test set and train set

In [6]:
x = vectorizer.fit_transform(urls)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)




#### Training

In [7]:
lgr = LogisticRegression(solver='lbfgs', max_iter=1000)                  # Logistic regression
lgr.fit(x_train, y_train)
score = lgr.score(x_test, y_test)
print("score: {0:.2f} %".format(100 * score))
vectorizer_save = vectorizer

score: 98.40 %


#### Saving the modle and vectors

In [8]:
file = "pickel_model.pkl"
with open(file, 'wb') as f:
    pickle.dump(lgr, f)
f.close()

file2 = "pickel_vector.pkl"
with open(file2,'wb') as f2:
    pickle.dump(vectorizer_save, f2)
f2.close()

In [None]:
import os
import time
import pyfiglet
import subprocess

def run_PE():
    file = input("Enter the path and name of the file : ")
    os.system("python3 Extract/PE_main.py {}".format(file))

def run_URL():
    try:
        subprocess.run(['python', 'url_main.py'], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running url_main.py: {e}")
    else:
        print("URL scanner ran successfully")

def exit_program():
    os.system('exit')

def start():
    print(pyfiglet.figlet_format("Malware Detector"))
    print(" Welcome to antimalware detector \n")
    print(" 1. PE scanner")
    print(" 2. URL scanner")
    print(" 3. Exit\n")

    select = int(input("Enter your choice : "))

    if select in [1, 2, 3]:
        if select == 1:
            run_PE()
            choice = input("Do you want to search again? (y/n): ")
            if choice not in ['Y', 'N', 'n', 'y']:
                print("Bad input\nExiting...")
                time.sleep(3)
                exit_program()
            else:
                if choice == 'Y' or choice == 'y':
                    start()
                elif choice == 'N' or choice == 'n':
                    exit_program()
        
        elif select == 2:
            run_URL()
            choice = input("Do you want to search again? (y/n): ")
            if choice not in ['Y', 'N', 'n', 'y']:
                print("Bad input\nExiting...")
                time.sleep(3)
                exit_program()
            else:
                if choice == 'Y' or choice == 'y':
                    start()
                else:
                    exit_program()

        else:
            exit_program()
    else:
        print("Bad input\nExiting...")
        time.sleep(3)
        exit_program()

start()


 __  __       _                          ____       _            _             
|  \/  | __ _| |_      ____ _ _ __ ___  |  _ \  ___| |_ ___  ___| |_ ___  _ __ 
| |\/| |/ _` | \ \ /\ / / _` | '__/ _ \ | | | |/ _ \ __/ _ \/ __| __/ _ \| '__|
| |  | | (_| | |\ V  V / (_| | | |  __/ | |_| |  __/ ||  __/ (__| || (_) | |   
|_|  |_|\__,_|_| \_/\_/ \__,_|_|  \___| |____/ \___|\__\___|\___|\__\___/|_|   
                                                                               

 Welcome to antimalware detector 

 1. PE scanner
 2. URL scanner
 3. Exit



Enter your choice :  2
