# Without oneAPI Tools
- to have a look at the comparison between regular OneAPI and regular libraries 

#### Importing all the required libraries

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

#### Importing the Dataset
### Source : 

In [2]:
url = '../Dataset/data_url.csv'
url_csv = pd.read_csv(url,delimiter=',')

#converting the data from csv to dataframe for easy handling
url_df = pd.DataFrame(url_csv)

#to convert into array 
url_df = np.array(url_df)  
random.shuffle(url_df)

#### Seperating the data according to it's characteristics

In [3]:
y = [d[1] for d in url_df]                 
urls = [d[0] for d in url_df]

#### Since the urls are different from our normal text documents, we need to use a sanitization method to get the relevant data from raw urls.

In [4]:
def sanitization(web):
    web = web.lower()
    token = []
    dot_token_slash = []
    raw_slash = str(web).split('/')
    for i in raw_slash:
        # removing slash to get token
        raw1 = str(i).split('-')
        slash_token = []
        for j in range(0,len(raw1)):
            # removing dot to get the tokens
            raw2 = str(raw1[j]).split('.')
            slash_token = slash_token + raw2
        dot_token_slash = dot_token_slash + raw1 + slash_token
    # to remove same words
    token = list(set(dot_token_slash))  
    if 'com' in token:
        #remove com
        token.remove('com')
    return token

#### We will have to pass the data to our custom vectorizer function using Tf-idf approach 

In [5]:
# term-frequency and inverse-document-frequency
vectorizer = TfidfVectorizer(tokenizer=sanitization)

#### Splitting the test set and train set

In [6]:
x = vectorizer.fit_transform(urls)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)



#### Training

In [7]:
lgr = LogisticRegression(solver='lbfgs', max_iter=1000)                  # Logistic regression
lgr.fit(x_train, y_train)
score = lgr.score(x_test, y_test)
print("score: {0:.2f} %".format(100 * score))
vectorizer_save = vectorizer

score: 98.34 %


#### Saving the modle and vectors

In [8]:
file = "pickel_model.pkl"
with open(file, 'wb') as f:
    pickle.dump(lgr, f)
f.close()

file2 = "pickel_vector.pkl"
with open(file2,'wb') as f2:
    pickle.dump(vectorizer_save, f2)
f2.close()

# With OneAPI tools

In [9]:
import modin.pandas as md

In [10]:
## without modin 

url = '../Dataset/data_url.csv'

%timeit url_csv = pd.read_csv(url,delimiter=',')
%timeit url_df = pd.DataFrame(url_csv)

#to convert into array 
url_df = np.array(url_df)  
random.shuffle(url_df)

347 ms ± 18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
3.45 µs ± 16.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [16]:
## with modin 

## just to ignore the warning

url = '../Dataset/data_url.csv'

%timeit url_csv = md.read_csv(url,delimiter=',') #270 ms ± 13.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


243 ms ± 8.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%timeit url_df = md.DataFrame(url_csv) 

#39.7 ms ± 1.87 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
#This result was absorbed when `url_csv = md.read_csv(url,delimeter=',')`

# But for what we actually got is 
# 189 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# So this is a result of using modin with pandas for creating a dataFrame 
# Which is a notable conclusion for our specific case, cause in real-life developers,may shuffle around modules,in that case this is notable observation



189 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


##### In the above case , it's quite contradicting we can clearly see that modin won in the `read_csv` part by almost saving 450 - 500 ms but when it is coming to `DataFrame` it *modin* failed misearbly

In [19]:
from sklearnex.linear_model import LogisticRegression as lgr_i
from sklearnex.model_selection import train_test_split as tts_i

In [None]:
# only using scikit learn

%timeit x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

186 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### here performances end up at tie 

In [None]:
# using train_test_split function from sklearnex

%timeit x_train, x_test, y_train, y_test = tts_i(x, y, test_size=0.2, random_state=42)

187 ms ± 9.76 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
## using scikit learn

lgr = LogisticRegression(solver='lbfgs', max_iter=1000)                  # Logistic regression
lgr.fit(x_train, y_train)
score = lgr.score(x_test, y_test)
print("score: {0:.2f} %".format(100 * score))
vectorizer_save = vectorizer


%timeit lgr.fit(x_train, y_train) 

#(reasons unknown)
#3.66 s ± 502 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


score: 98.34 %
4.04 s ± 473 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
## using sklearnex

lgr_1 = lgr_i(solver='lbfgs', max_iter=1000)                  # Logistic regression
lgr_1.fit(x_train, y_train)
score = lgr_1.score(x_test, y_test)
vectorizer_save = vectorizer

%timeit lgr_1.fit(x_train, y_train) #818 ms ± 147 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


818 ms ± 147 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### In the above case of `LogisticRegression` the method from sklearnex module outperformed scikit-learn by almost 3 times

In [22]:
file = "pickle_files/pickel_model.pkl"
with open(file, 'wb') as f:
    pickle.dump(lgr, f)
f.close()

file2 = "pickle_files/pickel_vector.pkl"
with open(file2,'wb') as f2:
    pickle.dump(vectorizer_save, f2)
f2.close()

## Perfomance metrix with OneAPI tools