In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import plot_tree

import matplotlib.pyplot as plt
import re

In [3]:
df = pd.read_csv('new_data_urls.csv')
df.head()

Unnamed: 0,url,status
0,0000111servicehelpdesk.godaddysites.com,0
1,000011accesswebform.godaddysites.com,0
2,00003.online,0
3,0009servicedeskowa.godaddysites.com,0
4,000n38p.wcomhost.com,0


In [4]:
print(len(df[df['status']==0]), len(df[df['status']==1]))
df_maj, df_min = df[df['status']==1], df[df['status']==0]
df_maj_sampled = df_maj.sample(len(df_min), random_state=42)
df_balanced = pd.concat([df_maj_sampled, df_min])
print(len(df_balanced[df_balanced['status']==0]), len(df_balanced[df_balanced['status']==1]))
df_balanced.reset_index(inplace=True, drop=True)
df_balanced

394982 427028
394982 394982


Unnamed: 0,url,status
0,barrybartlett.net/,1
1,archive.org/stream/cihm_03838/cihm_03838_djvu.txt,1
2,strategyfirst.net/,1
3,www.macminute.com/headlines.xml,1
4,vrealities.com/sensio.html,1
...,...,...
789959,zzufg.com,0
789960,zzu.li,0
789961,zzz.co.uk,0
789962,zzzoolight.co.za,0


In [5]:
def tok(string) -> str:
    return string.replace('/', '.').split('.')

def to_txt(text) -> str:
    return text.replace('.', ' ').replace('/', ' ')

In [6]:
def top_terms(df, n) -> list[str]:
    term = {}
    for url, status in df.values:
        for word in tok(url):
            if word != '':
                if word not in term.keys():
                    term[word] = 0

                term[word]+=1
            
    return [t[0] for t in sorted(term.items(), key=lambda x:x[1], reverse=True)[:n]]

In [7]:
def num_digits(text) -> int:
    return len(re.findall('\d', text))

def num_dots(text) -> int:
    return len(re.findall('\.', text))

def num_bar(text) -> int:
    return len(re.findall('/', text))

  return len(re.findall('\d', text))
  return len(re.findall('\.', text))


In [8]:
VOC = top_terms(df_balanced, n=10)
VOC

['com', 'https:', 'www', 'html', 'http:', 'org', 'net', 'cn', 'php', 'index']

In [9]:
CORPUS = [to_txt(url) for url in df_balanced.url]
CORPUS[101]

'answers com topic leveille'

In [10]:
vectorizer = CountVectorizer(binary=True, vocabulary=VOC)
docTermMatrix = vectorizer.fit_transform(CORPUS)

matrix = pd.DataFrame(docTermMatrix.toarray(), columns=VOC)
matrix['dots'] = [num_dots(text) for text in df_balanced.url]
matrix['bar'] = [num_bar(text) for text in df_balanced.url]
matrix['len'] = [len(text) for text in CORPUS]
matrix['digits'] = [num_digits(text) for text in CORPUS]
matrix

Unnamed: 0,com,https:,www,html,http:,org,net,cn,php,index,dots,bar,len,digits
0,0,0,0,0,0,0,1,0,0,0,1,1,18,0
1,0,0,0,0,0,1,0,0,0,0,2,3,49,10
2,0,0,0,0,0,0,1,0,0,0,1,1,18,0
3,1,0,1,0,0,0,0,0,0,0,3,1,31,0
4,1,0,0,1,0,0,0,0,0,0,2,1,26,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
789959,1,0,0,0,0,0,0,0,0,0,1,0,9,0
789960,0,0,0,0,0,0,0,0,0,0,1,0,6,0
789961,0,0,0,0,0,0,0,0,0,0,2,0,9,0
789962,0,0,0,0,0,0,0,0,0,0,2,0,16,0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(matrix.values, df_balanced['status'].values, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [12]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8821846537504826

In [13]:
confusion = confusion_matrix(y_test, y_pred)
confusion

array([[69219,  9785],
       [ 8829, 70160]], dtype=int64)

In [14]:
import pickle
with open('links_classify.pkl', 'wb') as f:
    pickle.dump(clf, f)