In [18]:
phishing_data = '../data/phish/'
legitimate_data = '../data/legit/'

In [19]:
import os, re, string
import numpy as np

In [20]:
def clean_text(text):
    text = text.decode('utf-8')
    while '\n' in text:
        text = text.replace('\n', ' ')
    while '  ' in text:
        text = text.replace('  ', ' ')
    words = text.split()
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    stripped = []
    for token in words: 
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            stripped.append(new_token.lower())
    text = ' '.join(stripped)
    return text

In [21]:
def get_data(path):
    text_list = list()
    files = os.listdir(path)
    for text_file in files:
        file_path = os.path.join(path, text_file)
        read_file = open(file_path,'r+')
        read_text = read_file.read()
        read_file.close()
        cleaned_text = clean_text(read_text)
        text_list.append(cleaned_text)
    return text_list, files

In [22]:
no_head_train_0, temp = get_data(phishing_data)
no_head_train_1, temp = get_data(legitimate_data)

In [23]:
no_head_train = no_head_train_0 + no_head_train_1
no_head_labels_train = ([0] * len(no_head_train_0)) + ([1] * len(no_head_train_1))

## get data statistics

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
tf_vectorizer = CountVectorizer()
X = tf_vectorizer.fit_transform(no_head_train)

In [26]:
print ('#total words', np.matrix.sum(X.todense()))
print ('#unique words',len(set(tf_vectorizer.get_feature_names())))

('#total words', 166433)
('#unique words', 23095)


In [27]:
def vocabularymat(TEXTFILES,VOC,PLAY,METHOD):
    
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    if (METHOD == "TFIDF"):
        voc = TfidfVectorizer()
        voc.fit(VOC)
    
        if (PLAY == "TRAIN"):
            TrainMat = voc.transform(TEXTFILES)
            return TrainMat

        if (PLAY =="TEST"):
            TestMat = voc.transform(TEXTFILES)
            return TestMat
    
    if (METHOD == "TDM"):
        voc = CountVectorizer()
        voc.fit(VOC)
    
        if (PLAY == "TRAIN"):
            TrainMat = voc.transform(TEXTFILES)
            return TrainMat

        if (PLAY =="TEST"):
            TestMat = voc.transform(TEXTFILES)
            return TestMat

In [28]:
TrainMat = vocabularymat(no_head_train,no_head_train,PLAY= "TRAIN",METHOD="TDM")

In [29]:
data = TrainMat.todense()
datalabel = no_head_labels_train

Traindata = data

In [30]:
def Featurelearning(Data, Method):
    from sklearn.decomposition import TruncatedSVD, NMF
    if (Method == "SVD"):
        model = TruncatedSVD(n_components=30, n_iter=7, random_state=42)
        Matrix = model.fit_transform(Data)
    if (Method == "NMF"):
        model = NMF(n_components=30, init='random', random_state=0)
        Matrix = model.fit_transform(Data)
    return Matrix

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X_train = Featurelearning(Traindata, Method="NMF")
y_train = datalabel

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [34]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (precision_score, recall_score,f1_score, accuracy_score,mean_squared_error,mean_absolute_error)

model = LogisticRegression()
model.fit(X_train, y_train)

# make predictions
expected = y_test
predicted = model.predict(X_test)

accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted, average="binary")
precision = precision_score(expected, predicted , average="binary")
f1 = f1_score(expected, predicted , average="binary")

print(model)
print("Accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("recall")
print("%.3f" %recall)
print("f-score")
print("%.3f" %f1)

cm = metrics.confusion_matrix(expected, predicted)
print(cm)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy
0.838
precision
0.816
recall
0.882
f-score
0.848
[[167  44]
 [ 26 195]]
