# Requires the fasttext Python package pinned to version 0.8.3: pip install fasttext==0.8.3

In [None]:
# Input directories handed to get_data(); every entry inside is read as one
# document. Phishing documents are later labelled 0 and legitimate ones 1.
phishing_data = '../data/phish/'
legitimate_data = '../data/legit/'

In [2]:
import os, re, string
import numpy as np

In [3]:
# Matches any single ASCII punctuation character; compiled once for all calls.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def clean_text(text):
    """Normalise raw text into lower-case, punctuation-free tokens.

    Args:
        text: Raw content, either UTF-8 encoded bytes or already-decoded text.

    Returns:
        The cleaned tokens joined by single spaces. The input is split on any
        whitespace, every ``string.punctuation`` character is removed from
        each token, tokens left empty are dropped and the rest are lower-cased.

    Raises:
        UnicodeDecodeError: If ``text`` is bytes that are not valid UTF-8.
    """
    # Only byte strings need decoding. Calling .decode() on text fails on
    # Python 3 and forces an implicit ASCII encode on Python 2.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # split() without arguments already treats newlines and runs of spaces as
    # separators, so no whitespace pre-normalisation is required.
    stripped = (_PUNCTUATION_RE.sub(u'', token) for token in text.split())
    return ' '.join(token.lower() for token in stripped if token)

In [4]:
def get_data(path):
    """Read and clean every entry found directly inside ``path``.

    Args:
        path: Directory whose entries are all readable files.

    Returns:
        A ``(text_list, files)`` tuple. ``files`` is the ``os.listdir(path)``
        result and ``text_list[i]`` is ``clean_text`` applied to the content
        of ``files[i]``.

    Raises:
        IOError/OSError: If ``path`` cannot be listed or an entry cannot be
            opened for reading (for example, a sub-directory).
    """
    files = os.listdir(path)
    text_list = []
    for text_file in files:
        file_path = os.path.join(path, text_file)
        # The file is only read, so open it read-only; binary mode hands the
        # raw bytes to clean_text, which performs the UTF-8 decode itself.
        # The with-block closes the handle even if read() raises.
        with open(file_path, 'rb') as read_file:
            read_text = read_file.read()
        text_list.append(clean_text(read_text))
    return text_list, files

In [5]:
# Load and clean both corpora. get_data() returns (cleaned_texts, file_names);
# the file-name list is bound to the scratch name `temp`, which the second
# call overwrites.
no_head_train_0, temp = get_data(phishing_data)
no_head_train_1, temp = get_data(legitimate_data)

In [6]:
# One label per document, in the same order as the concatenated texts:
# 0 for every phishing document, then 1 for every legitimate document.
no_head_labels_train = [0] * len(no_head_train_0) + [1] * len(no_head_train_1)
# Phishing texts first, legitimate texts second.
no_head_train = no_head_train_0 + no_head_train_1

In [7]:
## data statistics

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Fit a CountVectorizer with default settings on the whole corpus (both
# classes). X is the resulting document-term count matrix and is used for the
# corpus statistics printed in the next cell.
tf_vectorizer = CountVectorizer()
X = tf_vectorizer.fit_transform(no_head_train)

In [10]:
# Total token count: sum the sparse count matrix directly. Converting it with
# todense() first would allocate a full documents x vocabulary array just to
# add up its entries.
print ('#total words', X.sum())
# Vocabulary size: vocabulary_ maps each distinct term to its column index, so
# its length is the number of unique terms (no de-duplication needed).
print ('#unique words', len(tf_vectorizer.vocabulary_))

('#total words', 166433)
('#unique words', 23095)


In [11]:
# Shuffle texts and labels with one shared permutation so that every text
# keeps its own label.
# NOTE(review): no RNG seed is set, so the resulting order (and therefore the
# train/test split) differs between runs.
shuffled_indices = np.random.permutation(len(no_head_labels_train))
# Index the Python lists directly instead of round-tripping through
# np.array(): a NumPy unicode array pads every document to the length of the
# longest one and silently strips trailing NUL characters.
train_data = [no_head_train[i] for i in shuffled_indices]
train_label = [no_head_labels_train[i] for i in shuffled_indices]

In [12]:
# 80/20 split of the shuffled data: the first 80% of the examples become the
# training set, the remainder the test set.
data_cut = int(0.8 * len(train_data))
label_cut = int(0.8 * len(train_label))
temp_train_data, temp_test_data = train_data[:data_cut], train_data[data_cut:]
temp_train_label, temp_test_labels = train_label[:label_cut], train_label[label_cut:]

In [13]:
import io

# Files consumed by fastText: one example per line.
fast_train_file = '../data/fast_train.txt'
fast_test_file = '../data/fast_test.txt'


def _write_fasttext_file(file_path, texts, labels):
    """Write one ``__label__<label> <text>`` line per example as UTF-8.

    Args:
        file_path: Destination file; created or truncated.
        texts: Iterable of text (unicode) strings, one per example.
        labels: Iterable of labels, parallel to ``texts``.
    """
    # io.open encodes on write under both Python 2 and 3. Wrapping
    # text.encode('utf-8') in str() would emit a "b'...'" repr on Python 3.
    # The with-block closes the file even if a write fails.
    with io.open(file_path, 'w', encoding='utf-8') as out_file:
        for text, label in zip(texts, labels):
            out_file.write(u'__label__%s %s\n' % (label, text))


_write_fasttext_file(fast_train_file, temp_train_data, temp_train_label)
_write_fasttext_file(fast_test_file, temp_test_data, temp_test_labels)

In [14]:
import os, re, string
import numpy as np
import fasttext

In [16]:
# Train a supervised fastText classifier on the training file; no
# hyperparameters are passed, so the library defaults apply.
# NOTE(review): 'trained_model' is presumably the output name under which the
# model is saved -- confirm against the fasttext 0.8.3 documentation.
classifier = fasttext.supervised(fast_train_file, 'trained_model')

In [18]:
# Evaluate the trained classifier on the held-out test file.
result = classifier.test(fast_test_file)

In [19]:
# Display the evaluation result as a (precision, recall, nexamples) tuple.
# NOTE(review): nexamples is presumably the number of evaluated test lines.
result.precision, result.recall, result.nexamples

(0.9122137404580153, 0.9122137404580153, 262)

In [20]:
# Show the settings reported by the trained classifier, in this order:
# min_count, dim, epoch, word_ngrams, encoding, loss_name, maxn, t.
print (
    classifier.min_count,
    classifier.dim,
    classifier.epoch,
    classifier.word_ngrams,
    classifier.encoding,
    classifier.loss_name,
    classifier.maxn,
    classifier.t
)

(1, 100, 5, 1, 'utf-8', u'softmax', 0, 0.0001)
