In [7]:
import fasttext
import numpy as np
import json
from sklearn.model_selection import train_test_split
from numpy.linalg import norm
from sklearn import metrics

# from src.features.features import *
# from src.data.generate_data import *

In [8]:
def get_features(data):
    """Tokenise the ``'sentence'`` column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'sentence'`` column of strings.

    Returns
    -------
    pandas.Series
        Result of applying ``preprocessing`` to every sentence.
    """
    sentences = data['sentence']
    return sentences.apply(preprocessing)

def preprocessing(sentence):
    """Split ``sentence`` on single spaces and drop blank tokens.

    Only the space character separates tokens; other whitespace such as
    tabs stays inside the token it belongs to.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order.
    """
    blanks = ("", " ")
    return list(filter(lambda piece: piece not in blanks, sentence.split(" ")))

def cleaning(sentence):
    """Lower-case ``sentence``, strip punctuation and remove English stop words.

    Every character that is neither a word character nor whitespace is
    deleted, newlines become spaces, the text is split on single spaces and
    the tokens found in ``stopwords.words('english')`` are dropped.

    Parameters
    ----------
    sentence : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    # Imported here because the notebook's import cell never imports `re`,
    # so a fresh kernel would otherwise fail with NameError.
    import re

    # NOTE(review): `stopwords` is not imported in the visible cells either
    # (the `src.features.features` import is commented out); presumably it is
    # `nltk.corpus.stopwords` -- confirm and add it to the import cell.
    stop_words = set(stopwords.words('english'))
    without_punctuation = re.sub(r'[^\w\s]', '', sentence.lower())
    tokens = without_punctuation.replace("\n", " ").split(" ")
    return " ".join(token for token in tokens if token not in stop_words)

In [9]:
import os
import pandas as pd


def read_data(target):
    """Load and clean every ``.txt`` document of each spam class.

    Parameters
    ----------
    target : str
        ``'test'`` reads from ``test/testdata/``; any other value reads from
        ``data/raw/spam/Annotated/``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``'sentence'`` (the text after
        ``cleaning``) and ``'label'`` (the name of its class directory).

    Raises
    ------
    FileNotFoundError
        If the directory of one of the classes does not exist.
    """
    labels = ["insurance-etc","investment", "medical-sales", "phising", "sexual", "software-sales"]
    # The base directory depends only on `target`, so resolve it once.
    base_dir = "test/testdata/" if target == 'test' else "data/raw/spam/Annotated/"
    text = []
    classes = []
    for cla in labels:
        class_dir = os.path.join(base_dir, cla)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order of the resulting frame stable between runs and machines.
        for fil in sorted(os.listdir(class_dir)):
            if not fil.endswith(".txt"):
                continue
            file_path = os.path.join(class_dir, fil)
            with open(file_path, 'r', encoding='ISO-8859-1') as f:
                text.append(cleaning(f.read()))
            classes.append(cla)
    return pd.DataFrame({'sentence':text, 'label':classes})



def generate_fasttext_train_data(train_set):
    """Write ``train_set`` to ``data/out/spam-train.txt``.

    Each row produces one line of the form ``__label__<label> <sentence>``.
    The file is overwritten on every call.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Frame with string columns ``'label'`` and ``'sentence'``.
    """
    with open('data/out/spam-train.txt', 'w', encoding="utf-8") as out_file:
        for record in train_set.itertuples(index=False):
            line = "".join(("__label__", record.label, " ", record.sentence, "\n"))
            out_file.write(line)

In [10]:
def FastText(target, random_state=42):
    """Train fastText embeddings and label every document by seed-word similarity.

    Steps: read the corpus, write a 60 % split of it to the fastText training
    file, train an unsupervised model on that file, build one vector per
    label from the seed words in ``data/out/seedwords.json`` and one vector
    per document, then assign each document its most similar label.

    Predictions are produced for all documents returned by ``read_data``,
    including the ones that were written to the training file.

    Parameters
    ----------
    target : str
        Forwarded to ``read_data`` to choose the input directory.
    random_state : int or None, default 42
        Seed for the train/held-out split so the training file is the same
        on every run. Pass ``None`` for an unseeded split.
        NOTE(review): fastText training may still vary between runs -- confirm.

    Returns
    -------
    tuple
        ``(labels, pred)``: the true ``'label'`` column followed by the list
        of predicted labels, in that order.
    """
    data = read_data(target)

    # Only the training part is used here; the held-out part is discarded.
    train_set, _ = train_test_split(data, test_size=0.4, random_state=random_state)
    generate_fasttext_train_data(train_set)
    model = fasttext.train_unsupervised(input='data/out/spam-train.txt', epoch=600, lr=0.05, wordNgrams=4, loss='hs', dim=40)

    filename = 'data/out/seedwords.json'
    vec_per_label = get_vectors_per_label(filename, model)
    features = get_features(data)
    vec_per_doc = get_vector_per_doc(features, model)
    labels = data['label']
    pred = predict(vec_per_doc, vec_per_label, filename)
    return labels, pred

def get_vectors_per_label(filename, model):
    """Build one vector per label by averaging the vectors of its seed words.

    Parameters
    ----------
    filename : str
        Path to a JSON object that maps each label to a list of seed words.
    model
        Object exposing ``get_word_vector(word)``.

    Returns
    -------
    list of numpy.ndarray
        One averaged vector per label, in the key order of the JSON file.
    """
    # Context manager so the file handle is closed (the handle used to leak).
    with open(filename) as f:
        seeds = json.load(f)
    vector_per_label = []
    for seed_words in seeds.values():
        word_vectors = np.asarray([model.get_word_vector(w) for w in seed_words])
        vector_per_label.append(np.average(word_vectors, axis=0))
    return vector_per_label

def get_vector_per_doc(feature, model):
    """Represent each document by the average of its word vectors.

    Parameters
    ----------
    feature : iterable of list of str
        One token list per document.
    model
        Object exposing ``get_word_vector(word)`` and ``get_dimension()``.

    Returns
    -------
    list of numpy.ndarray
        One vector per document, in input order. A document without tokens
        is represented by a zero vector of the model's dimension.
    """
    vector_per_doc = []
    for tokens in feature:
        if len(tokens) == 0:
            # Averaging an empty array would yield a scalar NaN instead of a
            # vector, which breaks the similarity computation downstream.
            vector_per_doc.append(np.zeros(model.get_dimension(), dtype=np.float32))
            continue
        word_vectors = np.asarray([model.get_word_vector(w) for w in tokens])
        vector_per_doc.append(np.average(word_vectors, axis=0))
    return vector_per_doc

def predict(vector_per_doc, vector_per_label, filename):
    """Assign each document the label whose vector is most cosine-similar.

    Parameters
    ----------
    vector_per_doc : sequence of numpy.ndarray
        One vector per document.
    vector_per_label : sequence of numpy.ndarray
        One vector per label, ordered like the keys of the JSON file.
    filename : str
        Path to the seed-word JSON file; only its keys (the label names)
        are used here.

    Returns
    -------
    list of str
        Predicted label name for every document, in input order. Ties go to
        the first label.

    Raises
    ------
    ValueError
        If ``vector_per_label`` is empty while there are documents to label.
    """
    # Context manager so the file handle is closed (the handle used to leak).
    with open(filename) as f:
        labels = list(json.load(f).keys())

    # Label norms do not depend on the document, so compute them once.
    label_norms = [norm(label) for label in vector_per_label]

    predictions = []
    for doc in vector_per_doc:
        doc_norm = norm(doc)
        cosine = []
        for label, label_norm in zip(vector_per_label, label_norms):
            denominator = doc_norm * label_norm
            if denominator == 0 or not np.isfinite(denominator):
                # Cosine similarity is undefined for a zero or NaN vector;
                # rank it below every defined similarity instead of dividing.
                cosine.append(-np.inf)
            else:
                cosine.append(np.dot(doc, label) / denominator)
        predictions.append(labels[int(np.argmax(cosine))])
    return predictions


def get_accuracy(pred, label):
    """Compute micro- and macro-averaged F1 scores.

    Despite the name, the returned values are F1 scores, not accuracy.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    label : array-like
        True labels.

    Returns
    -------
    tuple
        ``(micro, macro)`` as returned by ``metrics.f1_score``.
    """
    averaging_modes = ("micro", "macro")
    return tuple(metrics.f1_score(label, pred, average=mode) for mode in averaging_modes)


In [11]:
target = 'test'
# FastText returns (true labels, predictions) in that order; unpacking them
# the other way round would hand get_accuracy the two sequences swapped.
label, pred = FastText(target)
micro, macro = get_accuracy(pred, label)
print('Micro F1 = ' + str(micro))
print('Macro F1 = ' + str(macro))

FileNotFoundError: [Errno 2] No such file or directory: 'test/testdata/insurance-etc'