In [2]:
import pickle
import pandas as pd
import numpy as np
import os
import nltk
# Fetch the NLTK resources for this notebook. `stopwords` is read by cleaning().
# NOTE(review): `wordnet` and `punkt` are downloaded but nothing in the visible
# cells appears to read them - confirm whether they are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
  
# NOTE(review): `lemmatizer` is created here but is not referenced by any of the
# visible cells (cleaning() does not lemmatize) - confirm whether it can be dropped.
lemmatizer = WordNetLemmatizer()
import re
_STOP_WORDS = None  # lazily-filled cache of the English stop-word set used by cleaning()


def cleaning(sentence):
    """Lower-case a document, strip punctuation and remove English stop words.

    Parameters
    ----------
    sentence : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Tokens come from splitting
        on a literal space after newlines are replaced by spaces, so empty tokens
        produced by consecutive spaces are kept and show up as repeated spaces.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Build the set once instead of rebuilding it for every document.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    # Drop everything that is neither a word character nor whitespace.
    tokens = re.sub(r'[^\w\s]', '', sentence.lower()).replace("\n", " ").split(" ")
    return " ".join(token for token in tokens if token not in _STOP_WORDS)
labels = ["insurance-etc","investment", "medical-sales", "phising", "sexual", "software-sales"]

# Root folder holding one sub-folder of .txt e-mails per label.
ANNOTATED_DIR = "/Users/lorraine/Desktop/Spam_Filter_old/Annotated"


def read_text_file(file_path):
    """Return the full content of `file_path`, decoded as ISO-8859-1."""
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        return f.read()


text = []
classes = []
for label in labels:
    path = os.path.join(ANNOTATED_DIR, label)
    # List the folder by path instead of os.chdir(): changing the working
    # directory would make every later relative write in the notebook
    # (e.g. "word2vec.model", "spam-train.txt") land inside the data folder.
    # Sorting makes the row order independent of the filesystem's listing order.
    for file in sorted(os.listdir(path)):
        if file.endswith(".txt"):
            file_path = os.path.join(path, file)
            text.append(cleaning(read_text_file(file_path)))
            classes.append(label)

# One row per document: cleaned text plus its annotated label.
data = pd.DataFrame({'sentence':text, 'label':classes})
data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorraine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lorraine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lorraine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,sentence,label
0,subject take advantage low interest rates no...,insurance-etc
1,subject lowest rate us history sound let 6000...,insurance-etc
2,subject application pre approved monday oct 1...,insurance-etc
3,subject final offer wed 29 sep 2004 19 16 0...,insurance-etc
4,subject application confirmation sun 14 nov 2...,insurance-etc
...,...,...
2235,subject popular software low low prices bury ...,software-sales
2236,subject clean ur computer 3 ey 85 chance comp...,software-sales
2237,subject software incredibly low prices 73 lo...,software-sales
2238,subject latest qem software available low pri...,software-sales


In [3]:
def tfidf(word):
    """Score every document in `data['sentence']` against one seed word.

    The term frequency is the number of (substring) occurrences of `word`
    divided by the document length in characters; the inverse document
    frequency is log(N / number of documents containing `word`).

    Parameters
    ----------
    word : str
        Seed word, matched literally (not as a regular expression).

    Returns
    -------
    list of float
        One score per row of `data`, in row order. All zeros when no document
        contains `word`; an empty document scores 0.
    """
    sentence = data['sentence']
    n_docs = len(sentence)
    # regex=False: seed words are plain text, so regex metacharacters must not be interpreted.
    doc_freq = int(sentence.str.contains(word, regex=False).sum())
    if doc_freq == 0:
        # Avoid log(N / 0): a word that never occurs contributes nothing.
        return [0.0] * n_docs
    idf = np.log(n_docs / doc_freq)
    result = []
    for doc in sentence:
        # Guard the empty document, which would otherwise divide by zero.
        tf = doc.count(word) / len(doc) if doc else 0.0
        result.append(tf * idf)
    return result


In [4]:
import json

# Load the label -> seed-word mapping; the context manager closes the file
# handle, which the bare open() call left open.
with open('/Users/lorraine/Desktop/Spam_Filter_old/seedwords.json') as f:
    seeds = json.load(f)

# For each label, sum the TF-IDF scores of all its seed words per document.
result = pd.DataFrame()
for key, value in seeds.items():
    df = pd.DataFrame({w: tfidf(w) for w in value})
    result[key] = df.sum(axis = 1)
result


Unnamed: 0,insurance-etc,investment,medical-sales,phising,sexual,software-sales
0,0.032326,0.011919,0.000000,0.000000,0.007120,0.000000
1,0.057216,0.003807,0.000000,0.000000,0.000000,0.000000
2,0.015567,0.000000,0.001333,0.000000,0.000000,0.000000
3,0.012817,0.006168,0.000000,0.003680,0.000000,0.022436
4,0.016901,0.000000,0.002614,0.000000,0.001915,0.000000
...,...,...,...,...,...,...
2235,0.000000,0.003564,0.000000,0.000000,0.000000,0.008678
2236,0.000000,0.000000,0.003047,0.000000,0.000000,0.000000
2237,0.000000,0.003709,0.002111,0.000000,0.000000,0.009031
2238,0.000000,0.002460,0.000000,0.010794,0.000000,0.077445


In [5]:
# Predicted label per document = the column of `result` holding the row maximum.
data["prediction"] = result.idxmax(axis=1)
data

Unnamed: 0,sentence,label,prediction
0,subject take advantage low interest rates no...,insurance-etc,insurance-etc
1,subject lowest rate us history sound let 6000...,insurance-etc,insurance-etc
2,subject application pre approved monday oct 1...,insurance-etc,insurance-etc
3,subject final offer wed 29 sep 2004 19 16 0...,insurance-etc,software-sales
4,subject application confirmation sun 14 nov 2...,insurance-etc,insurance-etc
...,...,...,...
2235,subject popular software low low prices bury ...,software-sales,software-sales
2236,subject clean ur computer 3 ey 85 chance comp...,software-sales,medical-sales
2237,subject software incredibly low prices 73 lo...,software-sales,software-sales
2238,subject latest qem software available low pri...,software-sales,software-sales


In [6]:
# micro and macro F1 using tf-idf
from sklearn import metrics

In [7]:
# Micro-averaged F1 of the TF-IDF seed-word predictions against the annotated labels.
metrics.f1_score(y_true=data["label"], y_pred=data["prediction"], average="micro")

0.6924107142857143

In [8]:
# Macro-averaged F1 of the TF-IDF seed-word predictions against the annotated labels.
metrics.f1_score(y_true=data["label"], y_pred=data["prediction"], average="macro")

0.6666147160818625

In [11]:

import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
def preprocessing(sentence):
    """Split a cleaned document into a list of non-empty tokens.

    Splits on any run of whitespace, so words separated by tabs, carriage
    returns or repeated spaces become separate tokens instead of being glued
    together, and no empty tokens are produced.

    Parameters
    ----------
    sentence : str
        Cleaned document text.

    Returns
    -------
    list of str
        The tokens in document order; an empty list for blank input.
    """
    return sentence.split()
# Tokenise every cleaned document, then build the initial Word2Vec model on those token lists.
features = data["sentence"].map(preprocessing)
w2v_params = dict(size=110, window=5, min_count=1, workers=8)
model = Word2Vec(sentences=features, **w2v_params)

In [12]:
# Round-trip the model through disk, then continue training it on the same corpus.
w2v_path = "word2vec.model"
model.save(w2v_path)
model = Word2Vec.load(w2v_path)
# NOTE(review): the file is written before this extra training and is not re-saved in this cell.
model.train(features, total_examples=len(data), epochs=800)

(266582630, 555054400)

In [13]:
def get_vectors_per_label(filename):
    """Build one embedding per label by averaging its seed-word vectors.

    Parameters
    ----------
    filename : str
        Path to a JSON file mapping each label to a list of seed words.

    Returns
    -------
    list of numpy.ndarray
        One averaged vector per label, in the order the labels appear in the
        JSON file.

    Notes
    -----
    Word vectors are looked up with `model.wv[w]` on the module-level `model`;
    whatever that lookup raises for an unknown seed word propagates to the caller.
    """
    # The context manager closes the file, which the bare open() call never did.
    with open(filename) as f:
        seeds = json.load(f)
    vector_per_label = []
    for value in seeds.values():
        arr = np.asarray([model.wv[w] for w in value])
        vector_per_label.append(np.average(arr, axis=0))
    return vector_per_label
# Averaged seed-word embedding for each label, read from the seed-word JSON file.
vector_per_label = get_vectors_per_label(
    filename='/Users/lorraine/Desktop/Spam_Filter_old/seedwords.json'
)

In [14]:
def get_vector_per_doc(feature):
    """Build one embedding per document by averaging its token vectors.

    Parameters
    ----------
    feature : iterable of list of str
        One token list per document.

    Returns
    -------
    list of numpy.ndarray
        One vector per document, in input order. A document with no tokens
        gets a zero vector of the model's dimensionality.

    Notes
    -----
    Word vectors are looked up with `model.wv[w]` on the module-level `model`.
    """
    vector_per_doc = []
    for feat in feature:
        if len(feat) == 0:
            # Averaging zero rows would give a scalar NaN rather than a vector,
            # which breaks the cosine comparison downstream.
            vector_per_doc.append(np.zeros(model.wv.vector_size, dtype=np.float32))
            continue
        arr = np.asarray([model.wv[w] for w in feat])
        vector_per_doc.append(np.average(arr, axis=0))
    return vector_per_doc
# Averaged token embedding for every tokenised document.
vector_per_doc = get_vector_per_doc(feature=features)

In [15]:
# Sanity check: number of document vectors that were built.
len(vector_per_doc)

2240

In [16]:
# Reload the label -> seed-word mapping; the context manager closes the file
# handle, which the bare open() call left open.
with open('/Users/lorraine/Desktop/Spam_Filter_old/seedwords.json') as f:
    seeds = json.load(f)
from numpy.linalg import norm
def predict_word2vec(vector_per_doc, vector_per_label):
    """Assign each document the label whose embedding is most cosine-similar.

    Parameters
    ----------
    vector_per_doc : iterable of numpy.ndarray
        One embedding per document.
    vector_per_label : sequence of numpy.ndarray
        One embedding per label, in the same order as the keys of the
        module-level `seeds` mapping.

    Returns
    -------
    list of str
        The predicted label name for every document, in input order. Ties go
        to the earliest label.
    """
    labels = list(seeds.keys())
    predictions = []
    for doc in vector_per_doc:
        cosine = []
        for label in vector_per_label:
            denom = norm(doc) * norm(label)
            # A zero-norm (or NaN) vector has no defined cosine. Dividing would
            # give NaN, and NaN makes max()/index() pick an arbitrary label, so
            # rank such pairs below every real similarity instead.
            cosine.append(np.dot(doc, label) / denom if denom > 0 else float("-inf"))
        predictions.append(labels[int(np.argmax(cosine))])
    return predictions
# Nearest-label prediction for every document embedding.
prediction_word2vec = predict_word2vec(
    vector_per_doc=vector_per_doc,
    vector_per_label=vector_per_label,
)

In [17]:
# Store the Word2Vec predictions next to the labels, one per row in row order.
data["prediction_word2vec"] = pd.Series(prediction_word2vec, index=data.index)
data

Unnamed: 0,sentence,label,prediction,prediction_word2vec
0,subject take advantage low interest rates no...,insurance-etc,insurance-etc,insurance-etc
1,subject lowest rate us history sound let 6000...,insurance-etc,insurance-etc,insurance-etc
2,subject application pre approved monday oct 1...,insurance-etc,insurance-etc,insurance-etc
3,subject final offer wed 29 sep 2004 19 16 0...,insurance-etc,software-sales,insurance-etc
4,subject application confirmation sun 14 nov 2...,insurance-etc,insurance-etc,insurance-etc
...,...,...,...,...
2235,subject popular software low low prices bury ...,software-sales,software-sales,medical-sales
2236,subject clean ur computer 3 ey 85 chance comp...,software-sales,medical-sales,software-sales
2237,subject software incredibly low prices 73 lo...,software-sales,software-sales,medical-sales
2238,subject latest qem software available low pri...,software-sales,software-sales,software-sales


In [18]:
# Micro-averaged F1 of the Word2Vec predictions (the macro average is in the next cell).
metrics.f1_score(y_true=data["label"], y_pred=data["prediction_word2vec"], average="micro")

0.7013392857142857

In [19]:
# Macro-averaged F1 of the Word2Vec predictions against the annotated labels.
metrics.f1_score(y_true=data["label"], y_pred=data["prediction_word2vec"], average="macro")

0.7160730458378426

In [20]:
# Plain accuracy: fraction of rows whose Word2Vec prediction equals the label.
is_correct = data["prediction_word2vec"].eq(data["label"])
sum(is_correct)/len(is_correct)

0.7013392857142857

In [None]:
import fasttext
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows. The fixed seed makes the split, and therefore the
# training file written below, reproducible across runs.
train_set, test_set = train_test_split(data, test_size=0.4, random_state=42)
# One line per training document in the "__label__<label> <text>" layout.
with open('spam-train.txt', 'w', encoding="utf-8") as f:
    for row in train_set.itertuples(index=False):
        f.write("__label__" + row.label + " " + row.sentence + "\n")
# NOTE(review): this rebinds `model`, replacing the Word2Vec model the earlier
# cells refer to - re-running those cells after this one will use the fastText model.
model = fasttext.train_unsupervised(input='spam-train.txt', epoch=600, lr=0.05, wordNgrams=4, loss='hs', dim=40)
def get_vectors_per_label_fasttext(filename):
    """Build one embedding per label by averaging its seed-word vectors.

    Parameters
    ----------
    filename : str
        Path to a JSON file mapping each label to a list of seed words.

    Returns
    -------
    list of numpy.ndarray
        One averaged vector per label, in the order the labels appear in the
        JSON file.

    Notes
    -----
    Word vectors come from `model.get_word_vector(w)` on the module-level `model`.
    """
    # The context manager closes the file, which the bare open() call never did.
    with open(filename) as f:
        seeds = json.load(f)
    vector_per_label = []
    for value in seeds.values():
        arr = np.asarray([model.get_word_vector(w) for w in value])
        vector_per_label.append(np.average(arr, axis=0))
    return vector_per_label
# Averaged fastText seed-word embedding per label (rebinds `vector_per_label`).
vector_per_label = get_vectors_per_label_fasttext(
    filename='/Users/lorraine/Desktop/Spam_Filter_old/seedwords.json'
)
def get_vector_per_doc_fasttext(feature):
    """Build one embedding per document by averaging its token vectors.

    Parameters
    ----------
    feature : iterable of list of str
        One token list per document.

    Returns
    -------
    list of numpy.ndarray
        One vector per document, in input order. A document with no tokens
        gets a zero vector of the model's dimensionality.

    Notes
    -----
    Word vectors come from `model.get_word_vector(w)` on the module-level `model`.
    """
    vector_per_doc = []
    for feat in feature:
        if len(feat) == 0:
            # Averaging zero rows would give a scalar NaN rather than a vector,
            # which breaks the cosine comparison downstream.
            vector_per_doc.append(np.zeros(model.get_dimension(), dtype=np.float32))
            continue
        arr = np.asarray([model.get_word_vector(w) for w in feat])
        vector_per_doc.append(np.average(arr, axis=0))
    return vector_per_doc
# Averaged fastText token embedding per document (rebinds `vector_per_doc`).
vector_per_doc = get_vector_per_doc_fasttext(feature=features)
def predict_fasttext(vector_per_doc, vector_per_label):
    """Assign each document the label whose embedding is most cosine-similar.

    Parameters
    ----------
    vector_per_doc : iterable of numpy.ndarray
        One embedding per document.
    vector_per_label : sequence of numpy.ndarray
        One embedding per label, in the same order as the keys of the
        module-level `seeds` mapping.

    Returns
    -------
    list of str
        The predicted label name for every document, in input order. Ties go
        to the earliest label.
    """
    labels = list(seeds.keys())
    predictions = []
    for doc in vector_per_doc:
        cosine = []
        for label in vector_per_label:
            denom = norm(doc) * norm(label)
            # A zero-norm (or NaN) vector has no defined cosine. Dividing would
            # give NaN, and NaN makes max()/index() pick an arbitrary label, so
            # rank such pairs below every real similarity instead.
            cosine.append(np.dot(doc, label) / denom if denom > 0 else float("-inf"))
        predictions.append(labels[int(np.argmax(cosine))])
    return predictions
# Nearest-label prediction per document from the fastText embeddings, stored as a new column.
prediction_fasttext = predict_fasttext(
    vector_per_doc=vector_per_doc,
    vector_per_label=vector_per_label,
)
data["prediction_fasttext"] = pd.Series(prediction_fasttext, index=data.index)

Read 0M words
Number of words:  11296
Number of labels: 9
Progress:  53.8% words/sec/thread:  101025 lr:  0.023100 avg.loss:  2.772606 ETA:   0h 3m41s5.7% words/sec/thread:  115967 lr:  0.047141 avg.loss:  6.393805 ETA:   0h 6m33s  8.3% words/sec/thread:  114374 lr:  0.045845 avg.loss:  6.525591 ETA:   0h 6m27s 29.5% words/sec/thread:  102861 lr:  0.035237 avg.loss:  4.737868 ETA:   0h 5m31s 103240 lr:  0.031819 avg.loss:  4.075275 ETA:   0h 4m58s 43.5% words/sec/thread:  102914 lr:  0.028235 avg.loss:  3.420052 ETA:   0h 4m25s

In [52]:
# Print the `labels` attribute of whatever `model` is currently bound to.
# NOTE(review): presumably the fastText model from the training cell - confirm,
# since `model` is rebound in several cells and this cell's execution count is out of sequence.
print(model.labels)



In [None]:
# Micro-averaged F1 of the fastText predictions against the annotated labels.
metrics.f1_score(y_true=data["label"], y_pred=data["prediction_fasttext"], average="micro")

In [None]:
# Macro-averaged F1 of the fastText predictions against the annotated labels.
metrics.f1_score(y_true=data["label"], y_pred=data["prediction_fasttext"], average="macro")