In [1]:
import pickle
import pandas as pd
import numpy as np
import os
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()
import re
def cleaning(sentence):
    stop_words = set(stopwords.words('english'))
    tokens = re.sub(r'[^\w\s]', '', sentence.lower()).replace("\n", " ").split(" ")
    cleaned = [token for token in tokens if token not in stop_words]
    return " ".join(cleaned)
labels = ["insurance-etc","investment", "medical-sales", "phising", "sexual", "software-sales"]
text = []
classes = []
for label in labels:
    path = "/Users/lorraine/Desktop/Spam_Filter/Annotated/"+label
  
    os.chdir(path)
  
    def read_text_file(file_path):
        with open(file_path, 'r', encoding='ISO-8859-1') as f:
            return f.read()
      
    for file in os.listdir():
    
        if file.endswith(".txt"):
            file_path = f"{path}/{file}"
            text.append(cleaning(str(read_text_file(file_path))))
            classes.append(label)

data = pd.DataFrame({'sentence':text, 'label':classes})
data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lorraine/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lorraine/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lorraine/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,sentence,label
0,subject take advantage low interest rates no...,insurance-etc
1,subject lowest rate us history sound let 6000...,insurance-etc
2,subject application pre approved monday oct 1...,insurance-etc
3,subject final offer wed 29 sep 2004 19 16 0...,insurance-etc
4,subject application confirmation sun 14 nov 2...,insurance-etc
...,...,...
2235,subject popular software low low prices bury ...,software-sales
2236,subject clean ur computer 3 ey 85 chance comp...,software-sales
2237,subject software incredibly low prices 73 lo...,software-sales
2238,subject latest qem software available low pri...,software-sales


In [2]:
def tfidf(word):
    sentence = data['sentence']
    idf = np.log(len(sentence)/sentence.str.contains(word).sum())
    result = []
    for i in range(len(sentence)):
        tf = sentence.iloc[i].count(word)/(len(sentence.iloc[i]))
        result.append(tf*idf)
    return result


In [3]:
import json
f = open('/Users/lorraine/Desktop/Spam_Filter/seedwords.json')
seeds = json.load(f)
result = pd.DataFrame()
for key, value in seeds.items():
    df = pd.DataFrame()
    for w in value:
        df[w] = tfidf(w)
    result[key] = df.sum(axis = 1)
result


Unnamed: 0,insurance-etc,investment,medical-sales,phising,sexual,software-sales
0,0.032326,0.011919,0.000000,0.000000,0.007120,0.000000
1,0.057216,0.003807,0.000000,0.000000,0.000000,0.000000
2,0.015567,0.000000,0.001333,0.000000,0.000000,0.000000
3,0.012817,0.006168,0.000000,0.003680,0.000000,0.022436
4,0.016901,0.000000,0.002614,0.000000,0.001915,0.000000
...,...,...,...,...,...,...
2235,0.000000,0.003564,0.000000,0.000000,0.000000,0.008678
2236,0.000000,0.000000,0.003047,0.000000,0.000000,0.000000
2237,0.000000,0.003709,0.002111,0.000000,0.000000,0.009031
2238,0.000000,0.002460,0.000000,0.010794,0.000000,0.077445


In [4]:
data["prediction"] = result.idxmax(1)
data

Unnamed: 0,sentence,label,prediction
0,subject take advantage low interest rates no...,insurance-etc,insurance-etc
1,subject lowest rate us history sound let 6000...,insurance-etc,insurance-etc
2,subject application pre approved monday oct 1...,insurance-etc,insurance-etc
3,subject final offer wed 29 sep 2004 19 16 0...,insurance-etc,software-sales
4,subject application confirmation sun 14 nov 2...,insurance-etc,insurance-etc
...,...,...,...
2235,subject popular software low low prices bury ...,software-sales,software-sales
2236,subject clean ur computer 3 ey 85 chance comp...,software-sales,medical-sales
2237,subject software incredibly low prices 73 lo...,software-sales,software-sales
2238,subject latest qem software available low pri...,software-sales,software-sales


In [5]:
# micro and macro F1 using tf-idf
from sklearn import metrics

In [6]:
metrics.f1_score(data["label"], data["prediction"], average="micro")

0.6924107142857143

In [7]:
metrics.f1_score(data["label"], data["prediction"], average="macro")

0.6666147160818625

In [8]:

import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
def preprocessing(sentence):
    tokens = sentence.split(" ")
    return [token for token in tokens if token!="" and token != " "]
features = data["sentence"].apply(preprocessing)
model = Word2Vec(sentences=features, size=50, window=5, min_count=1, workers=4)
#features

In [9]:
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")
model.train(features, total_examples=len(data), epochs=100)
#vector = model.wv["atheism"]
#vector

(33331433, 49235200)

In [10]:
def get_vectors_per_label(filename):
    f = open(filename)
    seeds = json.load(f)
    vector_per_label = []
    for key, value in seeds.items():
        lst = []
        for w in value:
            lst.append(model.wv[w])
        arr = np.asarray(lst)
        total = np.average(arr, axis=0)
        vector_per_label.append(total)
    return vector_per_label
vector_per_label = get_vectors_per_label('/Users/lorraine/Desktop/Spam_Filter/seedwords.json')

In [11]:
def get_vector_per_doc(feature):
    vector_per_doc = []
    for feat in feature:
        lst = []
        for w in feat:
            lst.append(model.wv[w])
        arr = np.asarray(lst)
        total = np.average(arr, axis=0)
        vector_per_doc.append(total)
    return vector_per_doc
vector_per_doc = get_vector_per_doc(features)

In [12]:
len(vector_per_doc)

2240

In [13]:
f = open('/Users/lorraine/Desktop/Spam_Filter/seedwords.json')
seeds = json.load(f)
from numpy.linalg import norm
def predict_word2vec(vector_per_doc, vector_per_label):
    predictions = []
    labels = list(seeds.keys())
    for doc in vector_per_doc:
        cosine = []
        for label in vector_per_label:
            cosine.append(np.dot(doc,label)/(norm(doc)*norm(label)))
        max_value = max(cosine)
        max_index = cosine.index(max_value)
        predictions.append(labels[max_index])
    return predictions   
prediction_word2vec = predict_word2vec(vector_per_doc, vector_per_label)

In [14]:
data["prediction_word2vec"] = prediction_word2vec
data

Unnamed: 0,sentence,label,prediction,prediction_word2vec
0,subject take advantage low interest rates no...,insurance-etc,insurance-etc,insurance-etc
1,subject lowest rate us history sound let 6000...,insurance-etc,insurance-etc,insurance-etc
2,subject application pre approved monday oct 1...,insurance-etc,insurance-etc,insurance-etc
3,subject final offer wed 29 sep 2004 19 16 0...,insurance-etc,software-sales,insurance-etc
4,subject application confirmation sun 14 nov 2...,insurance-etc,insurance-etc,insurance-etc
...,...,...,...,...
2235,subject popular software low low prices bury ...,software-sales,software-sales,medical-sales
2236,subject clean ur computer 3 ey 85 chance comp...,software-sales,medical-sales,software-sales
2237,subject software incredibly low prices 73 lo...,software-sales,software-sales,medical-sales
2238,subject latest qem software available low pri...,software-sales,software-sales,software-sales


In [15]:
# micro and macro F1 using word2vec
metrics.f1_score(data["label"], data["prediction_word2vec"], average="micro")

0.6017857142857143

In [16]:
metrics.f1_score(data["label"], data["prediction_word2vec"], average="macro")

0.6208801067340314

In [17]:
sum(data["prediction_word2vec"] == data["label"])/len(data)

0.5962483251451541

In [45]:
import fasttext
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.4)
with open('spam-train.txt', 'w', encoding="utf-8") as f:
    for idx, row in train_set.iterrows():
        f.write("__label__" + row.label + " " + row.sentence + "\n")
model = fasttext.train_supervised(input='spam-train.txt', epoch=500, lr=0.5, wordNgrams=3, loss='hs', dim=50)
fastText_df = pd.DataFrame(test_set.sentence)
fastText_df['label'] = test_set.label
preds = []
for sentence in test_set.sentence:
    pred = model.predict(sentence)
    #try:
    label = pred[0][0].replace("__label__", "")
    preds.append(label)
    #except:
    #    # cant predict? Predict most common label 
    #    preds.append('medical-sales')
test_set['pred'] = preds
test_set

Read 0M words
Number of words:  29150
Number of labels: 6
Progress: 100.0% words/sec/thread: 1614532 lr:  0.000000 avg.loss:  0.027763 ETA:   0h 0m 0s100.0% words/sec/thread: 1614554 lr: -0.000010 avg.loss:  0.027763 ETA:   0h 0m 0s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['pred'] = preds


Unnamed: 0,sentence,label,prediction,prediction_word2vec,pred
427,subject reviving sex lives millions online dru...,medical-sales,sexual,medical-sales,medical-sales
476,subject peak performance longz capsules featur...,medical-sales,sexual,sexual,medical-sales
1275,subject 4 talk pills spu r th ewe saf twa ...,medical-sales,medical-sales,software-sales,medical-sales
2210,subject fwd software download update os look...,software-sales,software-sales,software-sales,software-sales
1099,subject iagra 113 cialis 113 llp 1 tor ...,medical-sales,medical-sales,medical-sales,medical-sales
...,...,...,...,...,...
1984,subject summer macromedia mlcros 0 ft symann...,software-sales,software-sales,software-sales,software-sales
583,subject refill notification ref wx 339097195...,medical-sales,software-sales,sexual,medical-sales
232,subject investors microcap profiie sandoval ...,investment,investment,investment,investment
1284,subject lose weight new weightloss loses 19 ...,medical-sales,sexual,insurance-etc,medical-sales


In [46]:
print(model.labels)

['__label__medical-sales', '__label__phising', '__label__software-sales', '__label__investment', '__label__insurance-etc', '__label__sexual']


In [47]:
metrics.f1_score(test_set["label"], test_set["pred"], average="micro")

0.9073660714285715

In [48]:
metrics.f1_score(test_set["label"], test_set["pred"], average="macro")

0.8606963390783315