In [2]:
import pickle
import pandas as pd
import numpy as np
import os
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
  
lemmatizer = WordNetLemmatizer()
import re
def cleaning(sentence):
    stop_words = set(stopwords.words('english'))
    tokens = re.sub(r'[^\w\s]', '', sentence.lower()).replace("\n", " ").split(" ")
    cleaned = [token for token in tokens if token not in stop_words]
    return " ".join(cleaned)
labels = ["insurance-etc","investment", "medical-sales", "phising", "sexual", "software-sales"]
text = []
classes = []
for label in labels:
    path = os.getcwd()+"\\Annotated\\"+label
    print(path)
    os.chdir(path)
    
    def read_text_file(file_path):
        with open(file_path, 'r', encoding='ISO-8859-1') as f:
            return f.read()
      
    for file in os.listdir():
    
        if file.endswith(".txt"):
            file_path = f"{path}/{file}"
            text.append(cleaning(str(read_text_file(file_path))))
            classes.append(label)
    os.chdir("../..")
data = pd.DataFrame({'sentence':text, 'label':classes})
data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Garrett\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Garrett\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Garrett\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


E:\School\DSC180\Spam-Filter\Spam_Filter\Annotated\insurance-etc
E:\School\DSC180\Spam-Filter\Spam_Filter\Annotated\investment
E:\School\DSC180\Spam-Filter\Spam_Filter\Annotated\medical-sales
E:\School\DSC180\Spam-Filter\Spam_Filter\Annotated\phising
E:\School\DSC180\Spam-Filter\Spam_Filter\Annotated\sexual
E:\School\DSC180\Spam-Filter\Spam_Filter\Annotated\software-sales


Unnamed: 0,sentence,label
0,subject h ello dea 54 r home owner beetcn n...,insurance-etc
1,subject make 171 hello sent email ago quali...,insurance-etc
2,subject 96 refinance 2 9 hi would reflnance...,insurance-etc
3,subject 82 refinance today low 2 9 hey wou...,insurance-etc
4,subject 6 refinance today premium low rate he...,insurance-etc
...,...,...
2235,subject 2 question soft mult gua msof ilan...,software-sales
2236,subject 5 question progs mult gua msof ila...,software-sales
2237,subject 7 talks soft mult gua msof ilan ge...,software-sales
2238,subject dear sir interested hi need softwar...,software-sales


In [None]:
def tfidf(word):
    sentence = data['sentence']
    idf = np.log(len(sentence)/sentence.str.contains(word).sum())
    result = []
    for i in range(len(sentence)):
        tf = sentence.iloc[i].count(word)/(len(sentence.iloc[i]))
        result.append(tf*idf)
    return result


In [None]:
import json
f = open('seedwords.json')
seeds = json.load(f)
result = pd.DataFrame()
for key, value in seeds.items():
    df = pd.DataFrame()
    for w in value:
        df[w] = tfidf(w)
    result[key] = df.sum(axis = 1)
result


In [None]:
data["prediction"] = result.idxmax(1)
data

In [None]:
# micro and macro F1 using tf-idf
from sklearn import metrics

In [None]:
metrics.f1_score(data["label"], data["prediction"], average="micro")

In [None]:
metrics.f1_score(data["label"], data["prediction"], average="macro")

In [None]:
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
def preprocessing(sentence):
    tokens = sentence.split(" ")
    return [token for token in tokens if token!="" and token != " "]
features = data["sentence"].apply(preprocessing)
model = Word2Vec(sentences=features, vector_size=100, window=5, min_count=1, workers=4)
#features

In [None]:
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")
model.train(features, total_examples=len(data), epochs=20)
#vector = model.wv["atheism"]
#vector

In [None]:
def get_vectors_per_label(filename):
    f = open(filename)
    seeds = json.load(f)
    vector_per_label = []
    for key, value in seeds.items():
        lst = []
        for w in value:
            lst.append(model.wv[w])
        arr = np.asarray(lst)
        total = np.average(arr, axis=0)
        vector_per_label.append(total)
    return vector_per_label
vector_per_label = get_vectors_per_label('seedwords.json')

In [None]:
def get_vector_per_doc(feature):
    vector_per_doc = []
    for feat in feature:
        lst = []
        for w in feat:
            lst.append(model.wv[w])
        arr = np.asarray(lst)
        total = np.average(arr, axis=0)
        vector_per_doc.append(total)
    return vector_per_doc
vector_per_doc = get_vector_per_doc(features)

In [None]:
f = open('seedwords.json')
seeds = json.load(f)
from numpy.linalg import norm
def predict_word2vec(vector_per_doc, vector_per_label):
    predictions = []
    labels = list(seeds.keys())
    for doc in vector_per_doc:
        cosine = []
        for label in vector_per_label:
            cosine.append(np.dot(doc,label)/(norm(doc)*norm(label)))
        max_value = max(cosine)
        max_index = cosine.index(max_value)
        predictions.append(labels[max_index])
    return predictions   
prediction_word2vec = predict_word2vec(vector_per_doc, vector_per_label)

In [None]:
data["prediction_word2vec"] = prediction_word2vec
data

In [None]:
# micro and macro F1 using word2vec
metrics.f1_score(data["label"], data["prediction_word2vec"], average="micro")

In [None]:
metrics.f1_score(data["label"], data["prediction_word2vec"], average="macro")

In [None]:
sum(data["prediction_word2vec"] == data["label"])/len(data)

## FastText

In [62]:
import fasttext
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.4)
with open('spam-train.txt', 'w', encoding="utf-8") as f:
    for idx, row in train_set.iterrows():
        f.write("__label__" + row.label + " " + row.sentence + "\n")
model = fasttext.train_supervised(input='spam-train.txt', epoch=25, lr =0.5, wordNgrams=4, loss='hs', dim=50)
fastText_df = pd.DataFrame(test_set.sentence)
fastText_df['label'] = test_set.label
preds = []
for sentence in test_set.sentence:
    pred = model.predict(sentence)
    label = pred[0][0].replace("__label__", "")
    preds.append(label)
test_set['pred'] = preds
test_set

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['pred'] = preds


Unnamed: 0,sentence,label,pred
1413,subject c _ _ _ l _ _ soft tabs hi try revolu...,medical-sales,medical-sales
680,subject fw canyon 71 vicodin encamps nan...,medical-sales,medical-sales
1484,subject congratulations goldland lotto interna...,phising,phising
736,subject get pres cription filled right jq au...,medical-sales,medical-sales
1399,subject 2 discussion health ge ri ia ne cc ...,medical-sales,medical-sales
...,...,...,...
788,subject purchase v icodin online easily today...,medical-sales,medical-sales
1068,subject percocet fed medlcat 10 n v 1 c 0 ...,medical-sales,medical-sales
748,subject top quality medication tribune soil ac...,medical-sales,medical-sales
676,subject tynenol 3 codeine legal cheap rx med...,medical-sales,medical-sales


In [63]:
metrics.f1_score(test_set["label"], test_set["pred"], average="micro")

0.8772321428571429

In [64]:
metrics.f1_score(test_set["label"], test_set["pred"], average="macro")

0.7917269702653833

In [66]:
wrong = test_set[test_set['label'] != test_set['pred']]
wrong

Unnamed: 0,sentence,label,pred
1872,subject unleash animal use ring experience b...,sexual,medical-sales
1963,subject scan system adware wrongful 613032...,software-sales,medical-sales
2105,subject hp toronto promo products dec 04 plea...,software-sales,insurance-etc
1814,subject hello dear member welcome happy rep...,sexual,medical-sales
1816,subject anal 3 x please let image load clos...,sexual,medical-sales
...,...,...,...
352,subject reliable source prescription drugs wit...,investment,medical-sales
1856,subject utf 8 q young ang randy youn ...,sexual,medical-sales
89,subject right yet casualty anton prosodic ga...,insurance-etc,medical-sales
1895,subject largest collection porn mo ies ever ...,sexual,medical-sales


## Test Unannotated files

In [34]:
text_unannotated = []
path = os.getcwd() + "/enron6/spam"
os.chdir(path)
for file in os.listdir():
    if file.endswith(".txt"):
        file_path = f"{path}/{file}"
        text_unannotated.append(cleaning(str(read_text_file(file_path))))
os.chdir("../..")
data_unannotated = pd.DataFrame({'sentence':text})

In [35]:
preds = []
for sentence in data_unannotated.sentence:
    pred = model.predict(sentence)
    label = pred[0][0].replace("__label__", "")
    preds.append(label)
data_unannotated['pred'] = preds
data_unannotated

Unnamed: 0,sentence,pred
0,subject h ello dea 54 r home owner beetcn n...,medical-sales
1,subject make 171 hello sent email ago quali...,insurance-etc
2,subject 96 refinance 2 9 hi would reflnance...,insurance-etc
3,subject 82 refinance today low 2 9 hey wou...,insurance-etc
4,subject 6 refinance today premium low rate he...,insurance-etc
...,...,...
2235,subject 2 question soft mult gua msof ilan...,software-sales
2236,subject 5 question progs mult gua msof ila...,software-sales
2237,subject 7 talks soft mult gua msof ilan ge...,software-sales
2238,subject dear sir interested hi need softwar...,software-sales


In [36]:
data_unannotated.pred.value_counts()

medical-sales     1156
phising            352
software-sales     303
investment         173
insurance-etc      164
sexual              92
Name: pred, dtype: int64

## Exploring some of the predictions

In [37]:
print(data_unannotated.iloc[42].sentence)
print(data_unannotated.iloc[42].pred)

subject request sadie shansel cater ivortex reticulum incommensurate antipode  assassinate enzyme fabricate cofactor shanty  vise sabdomen polar taken  elapse anthem functionary  curate atlantis pfedora contention avaricious forfeiture tclairvoyant cane jmagnolia lowland kajar  curlew  hawaiian quillwort  oval respected member  winner summer ra e  give way program  please inform since winner offer one time opportunity lower interest r te 3  99 percent  get prize coupon id  2518 thank  valerie mcnally promotion department antisemitic lissajous  converse omission decay fcircumscribe verity  superstitious wallis drier crosswalk  aeolian  ostracism salesgirl dependent crass  defrock  centipede onto gfloodlit dolomitic jdauphine grizzle upstand  agnomen vienna  backspace vanish irresolvable qbarricade redshank  danbury agriculture snifter turpuvdqt eternal  countenance broody arnold scarburetor priory  tmoroccan zgkvuow dog identify transmissible cathedral colloquia sunburnt fcabot franchis

In [39]:
print(data_unannotated.iloc[1111].sentence)
print(data_unannotated.iloc[1111].pred)

subject   utf  8  q  might become     utf  8  q  advantageo     utf  8  q  us chap gi     utf  8  q  rl    products used heal erectile dysfunction  well far  famed inability copulate  someone cannot procure  retain  inflexible vertical member suitable intimate action  drugs  appropriate used execution enhancer vantage tablets function two days fabricate physical structure click buy 
medical-sales


In [40]:
print(data_unannotated.iloc[432].sentence)
print(data_unannotated.iloc[432].pred)

subject feel great time day summers 80  savings xanax  valium  phentermine  viagra email removal  go  alight impulse ingest impractical corrector beast postpone watershed audition midland stub entendre heinz fragile erickson barrel ymca clairenora provincial ridgepole absorb decelerate santa vertex decathlon posteriori dixon doherty wondrous cycad hawley airstrip especial cornucopia jugginghildebrand otherwise checksumming countersunk picasso laguerre mathematician cambrian invincible ballot brownian lactose nubia statuary cardiac sincere blanc sulfatebaleful eigenvector playwriting malarial nevins northward trickery blowfish impatient arrival cryptanalytic spearmint narbonne friar bathroom waistline cosine dioxidemcgrath wept altair elysee snook gardenia eclipse amorphous mendelevium bestubble honk allay escrow inertance peafowl
medical-sales


In [41]:
sexual = data_unannotated[data_unannotated['pred'] == 'sexual']
print(sexual.iloc[12].sentence)
print(sexual.iloc[12].pred)

subject lupe come watch   lube neeeed  let glide priiiiide  make feeeeed  come riiiiiiide       address solute inconvertible earnest message earthmoving frontage alpenstock experiential ordinance graves bufflehead darpa mobcap rodeo puzzle crewman penitentiary brahms inbreed tadpole delouse reciprocal aristocratic militia pet wakeup pence gully holland capo  blacken cutoff middlebury ramify bellboy austere axon ferguson adolph affable scriptural dead neve closeup passage tombstone refer sud amazon circuit clapboard chipboard apperception ama pheasant acyclic canis marion nelsen wave debauch thoreau fabricate bronco bebop burgundian aliquot sari automaton  
sexual


In [42]:
software = data_unannotated[data_unannotated['pred'] == 'software-sales']
print(software.iloc[9].sentence)
print(software.iloc[9].pred)

subject doctor contact  medicaldirectory  physiciansguid 7  000 hospitals  25  000 nursing homes 400  000 doctors united states health care database united states healthcare database comprehensive new product offered exclusively limited  time basis  complete database includes hospitals  hmo   group medical practices  nursing homes  physicians country  rapidly  changing industry  current healthcare information invaluable resource businesses organizations  united states healthcare database includes comprehensive information 7  000 hospitals  25  000 nursing homes 400  000 doctors mention hmos group medical practices  extensive reliable mailing list database key decision makers health care market  imagine increase marketing sales effectiveness made possible targeting key contacts name  reaching right decision maker critical success direct marketing campaigns  product  record indexed features name  address  phone fax  database available excel format cd rom  designed mailing lists merges  d

In [43]:
investment = data_unannotated[data_unannotated['pred'] == 'investment']
print(investment.iloc[29].sentence)
print(investment.iloc[29].pred)

subject stock profiier belanger  continue  important  expected  u c p  wi   large pr campaign next 10 days positive news expected  watch  jump board whiie stock beiow  1  huge promo weekend expected expect soar monday  tuesday next week  jump today  voice internet protoco   voip  service goes live symbol   u c p  current price   0  28 10 days target price   1  25 3 months target price   1  66  u c p  currentiy trading  0  28 headed  1  25 company reieased ground breaking news voip division   aithough would argue voip stil  maturing  corporate users extremeiy interested impiementing technoiogy  creating exponential growth  within last four years  voip minutes increased  ess 0  5 2 percent outbound international calis  according research telegeography  additionally  predictions size market vary    ied business inte   igence projecting voip market grow  3  7 bi   ion 2000  12  3 billion 2006  synergy research projecting voip equipment market grow  13  3 biliion 2005  uauthorize corporatio

In [16]:
phising = data_unannotated[data_unannotated['pred'] == 'phising']
print(phising.iloc[21].sentence)
print(phising.iloc[21].pred)

subject winning notification  lottery coordinator  international promotions  prize award department dear winner  results category   draws congratulations bring notice  results first category draws lucky strike lottery uk  happy inform emerged winner first category  part promotional draws  draws held day prior notification results officially announced  participants selected computer ballot system drawn 2  500  000 names  email addresses individuals companies africa  america  asia  australia  europe  middle east  oceania part international promotions program   company  attached ticket number 6422  5  486  serial number 59  18 drew lucky numbers 33  92  78  05  18  consequently first category  therefore awarded lump sum pay  6  500  000  six million  five hundred thousand great britain pounds   winning payout category winners  total prize money  13  000  000 shared among 2 winners category  congratulations  fund deposited transfer agents cash change uk ltd insured name  best interest also

## Seems good at predicting these unannotated files, but is it still overfitting to only this data?

In [44]:
model.predict("congrats you have won 1 million usd in the lottery from usa. please click here to earn your prize", k = 5)

(('__label__software-sales',
  '__label__insurance-etc',
  '__label__medical-sales',
  '__label__phising',
  '__label__insuranceetc'),
 array([0.47605869, 0.47433549, 0.02369787, 0.0191186 , 0.00666779]))

Wrong label, phising is all the way at the 4th contender!

In [47]:
model.predict("want to watch webcams of young teen girls to get your affair on today in bed", k = 5)

(('__label__medical-sales',
  '__label__sexual',
  '__label__medicalsales',
  '__label__software-sales',
  '__label__insurance-etc'),
 array([0.77838385, 0.1330519 , 0.03946874, 0.02377219, 0.01588757]))

Another wrong label, very confident this is medical???

## Work on spam/not spam

In [73]:
nonspam = []
path = os.getcwd() + "/enron6/ham"
os.chdir(path)
for file in os.listdir():
    if file.endswith(".txt"):
        file_path = f"{path}/{file}"
        #print(file_path)
        nonspam.append(cleaning(str(read_text_file(file_path))))
os.chdir("../..")
nonspam = pd.DataFrame({'sentence':nonspam})
nonspam['label'] = "ham"
nonspam

Unnamed: 0,sentence,label
0,subject key dates impact upcoming sap implemen...,ham
1,subject transportation resort please informed ...,ham
2,subject human resources organization enron con...,ham
3,subject want know today man new idea crank s...,ham
4,subject tw weekly 6 9 00 please see attache...,ham
...,...,...
1495,subject tw weekend scheduled volumes march 200...,ham
1496,subject fw ivanhoe e fyi kim origina...,ham
1497,subject fw abandoned pipe ownership fyi kim ...,ham
1498,subject fw tw question amarillo fyi kim ...,ham


In [97]:
nonspam.sentence.iloc[5]

'subject gss organizational changes pleased announce following additions enron global strategic sourcing operations group  effective june 1  tracy ramsey joined group sourcing portfolio leader cheryl slone joined group travel coordinator  new position  tracy responsible overseeing travel  entertainment conference  related sourcing activities  held various positions within travel industry recently served manager corporate travel enron property services corp  tracy report directly  cheryl joined enron 1998 travel coordinator  brings seven years travel industry experience new position  prior joining enron  recently worked travel management services   anderson cancer center  additionally  amanda becher joined group supply analyst  prior joining enron  held various positions inventory planning analysis  recently  served senior customer inventory analyst w  w  grainger  inc  cheryl holds b  b   operations management university houston  please join welcoming tracy  cheryl amanda group '

In [75]:
spam = []
path = os.getcwd() + "/enron6/spam"
os.chdir(path)
for file in os.listdir():
    if file.endswith(".txt"):
        file_path = f"{path}/{file}"
        spam.append(cleaning(str(read_text_file(file_path))))
os.chdir("../..")
spam = pd.DataFrame({'sentence':spam})
spam['label'] = "spam"
spam

Unnamed: 0,sentence,label
0,subject nasýnsýn ortaksat 07 aug 2004 23 08 ...,spam
1,subject f 0 r b 1 e n â â â l 0 l 1 â â â drea...,spam
2,subject carry bawek arly buczek,spam
3,subject 3 jpg joke day blonde went appliance ...,spam
4,subject today hey sheppard heard awhile want...,spam
...,...,...
2667,subject 5 question progs mult gua msof ila...,spam
2668,subject 7 talks soft mult gua msof ilan ge...,spam
2669,subject everything need beautiful hardwood flo...,spam
2670,subject reduzca sus gastos telefonicos empresa...,spam


## Try FastText with spam/nonspam

In [78]:
dat = pd.concat([nonspam, spam])

In [98]:
import fasttext
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(dat, test_size=0.4)
with open('spamham-train.txt', 'w', encoding="utf-8") as f:
    for idx, row in train_set.iterrows():
        f.write("__label__" + row.label +  " " + row.sentence + "\n")
model = fasttext.train_supervised(input='spamham-train.txt', epoch=25, lr =0.1, wordNgrams=2, loss='hs', dim=50)
preds = []
for sentence in test_set.sentence:
    pred = model.predict(sentence)
    label = pred[0][0].replace("__label__", "")
    preds.append(label)
test_set['pred'] = preds
test_set

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['pred'] = preds


Unnamed: 0,sentence,label,pred
669,subject enron bids farewell hpl enron sold hou...,ham,ham
916,subject learn get freedom freedom choice pr...,spam,spam
424,subject weekly throughput report week october ...,ham,ham
569,subject california brink cera alert ...,ham,ham
1017,subject etc event schlitterbahn good news ...,ham,ham
...,...,...,...
144,subject urgent mr mandisi bongani mabuto e ...,spam,spam
233,subject utf 8 q rolex order deta utf...,spam,spam
296,subject enron action 09 25 00 chairman awar...,ham,ham
1557,subject save us rafael swiss pharmacy online w...,spam,spam


In [99]:
test_set.pred.value_counts()

spam    1091
ham      578
Name: pred, dtype: int64

In [100]:
metrics.f1_score(test_set["label"], test_set["pred"], average="micro")

0.9784301977231875

In [101]:
metrics.f1_score(test_set["label"], test_set["pred"], average="macro")

0.9764586219650571

In [109]:
model.predict("Don't forget the work meeting on 5/12 for our employers! We are looking forward to having our vendors at this meeting.", k = 2)

(('__label__spam', '__label__ham'), array([0.6627382 , 0.33728182]))