In [1]:
import re
import nltk
import os
import numpy as np
import codecs
import multiprocessing as mp
import pandas as pd
from nltk import stem

stemmer = stem.PorterStemmer()

from nltk.corpus import stopwords
from collections import Counter
from typing import Dict, List, Tuple

# find the path of emails
def listdir(directory: str) -> List:
    """
    A specialized version of os.listdir() that ignores files that
    start with a leading period.
    
    Especially dismissing .DS_STORE s.
    """
    filelist = os.listdir(directory)
    return [x for x in filelist if not (x.startswith('.'))]

stopwords = stopwords.words('english')
cut_model = nltk.WordPunctTokenizer()


# Preprocess emails
def enron_processor(emails_dir: str, return_list: list) -> list:    
    """
    * remove numbers
    * remove stopwords
    * add lables
    """
    dirs = [os.path.join(emails_dir, f) for f in os.listdir(emails_dir)]
    for d in dirs:
        emails = [os.path.join(d, f) for f in os.listdir(d)]
        for mail in emails:
            # print(mail)
            with codecs.open(mail, "rb", encoding='utf_8_sig', errors='ignore') as m:
                email_list = []
                line_str = ""
                for line in m:
                    for word in line:
                        if word.startswith("http"):
                            print(word)
                            word = "URL"
                            print(word)
                        word = stemmer.stem(word)
                    line = re.sub(r'[^a-zA-Z\s]', '', string=line)
                    line = line.lower()
                    line = line.strip()
                    tokens = cut_model.tokenize(line)
                    line = [stemmer.stem(token) for token in tokens if token not in stopwords]
           
                    line = ' '.join(line)
                    line_str = line_str+line+" "
                email_list.append(line_str)
                
                if mail.split(".")[-2] == 'spam':
                    email_list.append("spam")
                else: 
                    email_list.append("ham")
                email_list.append(mail)
                return_list.append(email_list)
                

                
return_dict = {}   
root_dir = 'spampy/datasets/enron'
emails_dirs = [os.path.join(root_dir, f) for f in listdir(root_dir)]
return_list = []
for emails_dir in emails_dirs:
    enron_processor(emails_dir, return_list)


messages = pd.DataFrame(return_list, columns=['message', 'label', 'path'])

messages

Unnamed: 0,message,label,path
0,christma tree farm pictur,ham,spampy/datasets/enron\enron1\ham\0001.1999-12-...
1,vastar resourc inc gari product high island la...,ham,spampy/datasets/enron\enron1\ham\0002.1999-12-...
2,calpin daili ga nomin calpin daili ga nomin doc,ham,spampy/datasets/enron\enron1\ham\0003.1999-12-...
3,issu fyi see note alreadi done stella forward ...,ham,spampy/datasets/enron\enron1\ham\0004.1999-12-...
4,meter nov alloc fyi forward lauri allen hou ec...,ham,spampy/datasets/enron\enron1\ham\0005.1999-12-...
...,...,...,...
33711,iso q good news c edaliss val edumm vl eoggra ...,spam,spampy/datasets/enron\enron6\spam\5995.2005-07...
33712,prescript medicin special precis put buck back...,spam,spampy/datasets/enron\enron6\spam\5997.2005-07...
33713,next gener onlin pharmaci readi rock let man r...,spam,spampy/datasets/enron\enron6\spam\5998.2005-07...
33714,bloow time time learn last time longer bed rea...,spam,spampy/datasets/enron\enron6\spam\5999.2005-07...


In [2]:
from sklearn.model_selection import train_test_split
messages['label'] = messages['label'].replace('ham', 0)
messages['label'] = messages['label'].replace('spam', 1)

spam = messages[messages.label == 1]
ham = messages[messages.label == 0]


from sklearn.feature_extraction.text import TfidfVectorizer

# Tf-idf for spam datasets
vect_spam = TfidfVectorizer()
tfidf_spam = vect_spam.fit_transform(spam['message'])
tfidf_matrix_spam = pd.DataFrame(tfidf_spam.toarray(), columns = vect_spam.get_feature_names())
header_spam = vect_spam.get_feature_names()


# Tf-idf for ham datasets
vect_ham = TfidfVectorizer()
tfidf_ham = vect_ham.fit_transform(ham['message'])
tfidf_matrix_ham = pd.DataFrame(tfidf_ham.toarray(), columns = vect_ham.get_feature_names())
header_ham = vect_ham.get_feature_names()

In [3]:
header_spam

['aa',
 'aaa',
 'aaaa',
 'aaaaci',
 'aaadrizzl',
 'aaagrp',
 'aaal',
 'aaaplusdirect',
 'aaasash',
 'aab',
 'aabvmmq',
 'aac',
 'aacm',
 'aacut',
 'aad',
 'aada',
 'aadaluzebcp',
 'aadvglu',
 'aae',
 'aaer',
 'aafco',
 'aag',
 'aaghpdvvu',
 'aagrra',
 'aaiab',
 'aaigrcrb',
 'aaihmqv',
 'aaiop',
 'aaiy',
 'aajpgnti',
 'aak',
 'aakkyl',
 'aakwoxd',
 'aal',
 'aalhsvnjh',
 'aaliyah',
 'aall',
 'aalman',
 'aalway',
 'aam',
 'aamaii',
 'aambiqu',
 'aamed',
 'aamlrg',
 'aamterdam',
 'aan',
 'aanboth',
 'aanbracht',
 'aangaand',
 'aangekondigd',
 'aanmt',
 'aannem',
 'aanopsm',
 'aanpak',
 'aanpakken',
 'aansluitkosten',
 'aanspraak',
 'aantal',
 'aanwervingen',
 'aanwezigheid',
 'aanzet',
 'aanzienlijk',
 'aao',
 'aaoeuro',
 'aaorwg',
 'aaqa',
 'aaqzufn',
 'aar',
 'aaraujo',
 'aarcdci',
 'aarhu',
 'aarn',
 'aaron',
 'aaronkwok',
 'aashish',
 'aashqcsni',
 'aastrom',
 'aasxc',
 'aat',
 'aatkswzvvqbu',
 'aattack',
 'aau',
 'aaumd',
 'aauyimht',
 'aavil',
 'aavlcqr',
 'aavz',
 'aawesom',
 'aawqw

In [4]:
ham_unique = list(set(header_ham).difference(set(header_spam)))
ham_unique

['orourk',
 'herber',
 'pwx',
 'nawet',
 'guhagar',
 'sundar',
 'rolodex',
 'saegerr',
 'aliant',
 'empower',
 'mvittal',
 'paulino',
 'centraga',
 'recommiss',
 'kevorkian',
 'umb',
 'volpon',
 'speacker',
 'chua',
 'subcommitte',
 'carmak',
 'taub',
 'coadgroup',
 'benzenecurv',
 'karush',
 'creditmetr',
 'mckeel',
 'mcmurtri',
 'bbva',
 'conneri',
 'unsupport',
 'deisgnat',
 'ryna',
 'deliev',
 'sholmeso',
 'bzu',
 'eari',
 'wsu',
 'pageid',
 'sanibel',
 'marryott',
 'opportunti',
 'boredboy',
 'przeslal',
 'sorkina',
 'vkaminski',
 'sharif',
 'canterra',
 'marvia',
 'mcfd',
 'moglabi',
 'canavan',
 'mvanrassel',
 'ravil',
 'mgolaboff',
 'redistrict',
 'khalid',
 'theocrat',
 'psoni',
 'otheriws',
 'cwt',
 'weichert',
 'ellingwood',
 'vermutlich',
 'launidadlatina',
 'dperlin',
 'pdvsa',
 'tassigni',
 'pici',
 'minolta',
 'rachana',
 'noncurr',
 'whitepap',
 'progasco',
 'magovern',
 'handphon',
 'fialka',
 'dscoppin',
 'diseconomi',
 'mccalist',
 'amdahl',
 'pira',
 'pauciti',
 'pe

In [14]:
spam_unique = list(set(header_spam).difference(set(header_ham)))
spam_unique

['nez',
 'rayner',
 'dolomitegirdl',
 'vopd',
 'ylaroacs',
 'fatiha',
 'msica',
 'fongistehif',
 'pnmm',
 'lesbiancrav',
 'readyyet',
 'dankt',
 'ipiar',
 'crabapplenielsen',
 'cvonyy',
 'gathermimet',
 'plzkrvuycybfflcwq',
 'smoothbor',
 'bluesea',
 'berdoyof',
 'clamberfisherman',
 'vrf',
 'instor',
 'chil',
 'ertsgmlpb',
 'gyn',
 'kyyj',
 'mjfhtrao',
 'muzzl',
 'skwqnn',
 'bewail',
 'entstehen',
 'calaluna',
 'paulag',
 'wwa',
 'stockspromot',
 'xle',
 'tithabl',
 'mywpvtyh',
 'iwdgqreo',
 'bk',
 'dealon',
 'eercis',
 'montblanc',
 'stra',
 'kindersex',
 'andpric',
 'keeplookingtil',
 'sincealberta',
 'jbwxanax',
 'boj',
 'fggffmtire',
 'starlotteri',
 'bilanguag',
 'erroripleas',
 'ksjdhrdqwuzx',
 'xdyrn',
 'isworldwid',
 'brunhild',
 'nr',
 'stevedor',
 'rememberthat',
 'pios',
 'willem',
 'sponger',
 'rapier',
 'acrobaat',
 'gqcuartante',
 'kfrgkrn',
 'meetyour',
 'szbpiv',
 'egujtc',
 'eworymlvr',
 'amywhereof',
 'lanzhou',
 'begriff',
 'jqnosba',
 'atento',
 'ngxu',
 'spheroid'

In [11]:
import csv
with open("top100features_var_7.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    top100_features = []
    for row in reader:
        top100_features.append(row[1])
top100_features = top100_features[1:]
top100_features

['remov',
 'http',
 'softwar',
 'money',
 'email',
 'onlin',
 'life',
 'medic',
 'best',
 'us',
 'free',
 'site',
 'within',
 'qualiti',
 'invest',
 'advertis',
 'process',
 'dollar',
 'bodi',
 'profession',
 'php',
 'low',
 'special',
 'hello',
 'mobil',
 'compani',
 'bank',
 'learn',
 'mail',
 'viagra',
 'technolog',
 'secur',
 'account',
 'spam',
 'med',
 'net',
 'prescript',
 'watch',
 'pain',
 'qualifi',
 'click',
 'cheap',
 'hi',
 'receiv',
 'hot',
 'legal',
 'pro',
 'found',
 'never',
 'unsubscrib',
 'sex',
 'claim',
 'get',
 'boy',
 'inform',
 'paliourg',
 'address',
 'drug',
 'health',
 'guarante',
 'becom',
 'futur',
 'ship',
 'sir',
 'hundr',
 'stock',
 'read',
 'ciali',
 'worldwid',
 'attent',
 'design',
 'produc',
 'success',
 'ave',
 'men',
 'thousand',
 'profil',
 'age',
 'ra',
 'solicit',
 'look',
 'de',
 'blond',
 'africa',
 'tri',
 'xp',
 'opt',
 'es',
 'immedi',
 'countri',
 'world',
 'repli',
 'messag',
 'info',
 'girl',
 'program',
 'pc',
 'huge',
 'biz',
 'fund',


In [59]:
# ABS result
import csv
with open("abs_result_columns.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    top100_features = []
    for row in reader:
        top100_features.append(row[1])
top100_features = top100_features[1:]
top100_features

['enron',
 'louis',
 'vinc',
 'thank',
 'attach',
 'dave',
 'doc',
 'question',
 'daren',
 'jim',
 'employe',
 'schedul',
 'edu',
 'eb',
 'houston',
 'monday',
 'energi',
 'password',
 'laci',
 'softwar',
 'congratul',
 'jeff',
 'bob',
 'risk',
 'listbot',
 'kaminski',
 'mike',
 'desk',
 'et',
 'salli',
 'tanya',
 'steve',
 'kevin',
 'supervisor',
 'amazon',
 'shirley',
 'friday',
 'pm',
 'ken',
 'deal',
 'outag',
 'cera',
 'fortun',
 'ga',
 'texa',
 'sitara',
 'counterparti',
 'bout',
 'ena',
 'gore',
 'chri',
 'octob',
 'april',
 'eol',
 'portland',
 'mark',
 'volum',
 'saturday',
 'blackberri',
 'capac',
 'san',
 'ect',
 'var',
 'keep',
 'jobsearch',
 'septemb',
 'know',
 'bid',
 'john',
 'chart',
 'enrononlin',
 'dailyupd',
 'duke',
 'comment',
 'juno',
 'jhherbert',
 'fundi',
 'tax',
 'contract',
 'access',
 'plan',
 'calger',
 'clickathom',
 'yesterday',
 'tonya',
 'tomorrow',
 'spinner',
 'maureen',
 'summari',
 'discuss',
 'solarc',
 'post',
 'http',
 'sent',
 'pleas',
 'destru

In [19]:
# variance: average = 0
import csv
with open("x2result.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    top100_features = []
    for row in reader:
        top100_features.append(row[1])
top100_features = top100_features[1:]
top100_features

['enron',
 'louis',
 'vinc',
 'thank',
 'attach',
 'dave',
 'doc',
 'question',
 'daren',
 'jim',
 'employe',
 'schedul',
 'eb',
 'edu',
 'houston',
 'monday',
 'energi',
 'password',
 'laci',
 'congratul',
 'jeff',
 'bob',
 'risk',
 'kaminski',
 'mike',
 'listbot',
 'desk',
 'et',
 'salli',
 'tanya',
 'steve',
 'kevin',
 'supervisor',
 'amazon',
 'shirley',
 'friday',
 'pm',
 'ken',
 'deal',
 'outag',
 'cera',
 'fortun',
 'ga',
 'texa',
 'sitara',
 'counterparti',
 'bout',
 'ena',
 'gore',
 'chri',
 'octob',
 'april',
 'eol',
 'portland',
 'volum',
 'mark',
 'saturday',
 'blackberri',
 'capac',
 'san',
 'ect',
 'keep',
 'var',
 'know',
 'jobsearch',
 'septemb',
 'bid',
 'john',
 'dailyupd',
 'enrononlin',
 'chart',
 'duke',
 'comment',
 'fundi',
 'jhherbert',
 'juno',
 'tax',
 'plan',
 'calger',
 'contract',
 'access',
 'clickathom',
 'tomorrow',
 'yesterday',
 'tonya',
 'spinner',
 'maureen',
 'discuss',
 'summari',
 'sent',
 'solarc',
 'post',
 'pleas',
 'destruct',
 'everyon',
 'ht

In [20]:
# ori > 0, ad = 0
# in spam, not in ad
import csv
with open("ori1_ad0_columns.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    ad_diff = []
    for row in reader:
        ad_diff.append(row[1])
ori1_ad0_diff = ad_diff[1:]
ori1_ad0_diff

['abacha',
 'abe',
 'abei',
 'abel',
 'abhorr',
 'abiiiti',
 'abl',
 'aboard',
 'abraham',
 'abroad',
 'absolut',
 'abund',
 'abundancegroup',
 'ac',
 'acceler',
 'account',
 'accredit',
 'accur',
 'achiev',
 'aci',
 'acid',
 'acquir',
 'acquisit',
 'act',
 'action',
 'actividad',
 'adam',
 'add',
 'addict',
 'additiona',
 'additionai',
 'additionaili',
 'address',
 'ade',
 'administr',
 'admir',
 'adreport',
 'adult',
 'adv',
 'advanc',
 'adversari',
 'advert',
 'advertis',
 'advic',
 'aeopublish',
 'aerobact',
 'afdt',
 'affiiiat',
 'affili',
 'afford',
 'aflam',
 'ag',
 'aggress',
 'agnomen',
 'ago',
 'agreeabl',
 'ahead',
 'ahm',
 'ailow',
 'airfram',
 'ake',
 'al',
 'alabama',
 'alain',
 'alfonso',
 'algaecid',
 'alguma',
 'aliianc',
 'alkan',
 'alleyway',
 'allow',
 'alluvium',
 'alm',
 'almuerzo',
 'amant',
 'ambrosi',
 'ambul',
 'amino',
 'among',
 'anatol',
 'anax',
 'anchor',
 'ancient',
 'andmanyoth',
 'angei',
 'angier',
 'ani',
 'anic',
 'ano',
 'anonym',
 'anoth',
 'anthr

In [21]:
len(ori1_ad0_diff)

1854

In [22]:
# ori = 0, ad > 0
# not in spam ,in ad
with open("ori0_ad1_columns.csv", "r") as csvfile:
    reader = csv.reader(csvfile)
    ori0_ad1_diff = []
    for row in reader:
        ori0_ad1_diff.append(row[1])
ori0_ad1_diff = ori0_ad1_diff[1:]
ori0_ad1_diff

['aaa',
 'aaldou',
 'abacha',
 'abacu',
 'abacustech',
 'abandon',
 'abb',
 'abbamont',
 'abbott',
 'abdo',
 'abdullah',
 'abe',
 'abei',
 'abel',
 'abhorr',
 'abi',
 'abid',
 'abigail',
 'abiiiti',
 'abil',
 'abklaeren',
 'abl',
 'abnorm',
 'aboard',
 'abort',
 'abovenet',
 'abr',
 'abraham',
 'abramowicz',
 'abreo',
 'abridg',
 'abroad',
 'absa',
 'absenc',
 'absolut',
 'absorbowac',
 'absout',
 'abund',
 'abundancegroup',
 'abundantli',
 'abx',
 'ac',
 'academi',
 'academia',
 'acceler',
 'accentur',
 'accept',
 'access',
 'accid',
 'acco',
 'accolad',
 'accompani',
 'accompish',
 'accomplish',
 'accordingli',
 'account',
 'accountonlin',
 'accredit',
 'accret',
 'accru',
 'accrual',
 'acct',
 'acctg',
 'accum',
 'accur',
 'accuraci',
 'acf',
 'achiev',
 'aci',
 'acid',
 'ack',
 'aco',
 'acock',
 'acorss',
 'acostello',
 'acquir',
 'acquisit',
 'acrobat',
 'across',
 'act',
 'action',
 'activ',
 'actividad',
 'activist',
 'acton',
 'actual',
 'acura',
 'acxiom',
 'ad',
 'adam',
 'ad

In [23]:
len(ori0_ad1_diff)

11361

In [24]:
# in ham & top500
ham_unique_in_top = list(set(ham_unique).intersection(set(top100_features)))
print(len(ham_unique_in_top))
ham_unique_in_top

95


['dailyupd',
 'cnt',
 'jlokay',
 'electronical',
 'webtext',
 'sempra',
 'wassup',
 'newyorkf',
 'lokay',
 'caiso',
 'wapa',
 'rmaghom',
 'hitschel',
 'tufco',
 'jobsearch',
 'tenaska',
 'geir',
 'noram',
 'gtc',
 'borsheim',
 'aggi',
 'meoh',
 'jobsonlin',
 'cof',
 'listbot',
 'enrononlin',
 'sitara',
 'cera',
 'kaminski',
 'arnaez',
 'cpuc',
 'valpak',
 'cplex',
 'brm',
 'reactionsnet',
 'miktex',
 'equistar',
 'moneycentr',
 'calpin',
 'cornhusk',
 'neon',
 'garp',
 'aep',
 'massion',
 'pec',
 'pba',
 'dbcap',
 'vasant',
 'exl',
 'zimin',
 'mmbtu',
 'ferc',
 'thinkbank',
 'txu',
 'dfarmer',
 'eastran',
 'womencentr',
 'gdol',
 'commoditylog',
 'hpuent',
 'fundi',
 'epelectr',
 'ballmer',
 'reclassif',
 'anthrax',
 'elana',
 'staceykn',
 'vshanbh',
 'globaltrad',
 'psolp',
 'inderp',
 'dilbert',
 'ishihara',
 'ena',
 'beenladen',
 'eyeforenergi',
 'wassupsouthparkl',
 'gpg',
 'erisk',
 'topica',
 'maillet',
 'tnpc',
 'dwayn',
 'pjm',
 'counterparti',
 'solarc',
 'lama',
 'calger',
 '

In [25]:
# ori > 1, ad = 0 & top500
ori1_ad0_in_top = list(set(ori1_ad0_diff).intersection(set(top100_features)))
print(len(ori1_ad0_in_top))
ori1_ad0_in_top

2


['remov', 'http']

In [26]:
# ori = 0, ad > 0 & in top500
ori0_ad1_in_top = list(set(ori0_ad1_diff).intersection(set(top100_features)))
print(len(ori0_ad1_in_top))
ori0_ad1_in_top

500


['bit',
 'dailyupd',
 'spoke',
 'newyorkf',
 'alloc',
 'outta',
 'delano',
 'post',
 'vac',
 'lindholm',
 'stinson',
 'spec',
 'deal',
 'john',
 'tran',
 'tomorrow',
 'shirley',
 'geir',
 'prior',
 'tax',
 'respons',
 'trade',
 'move',
 'cof',
 'listbot',
 'ga',
 'jake',
 'allegheni',
 'initi',
 'asset',
 'union',
 'number',
 'draft',
 'park',
 'agreement',
 'spinner',
 'cplex',
 'cell',
 'dth',
 'texaco',
 'would',
 'yore',
 'kevin',
 'research',
 'close',
 'forward',
 'barrington',
 'calpin',
 'tw',
 'game',
 'weather',
 'garp',
 'mike',
 'massion',
 'yahoo',
 'pec',
 'copier',
 'bill',
 'hem',
 'desktop',
 'mmbtu',
 'volatil',
 'access',
 'eastran',
 'birthday',
 'overdraft',
 'supervisor',
 'epelectr',
 'sheet',
 'anthrax',
 'tanya',
 'etv',
 'weigh',
 'intercept',
 'point',
 'juli',
 'london',
 'mw',
 'hotmail',
 'ishihara',
 'swap',
 'el',
 'tammi',
 'guadalup',
 'wassupsouthparkl',
 'chri',
 'robert',
 'eol',
 'afteral',
 'fyi',
 'pleas',
 'pictur',
 'pjm',
 'deadlin',
 'desk',


In [27]:
# ori > 0, ad = 0 & in spam
ori1_ad0_in_spam = list(set(ori1_ad0_in_top).intersection(set(spam_unique)))
print(len(ori1_ad0_in_spam))
print(len(ori1_ad0_in_top), len(spam_unique))
ori1_ad0_in_spam

0
2 80643


[]

In [28]:
# ori > 0, ad = 0 & in ham
ori1_ad0_in_ham = list(set(ori1_ad0_in_top).intersection(set(ham_unique)))
print(len(ori1_ad0_in_ham))
print(len(ori1_ad0_in_ham), len(ham_unique))
ori1_ad0_in_ham

0
0 20471


[]

In [29]:
ori0_ad1_in_ham = list(set(ori0_ad1_in_top).intersection(set(ham_unique)))
print(len(ori0_ad1_in_ham))
print(len(ori0_ad1_in_ham), len(ham_unique))
ori0_ad1_in_ham

95
95 20471


['dailyupd',
 'cnt',
 'jlokay',
 'electronical',
 'webtext',
 'sempra',
 'wassup',
 'newyorkf',
 'lokay',
 'caiso',
 'wapa',
 'rmaghom',
 'hitschel',
 'tufco',
 'jobsearch',
 'tenaska',
 'geir',
 'noram',
 'gtc',
 'borsheim',
 'aggi',
 'meoh',
 'jobsonlin',
 'cof',
 'listbot',
 'enrononlin',
 'sitara',
 'cera',
 'kaminski',
 'arnaez',
 'cpuc',
 'valpak',
 'cplex',
 'brm',
 'reactionsnet',
 'miktex',
 'equistar',
 'moneycentr',
 'calpin',
 'cornhusk',
 'neon',
 'garp',
 'aep',
 'massion',
 'pec',
 'pba',
 'dbcap',
 'vasant',
 'exl',
 'zimin',
 'mmbtu',
 'ferc',
 'thinkbank',
 'txu',
 'dfarmer',
 'eastran',
 'womencentr',
 'gdol',
 'commoditylog',
 'hpuent',
 'fundi',
 'epelectr',
 'ballmer',
 'reclassif',
 'anthrax',
 'elana',
 'staceykn',
 'vshanbh',
 'globaltrad',
 'psolp',
 'inderp',
 'dilbert',
 'ishihara',
 'ena',
 'beenladen',
 'eyeforenergi',
 'wassupsouthparkl',
 'gpg',
 'erisk',
 'topica',
 'maillet',
 'tnpc',
 'dwayn',
 'pjm',
 'counterparti',
 'solarc',
 'lama',
 'calger',
 '

In [31]:
ori0_ad1_in_spam = list(set(ori0_ad1_in_top).intersection(set(spam_unique)))
print(len(ori0_ad1_in_spam))
print(len(ori0_ad1_in_spam), len(spam_unique))
ori0_ad1_in_spam

0
0 80643


[]