# email

In [10]:
import re
import pandas as pd
import glob
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import string as st
import random as ra

In [11]:
txt_files = glob.glob("data/*")

In [12]:
def find_emails(file_name):
    
    with open(file_name, 'rt') as message:
        file = message.read()
        
    file = file.replace("\n", " ")
    exp = "[\w\.\+\-\!\?\*#$%&'=]+@[\w]+\.+[\w-]+"
    emails = re.findall(exp, file)

    return emails



In [13]:
emails = map(find_emails, txt_files)
flattened = set(chain.from_iterable(emails))
email_list = list(set(map(lambda x: x.strip("'"), flattened)))

# Simple character vectorization of emails

In [14]:
vectorizer = CountVectorizer(analyzer='char', lowercase=False)
vectorizer = vectorizer.fit(email_list)
email_vect = vectorizer.transform(email_list)

In [15]:
email_vect

<5358x74 sparse matrix of type '<class 'numpy.int64'>'
	with 78701 stored elements in Compressed Sparse Row format>

In [16]:
vectorizer.get_feature_names();

In [17]:
sims = cosine_similarity(email_vect)
n_dim, m_dim = sims.shape
sims= np.triu(sims, k=1)

# finding most similar pairs to check

In [18]:
top5 = np.argpartition(sims, -5, axis=None)[-5:]
n_dim, m_dim = sims.shape
top5_indicesnp_1 = [int(x/m_dim) for x in top5]
top5_indicesnp_2 = [x%m_dim for x in top5]
top5_indicesnp = (tuple(top5_indicesnp_1),tuple(top5_indicesnp_2))
top5_indices = list(map((lambda x : (int(x/m_dim), x%m_dim) ), (top5)))


In [19]:
top5_indices

[(330, 3093), (257, 2657), (2093, 4918), (40, 3013), (761, 2111)]

In [20]:
sims[top5_indicesnp]

array([1., 1., 1., 1., 1.])

In [21]:
email_list[688],email_list[4344]

('robert.scheuer@enron.com', 'editor@petroleumworld.com')

# other ideas

cosine similarity is good, the trick is encoding the emails as vectors in a way that makes sense.
there are two different things to look at:

- changing tokens to something like uppercase, lowercase, symbol, number or even using dictionaries of names or words to do word, name, symbol, number

- adding in some dependence on order; right now anagrams will have similarity of 1. could change tokens to ngrams or directional transitions, ie a->b, a->c, etc. would be the vector space. 

the next goal is to do something combining these- where we look at uppercase->lowercase->symbol etc. transitions

# New dictionary

In [22]:
vectorizer = CountVectorizer(analyzer='char', lowercase=False)
vectorizer = vectorizer.fit(email_list)

In [23]:
symbols = vectorizer.get_feature_names()

In [60]:
vocab2 = {}
for char in symbols:
    if char in st.ascii_lowercase:
        vocab2[char]=0
    elif char in st.ascii_uppercase:
        vocab2[char]=1
    elif char in st.digits:
        vocab2[char]=2
    else:
        vocab2[char]=3
vocab2['"'] = 4

In [64]:
vocab2;

In [26]:
def token_mapper(list_of_string, dictionary):
    str1 = [''.join(str(e) for e in list(map(dictionary.get, string))) for string in list_of_string] 
    return str1

In [27]:
mapped = token_mapper(email_list, vocab2)

In [65]:
mapped;

In [28]:
# vectorizer2=CountVectorizer(analyzer='char')
# vectors2 = vectorizer2.fit_transform(mapped)

In [29]:
#try with ngrams
vectorizer2=CountVectorizer(analyzer='char', ngram_range=(2,3))
vectors2 = vectorizer2.fit_transform(mapped)

In [30]:
vectors2.shape

(5358, 78)

In [31]:
sims = cosine_similarity(vectors2)
n_dim, m_dim = sims.shape
sims= np.triu(sims, k=1)

In [32]:
sims

array([[0.        , 0.99976319, 0.99721515, ..., 0.98826355, 0.99553586,
        0.99892149],
       [0.        , 0.        , 0.99860194, ..., 0.98470955, 0.99325843,
        0.99969535],
       [0.        , 0.        , 0.        , ..., 0.97413359, 0.98576644,
        0.99960241],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.99824508,
        0.98011407],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.99010594],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [33]:
top5 = np.argpartition(sims, -5, axis=None)[-5:]
n_dim, m_dim = sims.shape
top5_indicesnp_1 = [int(x/m_dim) for x in top5]
top5_indicesnp_2 = [x%m_dim for x in top5]
top5_indicesnp = (tuple(top5_indicesnp_1),tuple(top5_indicesnp_2))
top5_indices = list(map((lambda x : (int(x/m_dim), x%m_dim) ), (top5)))


In [34]:
top5_indices

[(285, 5158), (285, 4947), (3421, 5158), (4947, 5158), (285, 3421)]

In [35]:
sims[top5_indicesnp]

array([1., 1., 1., 1., 1.])

In [37]:
email_list[285],email_list[4947]

('IMCEANOTES-Lou+20Moore+20+3Clmoore+40houstontech+2Eorg+3E+40ENRON@ENRON.com',
 'IMCEANOTES-Carol+20Wallen+20+3Ccwallen+40hilcorp+2Ecom+3E+40ENRON@ENRON.com')

# moving on to words and names
 this method seems to be showing promise on this testing data, we will need to do more in-depth checking and testing to gauge performance
 the main problem now is that the measurements seem to be pretty affected by the length of the email address. using dictionaries and/or lists of names will help with this.
 

In [38]:
with open("NamesAll.dic", 'rt', encoding='cp1252') as message:
        file = message.read()
file = file.replace("\n", ",").replace(" ","")
names = file.split(sep=',')
names = list(filter(lambda x: len(x)>3, names))

In [144]:
#test_add = "abashir22@gmail.com"

In [39]:
def name_replace(string, dict):
    for word in dict:
        string2 = string.replace(word, '"')
        if string2 != string:
            return string2
        
    return string

In [40]:
ra.shuffle(names)

In [42]:
#name_replace(test_add, names)

In [43]:
mapped1 = [name_replace(email, names) for email in email_list]

In [67]:
mapped1;

In [45]:
mapped2 = token_mapper(mapped1, vocab2)

In [69]:
mapped2;

In [118]:
#try with ngrams
vectorizer2=CountVectorizer(analyzer='char', ngram_range=(5,5))
vectorizer2 = vectorizer2.fit(mapped2)
vectors2 = vectorizer2.transform(mapped2)

In [119]:
vectors2.shape

(5358, 698)

In [120]:
cols = list(zip(vectorizer2.get_feature_names(), list(range(309))))

In [121]:
nearby_names = list(filter(lambda x : "4" in x[0], cols))
nn_indices = list(map(lambda x : x[1], nearby_names))

In [122]:
vectors3 = vectors2[:,nn_indices]
vectors3.shape

(5358, 125)

In [123]:
sims = cosine_similarity(vectors3)
n_dim, m_dim = sims.shape
sims= np.triu(sims, k=1)

In [124]:
top5 = np.argpartition(sims, -8, axis=None)[-8:]
n_dim, m_dim = sims.shape
top5_indicesnp_1 = [int(x/m_dim) for x in top5]
top5_indicesnp_2 = [x%m_dim for x in top5]
top5_indicesnp = (tuple(top5_indicesnp_1),tuple(top5_indicesnp_2))
top5_indices = list(map((lambda x : (int(x/m_dim), x%m_dim) ), (top5)))


In [125]:
top5_indices

[(2958, 4526),
 (2958, 4531),
 (2958, 4522),
 (2958, 4535),
 (2958, 4568),
 (2958, 4561),
 (2958, 4567),
 (2300, 2785)]

In [103]:
sims[top5_indicesnp]

array([1., 1., 1., 1., 1., 1., 1., 1.])

In [129]:
email_list[2958],email_list[4526]

('wes.dempsey@enron.com', 'ken.rice@enron.com')

This is quite crude at this point, but the dictionary approach seems to be doing what we wanted it to do. 
the dictionary search through the email addressess is very inefficient, but on the irs data we could just use any names on the tax return as a dictionary, which would probably be easy. we would run into some rich, richard, dick matching problems, but for a crude first pass it would be interesting