In [30]:
import pandas as pd
import pyspark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
import pyspark.sql.types as t
import pyspark.sql.functions as f
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, NGram, Normalizer
from pyspark.ml.linalg import *
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

import string as st

In [2]:
# Reuse the active Spark session if there is one, otherwise create it.
# Note: `sc` holds a SparkSession (not a SparkContext); the context itself is
# reached in later cells via `sc.sparkContext`.
sc = SparkSession.builder.getOrCreate()

In [3]:
# Load the file(s) at the relative path 'crawl2' as an RDD of text lines
# (presumably one email address per line — confirm against the crawl format).
emails = sc.sparkContext.textFile('crawl2')

In [4]:
# Number of lines read from the crawl. Counting the RDD directly gives the same
# figure without first building a one-column DataFrame just to count its rows.
emails.count()

2011

In [5]:
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code — only load "email_list" if it comes from a trusted source.
email_list = pd.read_pickle("email_list")

In [6]:
type(email_list)

list

In [7]:
# Placeholder labels drawn uniformly from {1, 2, 3, 4} (upper bound exclusive),
# one per entry of email_list. A dedicated seeded generator keeps the labels
# reproducible across kernel restarts without touching NumPy's global RNG state.
label = np.random.RandomState(42).randint(1, 5, size=len(email_list))

In [8]:
# Pair each crawled line with a label. zip() stops at the shorter input, so if
# the crawl and `label` (sized from email_list) differ in length the extra rows
# are dropped silently — NOTE(review): the crawl count above is 2011 while the
# displayed frame ends at index 2009; confirm the two sources are aligned.
df = pd.DataFrame(list(zip(emails.collect(), label)), columns=['email', 'label'])

# Second string column: the emails resampled with replacement (a bootstrap
# shuffle). random_state pins the draw so re-running the cell gives the same
# pairing; reset_index realigns the sampled values with df's 0..n-1 index.
names = df['email'].sample(frac=1, replace=True, random_state=42).reset_index(drop=True)
df['names'] = names

In [9]:
df

Unnamed: 0,email,label,names
0,sherri.sera@enron.com,3,ccrawfo@azurix.com
1,trish3kids@aol.com,3,dfeather@enron.com
2,Trish3kids@aol.com,4,john.sherriff@enron.com
3,sherri.sera@enron.com,4,amelia.alder@enron.com
4,greg.grissom@enron.com,2,cuus@enron.com
...,...,...,...
2006,michael.brown@enron.com,4,mwarner@enron.com
2007,lucy.marshall@enron.com,3,sherri.sera@enron.com
2008,cindy.olson@enron.com,1,larry.ciscon@enron.com
2009,bridget.maronge@enron.com,4,mgarber2@enron.com


In [10]:
# Convert the pandas frame into a Spark DataFrame, naming the three columns
# explicitly.
email_df = sc.createDataFrame(df, schema = ['email','label','names'] )

In [11]:
# Count the distinct symbols (character 1-grams, case-sensitive) that occur in
# the set of strings; this is the vocabulary size the hashed features must hold.
vectorizer = CountVectorizer(analyzer='char', lowercase=False, ngram_range=(1, 1))
vectorizer = vectorizer.fit(email_list)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
# list(...) keeps `symbols` a plain list, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    symbols = list(vectorizer.get_feature_names_out())
else:
    symbols = vectorizer.get_feature_names()
len(symbols)

74

In [31]:
def vectorize_string_cols(df, cols, num_features=2**7, ngram=1):
    """Add a hashed TF-IDF character vector for each string column in ``cols``.

    For every column ``c`` the string is split into single-character tokens,
    turned into n-grams, hashed into a ``num_features``-dimensional
    term-frequency vector, and rescaled by an IDF model fitted on ``df``
    itself. The result is stored in ``<c>_vec``.

    Parameters
    ----------
    df : Spark DataFrame containing the columns named in ``cols``.
    cols : sequence of str
        Names of the string columns to vectorise.
    num_features : int, default 2**7
        Size of the hashed feature space (HashingTF ``numFeatures``).
    ngram : int, default 1
        n-gram length passed to NGram; 1 leaves the character tokens as they are.

    Returns
    -------
    Spark DataFrame with the columns ``cols``, then every other original
    column (in their original order), then one ``<c>_vec`` column per entry
    of ``cols``. Intermediate ``_chars`` / ``_ngrams`` / ``_rawFeatures``
    columns are not included.
    """
    # Accept any sequence (tuple, list, ...) and keep list concatenation valid.
    cols = list(cols)

    # Preserve the DataFrame's own column order. A set difference would make
    # the output column order depend on string hashing and vary between runs.
    other_cols = [name for name in df.columns if name not in cols]

    for col in cols:
        # gaps=False makes the pattern match tokens rather than separators, so
        # "." emits one token per character.
        # NOTE(review): RegexTokenizer lowercases its input by default
        # (toLowercase=True), unlike the case-sensitive CountVectorizer check
        # earlier in the notebook — confirm that case-folding is intended.
        tokenizer = RegexTokenizer(inputCol=col, outputCol=f"{col}_chars", gaps=False, pattern=".")
        charData = tokenizer.transform(df)

        # With the default ngram=1 this step does not change the tokens; it is
        # kept for convenience in case longer n-grams are wanted later.
        ngramer = NGram(n=ngram, inputCol=f"{col}_chars", outputCol=f"{col}_ngrams")
        ngramData = ngramer.transform(charData)

        # HashingTF uses a hash function instead of a fitted vocabulary. When
        # the load factor (num_symbols / num_features) is very small, few
        # collisions are expected and the result should be close to a plain
        # count vectorisation. As the load factor grows, more symbols share a
        # bucket, which can be traded for performance if the feature count
        # gets too large. The hashing works best when num_features is a power
        # of 2.
        hashingTF = HashingTF(
            inputCol=f"{col}_ngrams",
            outputCol=f"{col}_rawFeatures",
            numFeatures=num_features,
        )
        featurizedData = hashingTF.transform(ngramData)

        # Cache because the data is used both to fit the IDF model and to be
        # transformed by it (taken from the Spark documentation example; the
        # benefit has not been measured here).
        featurizedData.cache()

        idf = IDF(inputCol=f"{col}_rawFeatures", outputCol=f"{col}_vec")
        idfModel = idf.fit(featurizedData)
        # Carry the new column forward so the next iteration builds on it.
        df = idfModel.transform(featurizedData)

    new_columns = [f"{col}_vec" for col in cols]
    vectorized = df.select(cols + other_cols + new_columns)

    return vectorized

def normalize_columns(df, cols, p=2.0):
    """Append a p-norm-normalised copy of each vector column in ``cols``.

    Parameters
    ----------
    df : Spark DataFrame containing the vector columns named in ``cols``.
    cols : iterable of str
        Names of the vector columns to normalise.
    p : float, default 2.0
        Norm order passed to Normalizer (2.0 gives unit Euclidean length).

    Returns
    -------
    Spark DataFrame with all existing columns plus one ``<c>_normalized``
    column per entry of ``cols``.
    """
    for col in cols:
        normalizer = Normalizer(inputCol=col, outputCol=f"{col}_normalized", p=p)
        df = normalizer.transform(df)

    return df

In [32]:
# Build the hashed TF-IDF vectors for both string columns and normalise them
# in one expression, then preview the first rows.
vectorized = normalize_columns(
    vectorize_string_cols(email_df, ['email','names']),
    ['email_vec','names_vec'],
)
vectorized.show()

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|               email|               names|label|           email_vec|           names_vec|email_vec_normalized|names_vec_normalized|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|sherri.sera@enron...|  ccrawfo@azurix.com|    3|(128,[0,7,26,66,6...|(128,[0,26,34,66,...|(128,[0,7,26,66,6...|(128,[0,26,34,66,...|
|  trish3kids@aol.com|  dfeather@enron.com|    3|(128,[0,6,26,54,6...|(128,[0,6,7,26,66...|(128,[0,6,26,54,6...|(128,[0,6,7,26,66...|
|  Trish3kids@aol.com|john.sherriff@enr...|    4|(128,[0,6,26,54,6...|(128,[0,7,26,66,6...|(128,[0,6,26,54,6...|(128,[0,7,26,66,6...|
|sherri.sera@enron...|amelia.alder@enro...|    4|(128,[0,7,26,66,6...|(128,[0,6,7,26,54...|(128,[0,7,26,66,6...|(128,[0,6,7,26,54...|
|greg.grissom@enro...|      cuus@enron.com|    2|(128,[0,7,26,

In [14]:
# Pull the first row to the driver for inspection; `first` is not referenced
# by any later cell visible in this notebook.
first = vectorized.first()


In [15]:
@f.udf(t.FloatType())
def vect_cosine(col):
    """Sum of the pairwise cosine similarities between the vectors in ``col``.

    ``col`` is the list of vectors gathered for one group (e.g. by
    ``collect_list``). The full n x n cosine-similarity matrix is built, and
    only the strictly upper triangle is summed, so every unordered pair is
    counted once and self-similarities are excluded.

    Returns the sum as a float, or None (null) when the group holds fewer
    than 10 vectors.

    NOTE(review): this is a sum, not a mean — dividing by n * (n - 1) / 2
    would give the average pairwise similarity.
    """
    num_rows = len(col)

    # Too few vectors to give a meaningful figure: emit null for this group.
    if num_rows < 10:
        return None

    cosines = cosine_similarity(col)
    # k=1 zeroes the diagonal and everything below it.
    upper = np.triu(cosines, k=1)

    return float(np.sum(upper))

In [35]:
# Per-label aggregate: collect_list gathers each label's vectors into one list,
# which vect_cosine receives as a single argument. The raw and the normalised
# column are both fed in; cosine similarity ignores vector length, so the two
# output columns are expected to agree.
result = vectorized.groupBy("label").agg(
    vect_cosine(f.collect_list("email_vec_normalized")),
    vect_cosine(f.collect_list("email_vec"))
)

In [36]:
result.show()

+-----+-----------------------------------------------------+------------------------------------------+
|label|vect_cosine(collect_list(email_vec_normalized, 0, 0))|vect_cosine(collect_list(email_vec, 0, 0))|
+-----+-----------------------------------------------------+------------------------------------------+
|    1|                                            29587.445|                                 29587.445|
|    3|                                             33778.53|                                  33778.53|
|    2|                                             25164.93|                                  25164.93|
|    4|                                            27797.639|                                 27797.639|
+-----+-----------------------------------------------------+------------------------------------------+



In [37]:
# Same aggregate without grouping: every normalised email vector in the frame
# is collected into one list and passed to vect_cosine in a single call.
result2 = vectorized.agg(
    vect_cosine(f.collect_list("email_vec_normalized"))
)

In [38]:
result2.show()

+-----------------------------------------------------+
|vect_cosine(collect_list(email_vec_normalized, 0, 0))|
+-----------------------------------------------------+
|                                             464472.0|
+-----------------------------------------------------+



In [49]:
@f.udf(t.FloatType())
def vect_cosine2(col):
    """Mean pairwise cosine similarity of the vectors in ``col``.

    Avoids building the n x n similarity matrix by using the identity

        sum_{i != j} u_i . u_j  =  ||sum_i u_i||^2  -  sum_i ||u_i||^2

    where ``u_i`` are the rows scaled to unit length. Rows are normalised
    here, so the result is a cosine similarity whether or not the caller
    passes pre-normalised vectors. All-zero rows are left as zero, so every
    pair involving them contributes 0.

    Returns the mean over the n * (n - 1) ordered pairs as a float, or None
    (null) when fewer than 10 vectors are supplied.
    """
    num_rows = len(col)

    # Too few vectors to give a meaningful figure: emit null.
    if num_rows < 10:
        return None

    # Densify explicitly: Spark ML vectors expose toArray(); plain sequences
    # and arrays are taken as they are.
    rows = np.asarray(
        [v.toArray() if hasattr(v, "toArray") else v for v in col],
        dtype=float,
    )

    norms = np.linalg.norm(rows, axis=1)
    # Divide zero rows by 1 so they stay zero instead of producing NaN.
    safe_norms = np.where(norms > 0, norms, 1.0)
    unit_rows = rows / safe_norms[:, None]

    colsum = np.sum(unit_rows, axis=0)
    sqsum = np.sum(np.square(colsum))

    # Self-similarity terms to remove: 1 for each non-zero row, 0 for each
    # zero row. For unit-norm input with no zero rows this equals num_rows.
    diagonal = np.count_nonzero(norms)

    num_entries = num_rows * (num_rows - 1)

    return float((sqsum - diagonal) / num_entries)

In [50]:
# Whole-frame aggregate using vect_cosine2 on the normalised vectors.
# Note: this rebinds `result2`, replacing the vect_cosine result assigned to
# the same name in an earlier cell.
result2 = vectorized.agg(
    vect_cosine2(f.collect_list("email_vec_normalized"))
)

In [51]:
result2.show()

+------------------------------------------------------+
|vect_cosine2(collect_list(email_vec_normalized, 0, 0))|
+------------------------------------------------------+
|                                             0.2298166|
+------------------------------------------------------+

