# Imports

In [1]:
import os


from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession

In [2]:
app_name="Case Study 2: Email Analytics"


conf = SparkConf().setAppName(app_name)
conf = (conf.setMaster('local[*]')
        .set("spark.driver.host", "localhost")
        .set('spark.executor.memory', '4G')
        .set('spark.driver.memory', '8G')
        .set('spark.driver.maxResultSize', '10G'))
sc = SparkContext(conf=conf)


spark = SparkSession(sc)

In [3]:
sc.applicationId

'local-1572747584512'

In [4]:
log4jLogger = sc._jvm.org.apache.log4j
LOGGER = log4jLogger.LogManager.getLogger(__name__)
LOGGER.info("pyspark script logger initialized")

In [5]:
import pandas as pd
def set_pandas_options() -> None:
    pd.options.display.max_columns = 100
    pd.options.display.max_rows = 100
    pd.options.display.max_colwidth = 120
    pd.options.display.width = 140
    
set_pandas_options()

# 1 Load data into Spark DataFrame

In [6]:
def get_hdfs_filepath(file_name, on_cloud=False):
    # path to folder containing this code
    prefix = '/data/spark/8_cs2_dataset/'
    if on_cloud:
        bucket  = os.environ['BUCKET']
        file_path = bucket + prefix + file_name
    else:
        file_path = '/Users/val' + prefix + file_name

    return file_path

In [7]:
LOG = get_hdfs_filepath('*/*/*')


In [8]:
print(LOG)

/Users/val/data/spark/8_cs2_dataset/*/*/*


In [10]:

log_txt_df=sc.wholeTextFiles(LOG).filter(lambda line: line != '').toDF()

## Convert strings to columns

In [11]:
import datetime as dt
import dateutil.parser
import pytz

def to_utc_timestamp(string):
    zone_idx = string.find('-') if string.find('-') >= 0 else string.find('+')
    zone_abbr = string[string.find('(')-1:string.find(')')] # not used
    timezone_str = string[zone_idx:string.find('(')-1]
    date_time_str = string[:zone_idx-1]
    date_time_obj = dt.datetime.strptime(date_time_str, '%d %b %Y %H:%M:%S')
    dt2 = dateutil.parser.parse(str(date_time_obj)+timezone_str)
    return dt2.astimezone(pytz.timezone("UTC"))

converted = to_utc_timestamp('12 Dec 2015 18:25:11 -0700 (PDT)')

In [12]:
converted

datetime.datetime(2015, 12, 13, 1, 25, 11, tzinfo=<UTC>)

In [13]:
import pyspark
df = log_txt_df
#df.line =  df.line.cast(StringType)


In [14]:
df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)



In [15]:
df = df.select(df._2.alias('line') )

In [16]:
df.limit(1).toPandas()

Unnamed: 0,line
0,"Message-ID: <9838605.1075853079790.JavaMail.evans@thyme>\r\nDate: Thu, 23 Aug 2001 09:24:44 -0700 (PDT)\r\nFrom: lyn..."


In [17]:
from pyspark.sql.functions import regexp_extract, col
from pyspark.sql.types import StringType, IntegerType, TimestampType
from pyspark.sql.functions import udf, expr, substring, expr, regexp_replace, count


udf1 = udf(to_utc_timestamp, TimestampType())

temp = df.select(
    regexp_extract(col('line'), r'Message-ID:\s<.*>',0).alias('Message_ID'),
    regexp_extract(col('line'), r'\d{1,2}\s\w{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s(\+|\-)\d{4}(.*)', 0).alias("Date"),
    regexp_extract(col('line'), r'From:\s(.*)', 0).alias("From"),
    regexp_extract(col('line'), r"To:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(\S+@\S+)(?:\n|\r\n?)Subject:\s", 0).alias("To"),
    regexp_extract(col('line'), r"Subject:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}", 1).alias("Subject"),
    regexp_extract(col('line'), r"Cc:\s(.+)((?:\n|\r\n?)((?:(?:\n|\r\n?).+)+)){0,}(?:\n|\r\n?)Mime-Version:\s", 0).alias("Cc"),
    regexp_extract(col('line'), r'Mime-Version:\s(.+)', 1).alias("Mime_Version"),
    regexp_extract(col('line'), r'Content-Type:\s(.*)', 1).alias("Content_Type"),
    regexp_extract(col('line'), r"Content-Transfer-Encoding:\s(.+)", 1).alias("Content_Transfer_Encoding"),
    regexp_extract(col('line'), r"X-From:\s(.*)(?:\n|\r\n?)X-To:\s", 0).alias("X_From"),
    regexp_extract(col('line'), r'X-To:\s(.*)(?:\n|\r\n?)X-cc:\s', 0).alias("X_To"),
    regexp_extract(col('line'), r'X-cc:\s(.*)(?:\n|\r\n?)X-bcc:\s', 0).alias("X_cc"),
    regexp_extract(col('line'), r'X-bcc:\s(.*)(?:\n|\r\n?)X-Folder:\s', 0).alias("X_bcc"),
    regexp_extract(col('line'), r'X-Folder:\s(.*)(?:\n|\r\n?)X-Origin:\s', 0).alias("X_Folder"),
    regexp_extract(col('line'), r"X-Origin:\s(.*)(?:\n|\r\n?)X-FileName:\s", 0).alias("X_Origin"),
    regexp_extract(col('line'), r"X-FileName:\s(.*)", 0).alias("X_FileName"),
    regexp_extract(col('line'), r"X-FileName:\s(.*)((?:\n|\r\n?){1,}(.*)){1,}((?:(?:\n|\r\n?).+)+)", 0).alias("FYI")
)
#temp.cache()
temp1 = temp.select(
    expr("substring(Message_ID, 14, length(Message_ID)-14)").alias("Message_ID"),
    'Date', 
    udf1('Date').alias('UTC_timestamp'),
    expr("substring(From, 7, length(From)-6)").alias("From"),
    expr("substring(To, 5, length(To)-15)").alias("To"),
    "Subject",
    expr("substring(Cc, 5, length(Cc)-20)").alias("Cc"),
    "Mime_Version",
    "Content_Type",
    'Content_Transfer_Encoding',
    expr("substring(X_From, 9, length(X_From)-16)").alias("X_From"),
    expr("substring(X_To, 7, length(X_To)-14)").alias("X_To"),
    expr("substring(X_cc, 7, length(X_cc)-15)").alias("X_cc"),
    expr("substring(X_bcc, 8, length(X_bcc)-19)").alias("X_bcc"),
    expr("substring(X_Folder, 11, length(X_Folder)-22)").alias("X_Folder"),
    expr("substring(X_Origin, 11, length(X_Origin)-24)").alias("X_Origin"),
    expr("substring(X_FileName, 13, length(X_FileName)-15)").alias("X_FileName"),
    regexp_replace(col('FYI'), r"(X-FileName:\s(.*)(?:\n|\r\n?){1,})|(-*Original Message-*(.*)((?:\n|\r\n?){1,}(.*)){0,}((?:(?:\n|\r\n?).+)+))", '').alias('FYI')
)
#temp1.cache()
result = temp1.select(
    "Message_ID",
    'Date', 
    'UTC_timestamp',
    "From",
    regexp_replace(col('To'), r"\r\n\t", "").alias("To"),
    "Subject",
    regexp_replace(col('Cc'), r"\r\n\t", "").alias("Cc"),
    "Mime_Version",
    "Content_Type",
    'Content_Transfer_Encoding',
    "X_From",
    "X_To",
    "X_cc",
    "X_bcc",
    "X_Folder",
    "X_Origin",
    "X_FileName",
    regexp_replace(col('FYI'), r"(^\s{1,})|(\n{2,})", '').alias('FYI')
)

#result.cache()

result.limit(5).toPandas()

Unnamed: 0,Message_ID,Date,UTC_timestamp,From,To,Subject,Cc,Mime_Version,Content_Type,Content_Transfer_Encoding,X_From,X_To,X_cc,X_bcc,X_Folder,X_Origin,X_FileName,FYI
0,9838605.1075853079790.JavaMail.evans@thyme,23 Aug 2001 09:24:44 -0700 (PDT),2001-08-23 12:24:44,lynn.blair@enron.com,shelley.corman@enron.com,FW: PAA,lynn.blair@enron.com,1.0,text/plain; charset=us-ascii,7bit,"Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=LBLAIR>","Corman, Shelley </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Scorman>","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Lblair>",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Outbox",Blair-L,LBLAIR (Non-Privileged).,"Shelley, Larry Berger has put together the 4 PAA's based on these employeef efforts on the testing of the new Flowin..."
1,14271690.1075853083553.JavaMail.evans@thyme,11 Jun 2001 14:06:00 -0700 (PDT),2001-06-11 17:06:00,shelley.corman@enron.com,"steve.hotte@enron.com, steven.january@enron.com, lynn.blair@enron.com, mike.bryant@enron.com",Follow-up on Weekend,gina.taylor@enron.com,1.0,text/plain; charset=us-ascii,7bit,Shelley Corman <Shelley Corman/ENRON@enronXgate>,"Steve Hotte <Steve Hotte/ENRON@enronXgate>, Steven January <Steven January/ET&S/Enron@ENRON>, Lynn Blair <Lynn Blair...",Gina Taylor <Gina Taylor/ENRON@enronXgate>,,"\LBLAIR (Non-Privileged)\Blair, Lynn\Tropical Storm Allison",Blair-L,LBLAIR (Non-Privileged).,I propose that we put in place the following plan to follow-up with employees:1. Thank you from Stan to all employee...
2,18165833.1075853083239.JavaMail.evans@thyme,26 Sep 2001 06:11:50 -0700 (PDT),2001-09-26 09:11:50,gary.kenagy@enron.com,"ricki.winters@enron.com, console.security@enron.com",RE: Security Access,"lynn.blair@enron.com, rick.dietz@enron.com, sheila.nacey@enron.com, bradley.holmes@enron.com, steven.january@enron.c...",1.0,text/plain; charset=us-ascii,7bit,"Kenagy, Gary </O=ENRON/OU=NA/CN=RECIPIENTS/CN=GKENAGY>","Winters, Ricki </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Rwinter>, Security Console, </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Notesad...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Lblair>, Dietz, Rick </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Rdietz>, Nacey, S...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Move",Blair-L,LBLAIR (Non-Privileged).,"Security, Everyone on the attached list will require after hours access to the common areas on EB39 and EB42. Please..."
3,5982727.1075853083190.JavaMail.evans@thyme,20 Sep 2001 14:41:21 -0700 (PDT),2001-09-20 17:41:21,audrey.robertson@enron.com,"dennis.alters@enron.com, ben.asante@enron.com, ramona.betancourt@enron.com, lynn.blair@enron.com, bob.burleson@enron...",FW: New Location for Steve Harris' Staff Meeting,,1.0,text/plain; charset=us-ascii,7bit,"Robertson, Audrey </O=ENRON/OU=NA/CN=RECIPIENTS/CN=AROBERT>","Alters, Dennis </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Dalters>, Asante, Ben </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Basante>, Bet...",,,"\LBLAIR (Non-Privileged)\Blair, Lynn\Move",Blair-L,LBLAIR (Non-Privileged).,"Please be informed, Steve Harris and the TW Commercial Group will temporarily relocated to the 13th floor effective ..."
4,22618928.1075853083215.JavaMail.evans@thyme,24 Sep 2001 14:09:13 -0700 (PDT),2001-09-24 17:09:13,donna.scott@enron.com,"darrell.schoolcraft@enron.com, terry.kowalke@enron.com, laura.giambrone@enron.com, tracy.minter@enron.com, amy.mulli...",TW Move Information,"lynn.blair@enron.com, steven.january@enron.com, john.buchanan@enron.com, donna.scott@enron.com",1.0,text/plain; charset=us-ascii,7bit,"Scott, Donna </O=ENRON/OU=NA/CN=RECIPIENTS/CN=DSCOTT1>","Schoolcraft, Darrell </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Dschool>, Kowalke, Terry </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Tkow...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Lblair>, January, Steven </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Sjanuary>, Bu...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Move",Blair-L,LBLAIR (Non-Privileged).,The following individuals are scheduled to move September 27th (Thursday). Please have your boxes and equipment lab...


# 2 Display the top 10 high-frequency users based on weekly numbers of emails sent

In [18]:
df1 = result

In [19]:
from pyspark.sql.functions import udf, expr, substring, expr, regexp_replace, count
from pyspark.sql.functions import unix_timestamp, col, max, min


freq = df1.groupBy('From').agg((count('UTC_timestamp') / ( (max(unix_timestamp(col('UTC_timestamp')))-min(unix_timestamp(col('UTC_timestamp'))))/ 604800)).alias('rate_per_week')).orderBy("rate_per_week",ascending=False)

freq.limit(10).toPandas()



Unnamed: 0,From,rate_per_week
0,drew_a_brabb@calpx.com,310.153846
1,adel.robinson@enron.com,206.52211
2,lynn.blair@enron.com,34.701434
3,sarah.haden@enron.com,20.566066
4,audrey.robertson@enron.com,20.497294
5,dwagman@ftenergy.com,14.058577
6,bill.rapp@enron.com,13.284716
7,bob.stevens@enron.com,13.010789
8,renee.perry@enron.com,10.855895
9,awe@caiso.com,10.776907


# 3 Extract top 20 keywords from the subject text for both

# • for the top 10 high-frequency users and

In [20]:
top = freq.limit(10)

In [21]:
top_subj = df1.join(top, df1["From"] == top["From"], "inner").select(df1['From'], df1['Subject'])
top_subj.show()


+--------------------+--------------------+
|                From|             Subject|
+--------------------+--------------------+
| lynn.blair@enron.co|             FW: PAA|
|audrey.robertson@...|FW: New Location ...|
| lynn.blair@enron.co|RE: Northern v. O...|
| lynn.blair@enron.co|RE: Northern v. O...|
| lynn.blair@enron.co|TW Daily Gas Cont...|
| lynn.blair@enron.co|Re: FW: ETS Staff...|
|sarah.haden@enron.co|EGS and Industry ...|
| lynn.blair@enron.co|   RE: 11-01 payment|
| lynn.blair@enron.co| FW: Agenda addition|
| lynn.blair@enron.co|RE: Branchline Cu...|
|renee.perry@enron.co|RE: North End Tea...|
| lynn.blair@enron.co|FW: NNG/TW Manual...|
| lynn.blair@enron.co|RE: Northern vs. ...|
|sarah.haden@enron.co|EGS and Industry ...|
|  bill.rapp@enron.co|SoCal Gas CPUC Pr...|
|adel.robinson@enr...|Midland Revised W...|
| lynn.blair@enron.co| RE: Reverse Auction|
| lynn.blair@enron.co|Fw: RE: Duluth By...|
| lynn.blair@enron.co|FW: NNG October 2...|
| lynn.blair@enron.co|  RE: Rece

In [22]:
from pyspark.sql.functions import concat_ws, collect_list

top_texts = top_subj.groupBy("From").agg(concat_ws(" ", collect_list("Subject")).alias("texts"))
top_texts = top_texts.select('texts').agg(concat_ws(" ", collect_list("texts")).alias("subjects"))
top_texts.toPandas()

Unnamed: 0,subjects
0,CANCEL Warning Notice Stage 1 CANCELLATION Issue Warning Notice RE: North End Team Shipper/Interconnect Assignments ...


In [23]:
# Extract word
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
transformed = tokenizer.transform(top_texts)
transformed.toPandas()

Unnamed: 0,subjects,words
0,CANCEL Warning Notice Stage 1 CANCELLATION Issue Warning Notice RE: North End Team Shipper/Interconnect Assignments ...,"[cancel, warning, notice, stage, 1, cancellation, issue, warning, notice, re:, north, end, team, shipper/interconnec..."


### Extend the stop words dictionary by adding your own stop words such as -

In [24]:
# Remove stopwords
from pyspark.ml.feature import StopWordsRemover

# custom stopwords
stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "", "fw"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)

cleaned.select('filtered').toPandas()

Unnamed: 0,filtered
0,"[cancel, warning, notice, stage, 1, cancellation, issue, warning, notice, north, end, team, shipper/interconnect, as..."


### Extract top 20 keywords by identifying removing the common stop words

In [25]:
# Generate features
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = cvmodel.transform(cleaned)
#featured.select('filtered','features').show(1, False)

In [26]:
featured.printSchema()

root
 |-- subjects: string (nullable = false)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)



In [27]:
counts = featured.select('features').collect()

a = cvmodel.vocabulary
b = counts[0]['features'].values

d = {'words':a,'counts':b}
df = pd.DataFrame(d)

df.head(20)

Unnamed: 0,words,counts
0,fw:,416.0
1,meeting,116.0
2,tw,55.0
3,gas,51.0
4,oneok,45.0
5,2001,36.0
6,nng,35.0
7,report,31.0
8,allocation,30.0
9,storage,28.0


# • for the non-high frequency users

In [28]:
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window
w = Window().orderBy(lit('A'))
bottom = freq.orderBy("rate_per_week",ascending=False).withColumn("row_num", row_number().over(w))
bottom = bottom.where(col('row_num')>10).select('From','rate_per_week')
bottom.show()

+--------------------+------------------+
|                From|     rate_per_week|
+--------------------+------------------+
|enron_update@conc...| 5.275006420289082|
| jane.joyce@enron.co| 5.164390073122345|
|arsystem@mailman....| 4.723819045238691|
|kimberly.watson@e...|4.4762200808206405|
|jeffhicken@allian...| 4.056841392124442|
| no.address@enron.co| 4.054096867726274|
|    register@wesc.or| 3.526352409790759|
|   special@flowgo.co|3.5060816078083787|
|darrell.schoolcra...| 3.484493198670269|
|newsletter@quicki...|3.4393434852493514|
|chairman.ken@enro...| 3.408911568204535|
| updates@send4fun.co| 3.025528364562911|
|intelligentxmailb...| 2.678666335967119|
|charlie.thompson@...|2.5801057126646163|
|john.buchanan@enr...|2.5521406572266256|
|drew.fossum@enron.co| 2.395355039038056|
|james.studebaker@...|2.3586809638300483|
|jerry.graves@enro...|2.3227501238569634|
|reyna.cabrera@enr...| 2.321687002787241|
|randy.janzen@enro...| 2.231356685861115|
+--------------------+------------

In [29]:


bottom_subj = df1.join(bottom, df1["From"] == bottom["From"], "inner").select(df1["From"], df1["Subject"])
bottom_subj.show()


+--------------------+--------------------+
|                From|             Subject|
+--------------------+--------------------+
|bradley.holmes@en...|RE: All tests wer...|
|bradley.holmes@en...|ETS Recall NOPR C...|
|bradley.holmes@en...|RE: A draft Busin...|
|bradley.holmes@en...|RE: Draft - For Y...|
|bradley.holmes@en...|Oh, the tangled w...|
|bradley.holmes@en...|Outbound TMS Mess...|
|bradley.holmes@en...|PAA's for Kathy a...|
|bradley.holmes@en...|Personal Best Awa...|
|bradley.holmes@en...|TW Negotiated Rat...|
|bradley.holmes@en...|RE: NNG Letter as...|
|bradley.holmes@en...|PAA's for Kathy a...|
|bradley.holmes@en...|Personal Best Awa...|
|bradley.holmes@en...|Business Objects ...|
|bradley.holmes@en...|Business Objects ...|
|bradley.holmes@en...|        Final review|
|bradley.holmes@en...|2002 Capital IT P...|
|bradley.holmes@en...|Customer Service ...|
|bradley.holmes@en...|System Enhancemen...|
|elizabeth.brown@e...|November 2001 FER...|
|elizabeth.brown@e...|Transweste

In [30]:
from pyspark.sql.functions import concat_ws, collect_list

bottom_texts = bottom_subj.groupBy("From").agg(concat_ws(" ", collect_list("Subject")).alias("texts"))
bottom_texts = bottom_texts.select('texts').agg(concat_ws(" ", collect_list("texts")).alias("subjects"))
bottom_texts.toPandas()

Unnamed: 0,subjects
0,RE: All tests were not completely successfully ETS Recall NOPR Comments RE: A draft Business Continuity Plan for a S...


In [31]:
# Extract word
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer().setInputCol("subjects").setOutputCol("words")
transformed = tokenizer.transform(bottom_texts)
transformed.toPandas()

Unnamed: 0,subjects,words
0,RE: All tests were not completely successfully ETS Recall NOPR Comments RE: A draft Business Continuity Plan for a S...,"[re:, all, tests, were, not, completely, successfully, ets, recall, nopr, comments, re:, a, draft, business, continu..."


### Extend the stop words dictionary by adding your own stop words such as -

In [32]:
# Remove stopwords
from pyspark.ml.feature import StopWordsRemover

# custom stopwords
stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)

cleaned.select('filtered').toPandas()


Unnamed: 0,filtered
0,"[tests, completely, successfully, ets, recall, nopr, comments, draft, business, continuity, plan, short, notice, dra..."


### Extract top 20 keywords by identifying removing the common stop words

In [33]:
# Generate features
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = cvmodel.transform(cleaned)

In [34]:
counts = featured.select('features').collect()

a = cvmodel.vocabulary
b = counts[0]['features'].values

d = {'words':a,'counts':b}
df = pd.DataFrame(d)

df.head(20)

Unnamed: 0,words,counts
0,mtg.,281.0
1,conference,274.0
2,room,265.0
3,eb4102,166.0
4,meeting,144.0
5,oncall,120.0
6,weekly,111.0
7,staff,107.0
8,office,99.0
9,team,82.0


# 6 Introduce a new column label to identify new, replied, and forwarded messages

In [35]:
df = result

def to_label(sbj):
    l1 = "RE" if sbj.startswith("RE:") else ("FW" if sbj.startswith("FW:") else 'NEW')
    return l1

udf2 = udf(to_label, StringType())

In [36]:
df_with_label = df.withColumn('label', udf2("Subject"))

In [37]:
df_with_label.limit(5).toPandas()

Unnamed: 0,Message_ID,Date,UTC_timestamp,From,To,Subject,Cc,Mime_Version,Content_Type,Content_Transfer_Encoding,X_From,X_To,X_cc,X_bcc,X_Folder,X_Origin,X_FileName,FYI,label
0,9838605.1075853079790.JavaMail.evans@thyme,23 Aug 2001 09:24:44 -0700 (PDT),2001-08-23 12:24:44,lynn.blair@enron.co,shelley.corman@enron.com,FW: PAA,lynn.blair@enron.com,1.0,text/plain; charset=us-ascii,7bit,"Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=LBLAIR>","Corman, Shelley </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Scorman>","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Lblair>",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Outbox",Blair-L,LBLAIR (Non-Privileged).,"Shelley, Larry Berger has put together the 4 PAA's based on these employeef efforts on the testing of the new Flowin...",FW
1,14271690.1075853083553.JavaMail.evans@thyme,11 Jun 2001 14:06:00 -0700 (PDT),2001-06-11 17:06:00,shelley.corman@enron.co,"steve.hotte@enron.com, steven.january@enron.com, lynn.blair@enron.com, mike.bryant@enron.com",Follow-up on Weekend,gina.taylor@enron.com,1.0,text/plain; charset=us-ascii,7bit,Shelley Corman <Shelley Corman/ENRON@enronXgate>,"Steve Hotte <Steve Hotte/ENRON@enronXgate>, Steven January <Steven January/ET&S/Enron@ENRON>, Lynn Blair <Lynn Blair...",Gina Taylor <Gina Taylor/ENRON@enronXgate>,,"\LBLAIR (Non-Privileged)\Blair, Lynn\Tropical Storm Allison",Blair-L,LBLAIR (Non-Privileged).,I propose that we put in place the following plan to follow-up with employees:1. Thank you from Stan to all employee...,NEW
2,18165833.1075853083239.JavaMail.evans@thyme,26 Sep 2001 06:11:50 -0700 (PDT),2001-09-26 09:11:50,gary.kenagy@enron.co,"ricki.winters@enron.com, console.security@enron.com",RE: Security Access,"lynn.blair@enron.com, rick.dietz@enron.com, sheila.nacey@enron.com, bradley.holmes@enron.com, steven.january@enron.c...",1.0,text/plain; charset=us-ascii,7bit,"Kenagy, Gary </O=ENRON/OU=NA/CN=RECIPIENTS/CN=GKENAGY>","Winters, Ricki </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Rwinter>, Security Console, </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Notesad...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Lblair>, Dietz, Rick </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Rdietz>, Nacey, S...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Move",Blair-L,LBLAIR (Non-Privileged).,"Security, Everyone on the attached list will require after hours access to the common areas on EB39 and EB42. Please...",RE
3,5982727.1075853083190.JavaMail.evans@thyme,20 Sep 2001 14:41:21 -0700 (PDT),2001-09-20 17:41:21,audrey.robertson@enron.co,"dennis.alters@enron.com, ben.asante@enron.com, ramona.betancourt@enron.com, lynn.blair@enron.com, bob.burleson@enron...",FW: New Location for Steve Harris' Staff Meeting,,1.0,text/plain; charset=us-ascii,7bit,"Robertson, Audrey </O=ENRON/OU=NA/CN=RECIPIENTS/CN=AROBERT>","Alters, Dennis </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Dalters>, Asante, Ben </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Basante>, Bet...",,,"\LBLAIR (Non-Privileged)\Blair, Lynn\Move",Blair-L,LBLAIR (Non-Privileged).,"Please be informed, Steve Harris and the TW Commercial Group will temporarily relocated to the 13th floor effective ...",FW
4,22618928.1075853083215.JavaMail.evans@thyme,24 Sep 2001 14:09:13 -0700 (PDT),2001-09-24 17:09:13,donna.scott@enron.co,"darrell.schoolcraft@enron.com, terry.kowalke@enron.com, laura.giambrone@enron.com, tracy.minter@enron.com, amy.mulli...",TW Move Information,"lynn.blair@enron.com, steven.january@enron.com, john.buchanan@enron.com, donna.scott@enron.com",1.0,text/plain; charset=us-ascii,7bit,"Scott, Donna </O=ENRON/OU=NA/CN=RECIPIENTS/CN=DSCOTT1>","Schoolcraft, Darrell </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Dschool>, Kowalke, Terry </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Tkow...","Blair, Lynn </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Lblair>, January, Steven </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Sjanuary>, Bu...",,"\LBLAIR (Non-Privileged)\Blair, Lynn\Move",Blair-L,LBLAIR (Non-Privileged).,The following individuals are scheduled to move September 27th (Thursday). Please have your boxes and equipment lab...,NEW


# 7 Get the trend of the over mail activity using the pivot table from spark itself

In [38]:
from pyspark.sql.functions import hour, year, month

pivotDF = df_with_label.groupBy(year("UTC_timestamp").alias('year'), month("UTC_timestamp").alias('month')).pivot("label").count().orderBy("year", "month")


pivotDF.na.fill(0).toPandas()

Unnamed: 0,year,month,FW,NEW,RE
0,2000,1,0,2,0
1,2000,5,0,3,0
2,2000,6,0,9,0
3,2000,7,1,14,0
4,2000,8,0,41,0
5,2001,5,1,21,3
6,2001,6,21,1259,18
7,2001,7,74,50,40
8,2001,8,54,36,40
9,2001,9,91,84,87


# 8 Use k-means clustering to create 4 clusters from the extracted keywords

In [39]:
raw = result.select("Message_ID","From", "Subject")

#### Extract words from the body of email

In [40]:
#raw = result.select("Message_ID","FYI")

# Extract word
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer().setInputCol("Subject").setOutputCol("words")
transformed = tokenizer.transform(raw)
transformed.limit(1).toPandas()

Unnamed: 0,Message_ID,From,Subject,words
0,9838605.1075853079790.JavaMail.evans@thyme,lynn.blair@enron.co,FW: PAA,"[fw:, paa]"


#### Remove stop words

In [41]:
# Remove stopwords
from pyspark.ml.feature import StopWordsRemover
# custom stopwords
stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned = cleaned.select("Message_ID","words", "filtered")
cleaned.limit(10).toPandas()

Unnamed: 0,Message_ID,words,filtered
0,9838605.1075853079790.JavaMail.evans@thyme,"[fw:, paa]",[paa]
1,14271690.1075853083553.JavaMail.evans@thyme,"[follow-up, on, weekend]","[follow-up, weekend]"
2,18165833.1075853083239.JavaMail.evans@thyme,"[re:, security, access]","[security, access]"
3,5982727.1075853083190.JavaMail.evans@thyme,"[fw:, new, location, for, steve, harris', staff, meeting]","[new, location, steve, harris', staff, meeting]"
4,22618928.1075853083215.JavaMail.evans@thyme,"[tw, move, information]","[tw, move, information]"
5,25384781.1075853023617.JavaMail.evans@thyme,"[your, may, 31, pay, advice]","[may, 31, pay, advice]"
6,28564115.1075861911305.JavaMail.evans@thyme,"[nng, card, list, by, team]","[nng, card, list, team]"
7,6940252.1075861911480.JavaMail.evans@thyme,"[fw:, tw, negotiated, rates, , (message, from, greg, porter)]","[tw, negotiated, rates, (message, greg, porter)]"
8,17994977.1075853083873.JavaMail.evans@thyme,"[re:, tw, index, deals]","[tw, index, deals]"
9,15493730.1075853083897.JavaMail.evans@thyme,"[re:, tw, index, deals]","[tw, index, deals]"


#### Create the features from e-mail subjects using CountVectorizer

In [42]:
# Generate features
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = cvmodel.transform(cleaned)

In [43]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=4, seed=1)  # 4 clusters here
model = kmeans.fit(featured.select('features'))

In [44]:
transformed = model.transform(featured)
transformed.limit(5).toPandas()

Unnamed: 0,Message_ID,words,filtered,features,prediction
0,9838605.1075853079790.JavaMail.evans@thyme,"[fw:, paa]",[paa],"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1,14271690.1075853083553.JavaMail.evans@thyme,"[follow-up, on, weekend]","[follow-up, weekend]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
2,18165833.1075853083239.JavaMail.evans@thyme,"[re:, security, access]","[security, access]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
3,5982727.1075853083190.JavaMail.evans@thyme,"[fw:, new, location, for, steve, harris', staff, meeting]","[new, location, steve, harris', staff, meeting]","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",1
4,22618928.1075853083215.JavaMail.evans@thyme,"[tw, move, information]","[tw, move, information]","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [45]:
transformed.groupBy("prediction").count().orderBy("prediction").show()

+----------+-----+
|prediction|count|
+----------+-----+
|         0| 2895|
|         1|  258|
|         2|  247|
|         3|   45|
+----------+-----+



# 9 Use LDA to generate 4 topics from the extracted keywords

In [46]:
from gensim import corpora, models

def get_topic(cluster):
    #list_of_list_of_tokens
    list_of_list_of_tokens  = [row.filtered for row in transformed.filter(col("prediction")==cluster).select("filtered").collect()]


    dictionary_LDA = corpora.Dictionary(list_of_list_of_tokens)
    #dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(list_of_tokens) for list_of_tokens in list_of_list_of_tokens]

    num_topics = 1
    lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      passes=4, alpha=[0.01]*num_topics, \
                                      eta=[0.01]*len(dictionary_LDA.keys()))
    string = ""
    for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=6):
        string += (str(i)+": "+ topic)
        string += ("\n")
    
    return string


In [47]:
print(get_topic(0))

0: 0.034*"oncall" + 0.025*"office" + 0.015*"lynn" + 0.015*"john" + 0.014*"mtg." + 0.013*"terry"



In [48]:
print(get_topic(1))

0: 0.240*"meeting" + 0.031*"staff" + 0.023*"tw" + 0.018*"winter" + 0.016*"gas" + 0.016*"room"



In [49]:
print(get_topic(2))

0: 0.159*"conference" + 0.158*"room" + 0.150*"mtg." + 0.102*"eb4102" + 0.036*"staff" + 0.032*"weekly"



In [50]:
print(get_topic(3))

0: 0.326*"mtg./raetta" + 0.326*"priority" + 0.326*"weekly" + 0.007*"delete" + 0.007*"parent" + 0.007*"repeat"



In [18]:
raw = result.select("Message_ID","From", "FYI")

In [19]:
def lower_clean_str(x):
    punc='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    lowercased_str = x.lower()
    for ch in punc:
        lowercased_str = lowercased_str.replace(ch, '')
    return lowercased_str

udf3 = udf (lower_clean_str, StringType())


In [20]:
raw = raw.select("Message_ID","From", udf3("FYI").alias('FYI'))

In [21]:
raw.limit(5).toPandas()

Unnamed: 0,Message_ID,From,FYI
0,9838605.1075853079790.JavaMail.evans@thyme,lynn.blair@enron.com,shelley larry berger has put together the 4 paas based on these employeef efforts on the testing of the new flowing ...
1,14271690.1075853083553.JavaMail.evans@thyme,shelley.corman@enron.com,i propose that we put in place the following plan to followup with employees1 thank you from stan to all employees t...
2,18165833.1075853083239.JavaMail.evans@thyme,gary.kenagy@enron.com,security everyone on the attached list will require after hours access to the common areas on eb39 and eb42 please c...
3,5982727.1075853083190.JavaMail.evans@thyme,audrey.robertson@enron.com,please be informed steve harris and the tw commercial group will temporarily relocated to the 13th floor effective m...
4,22618928.1075853083215.JavaMail.evans@thyme,donna.scott@enron.com,the following individuals are scheduled to move september 27th thursday please have your boxes and equipment labele...


In [22]:
#raw = result.select("Message_ID","FYI")

# Extract word
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer().setInputCol("FYI").setOutputCol("words")
transformed = tokenizer.transform(raw)
transformed.limit(1).toPandas()

Unnamed: 0,Message_ID,From,FYI,words
0,9838605.1075853079790.JavaMail.evans@thyme,lynn.blair@enron.com,shelley larry berger has put together the 4 paas based on these employeef efforts on the testing of the new flowing ...,"[shelley, larry, berger, has, put, together, the, 4, paas, based, on, these, employeef, efforts, on, the, testing, o..."


In [23]:
# Remove stopwords
from pyspark.ml.feature import StopWordsRemover
# custom stopwords
stopwords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:", "", "&"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned = cleaned.select("Message_ID","words", "filtered")
cleaned.limit(10).toPandas()

Unnamed: 0,Message_ID,words,filtered
0,9838605.1075853079790.JavaMail.evans@thyme,"[shelley, larry, berger, has, put, together, the, 4, paas, based, on, these, employeef, efforts, on, the, testing, o...","[shelley, larry, berger, put, together, 4, paas, based, employeef, efforts, testing, new, flowing, gas, documents, g..."
1,14271690.1075853083553.JavaMail.evans@thyme,"[i, propose, that, we, put, in, place, the, following, plan, to, followup, with, employees1, thank, you, from, stan,...","[propose, put, place, following, plan, followup, employees1, thank, stan, employees, worked, emergency, plan, weeken..."
2,18165833.1075853083239.JavaMail.evans@thyme,"[security, everyone, on, the, attached, list, will, require, after, hours, access, to, the, common, areas, on, eb39,...","[security, everyone, attached, list, require, hours, access, common, areas, eb39, eb42, please, configure, access, f..."
3,5982727.1075853083190.JavaMail.evans@thyme,"[please, be, informed, steve, harris, and, the, tw, commercial, group, will, temporarily, relocated, to, the, 13th, ...","[please, informed, steve, harris, tw, commercial, group, temporarily, relocated, 13th, floor, effective, monday, sep..."
4,22618928.1075853083215.JavaMail.evans@thyme,"[the, following, individuals, are, scheduled, to, move, september, 27th, thursday, , please, have, your, boxes, and,...","[following, individuals, scheduled, move, september, 27th, thursday, please, boxes, equipment, labeled, 530, pm, rec..."
5,25384781.1075853023617.JavaMail.evans@thyme,"[this, message, is, to, inform, you, that, information, contained, on, your, printed, pay, advice, for, the, pay, pe...","[message, inform, information, contained, printed, pay, advice, pay, period, ending, may, 31, 2001, may, misleadingr..."
6,28564115.1075861911305.JavaMail.evans@thyme,"[see, the, attached, for, the, nng, card, list, by, team, sc, southcentral, team, nnorth, team, and, scn, means, bot...","[see, attached, nng, card, list, team, sc, southcentral, team, nnorth, team, scn, means, claim, customer, john, buch..."
7,6940252.1075861911480.JavaMail.evans@thyme,"[per, our, conversation, at, staff, meeting, today, , please, see, discussion, of, lookup, capacity, below]","[per, conversation, staff, meeting, today, please, see, discussion, lookup, capacity]"
8,17994977.1075853083873.JavaMail.evans@thyme,"[alllooking, back, through, my, postings, for, tw, there, was, only, one, posting, in, the, two, month, period, in, ...","[alllooking, back, postings, tw, one, posting, two, month, period, question, pertained, available, capacity, open, s..."
9,15493730.1075853083897.JavaMail.evans@thyme,"[i, found, one, note, dated, 12800, asking, toby, kuehl, to, post, jan, 2001, lft, capacity, for, one, day, , , this...","[found, one, note, dated, 12800, asking, toby, kuehl, post, jan, 2001, lft, capacity, one, day, capacity, sold, semp..."


In [28]:
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)


KeyboardInterrupt: 

In [None]:
# Generate features
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = cvmodel.transform(cleaned)

In [None]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=4, seed=1)  # 4 clusters here
model = kmeans.fit(featured.select('features'))