# Enron Latent Dirichlet Allocation Analysis
<hr>
**Author: ** *Gilberto Diaz*

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('enron_lda').getOrCreate()

In [2]:
data = spark.read.csv('./user_and_emails.csv', header=True)

In [3]:
data.show()

+-------+--------------------+
|   user|          email_body|
+-------+--------------------+
|allen-p|Here is our forec...|
|allen-p|Traveling to have...|
|allen-p|test successful  ...|
|allen-p|Randy    Can you ...|
|allen-p|Let s shoot for T...|
|allen-p|Greg    How about...|
|allen-p|Please cc the fol...|
|allen-p|any morning betwe...|
|allen-p|1  login   pallen...|
|allen-p|                 ...|
|allen-p|Mr  Buckner    Fo...|
|allen-p|Lucy    Here are ...|
|allen-p|                 ...|
|allen-p|                 ...|
|allen-p|Dave     Here are...|
|allen-p|Paula    35 milli...|
|allen-p|                 ...|
|allen-p|Tim   mike grigsb...|
|allen-p|                 ...|
|allen-p|                 ...|
+-------+--------------------+
only showing top 20 rows



### ELT

In [4]:
from pyspark.ml.feature import (StopWordsRemover, 
                                Tokenizer, 
                                CountVectorizer, 
                                RegexTokenizer,
                                IDF)
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.clustering import LDA

In [5]:
data.show()

+-------+--------------------+
|   user|          email_body|
+-------+--------------------+
|allen-p|Here is our forec...|
|allen-p|Traveling to have...|
|allen-p|test successful  ...|
|allen-p|Randy    Can you ...|
|allen-p|Let s shoot for T...|
|allen-p|Greg    How about...|
|allen-p|Please cc the fol...|
|allen-p|any morning betwe...|
|allen-p|1  login   pallen...|
|allen-p|                 ...|
|allen-p|Mr  Buckner    Fo...|
|allen-p|Lucy    Here are ...|
|allen-p|                 ...|
|allen-p|                 ...|
|allen-p|Dave     Here are...|
|allen-p|Paula    35 milli...|
|allen-p|                 ...|
|allen-p|Tim   mike grigsb...|
|allen-p|                 ...|
|allen-p|                 ...|
+-------+--------------------+
only showing top 20 rows



### Adding index column

In [6]:
data_with_id = data.withColumn('id', monotonically_increasing_id())

In [7]:
data_with_id.show()

+-------+--------------------+---+
|   user|          email_body| id|
+-------+--------------------+---+
|allen-p|Here is our forec...|  0|
|allen-p|Traveling to have...|  1|
|allen-p|test successful  ...|  2|
|allen-p|Randy    Can you ...|  3|
|allen-p|Let s shoot for T...|  4|
|allen-p|Greg    How about...|  5|
|allen-p|Please cc the fol...|  6|
|allen-p|any morning betwe...|  7|
|allen-p|1  login   pallen...|  8|
|allen-p|                 ...|  9|
|allen-p|Mr  Buckner    Fo...| 10|
|allen-p|Lucy    Here are ...| 11|
|allen-p|                 ...| 12|
|allen-p|                 ...| 13|
|allen-p|Dave     Here are...| 14|
|allen-p|Paula    35 milli...| 15|
|allen-p|                 ...| 16|
|allen-p|Tim   mike grigsb...| 17|
|allen-p|                 ...| 18|
|allen-p|                 ...| 19|
+-------+--------------------+---+
only showing top 20 rows



### Tokenizing and removing empty tokens

In [8]:
regex_tokenizer = RegexTokenizer(inputCol='email_body', outputCol='tokens', pattern='\\W')

In [9]:
regex_tokenized = regex_tokenizer.transform(data_with_id)

In [10]:
regex_tokenized.show()

+-------+--------------------+---+--------------------+
|   user|          email_body| id|              tokens|
+-------+--------------------+---+--------------------+
|allen-p|Here is our forec...|  0|[here, is, our, f...|
|allen-p|Traveling to have...|  1|[traveling, to, h...|
|allen-p|test successful  ...|  2|[test, successful...|
|allen-p|Randy    Can you ...|  3|[randy, can, you,...|
|allen-p|Let s shoot for T...|  4|[let, s, shoot, f...|
|allen-p|Greg    How about...|  5|[greg, how, about...|
|allen-p|Please cc the fol...|  6|[please, cc, the,...|
|allen-p|any morning betwe...|  7|[any, morning, be...|
|allen-p|1  login   pallen...|  8|[1, login, pallen...|
|allen-p|                 ...|  9|[forwarded, by, p...|
|allen-p|Mr  Buckner    Fo...| 10|[mr, buckner, for...|
|allen-p|Lucy    Here are ...| 11|[lucy, here, are,...|
|allen-p|                 ...| 12|[forwarded, by, p...|
|allen-p|                 ...| 13|[forwarded, by, p...|
|allen-p|Dave     Here are...| 14|[dave, here, a

In [63]:
stop_words = StopWordsRemover()
new_stop_words = [
    '00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '0', '1', 
    '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '15', 
    '16', '17', '18', '19','20', '25', 'www', 'com', 'pm', '853', 'click',
    'mail', 'e', 'http', 'na', 'ees', 'cc', 'hou', 'etc', '30', 'll', '35',
    'a','cannot','into','our','thus','about','co','is','ours','to','above',
    'could','it','ourselves','together','across','down','its','out','too',
    'after','during','itself','over','toward','afterwards','each','last',
    'own', 'towards','again','eg','latter','per','under','against','either',
    'latterly', 'perhaps','until','all','else','least','rather','up','almost',
    'elsewhere', 'less','same','upon','alone','enough','seem','us', 'said',
    'along','etc', 'many','seemed','very','already','even','may','seeming','via',
    'also','ever', 'me','seems','was','although','every','meanwhile','several',
    'always', 'everyone','might','she','well','among','everything','more','should',
    'were', 'amongst','everywhere','moreover','since','what','an','except','most',
    'whatever','and','few','mostly','some','when','another','first','much', 
    'somehow','whence','any','for','must','someone','whenever','anyhow', 'mr', 
    'my','something','where','anyone','formerly','myself','sometime', 'gif',
    'whereafter','anything','from','namely','sometimes','whereas','anywhere', 
    'further','neither','somewhere','whereby','are','had','never','still', 'let',
    'wherein','around','has','nevertheless','such','whereupon','as','have', 'ip',
    'next','than','wherever','at','he','no','that','whether','be','hence',
    'nobody','the','whither','became','her','none','their','which','because', 'send',
    'here','noone','them','while','become','hereafter','nor','themselves','who',
    'becomes','hereby','not','then','whoever','becoming','herein','nothing',
    'thence','whole','been','hereupon','now','there','whom','before','hers', 'ski',
    'nowhere','thereafter','whose','beforehand','herself','of','thereby','why', 
    'especially', 'image', 're', 'we', 'so', 'static', 'width',
    'behind','him','off','therefore','will','being','himself','often','therein', 
    'with','below','his','on','thereupon','within','beside','how','once', 'try', 
    'these','without','besides','however','one','they','would','between','i', 'far',
    'only','this','yet','beyond','ie','onto','those','you','both','if','or', 'get',
    'though','your','but','in','other','through','yours','by','inc','others', 'suggest', 
    'take', 'throughout','yourself','can','indeed','otherwise','thru','yourselves', 
    'login', 'please', 'forwarded', 'pw', 'k', '-', '+', '|', ' ', 'go', 'takes', 
    'td', 'font', 'br', 'b', 'tr', 'm', 'align', 'net', '3d', '2001', 'new', 
    'said', '11', 'ect', '2000', 'sent', 'know', 'dbcaps97data',
    '12', 'need', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '12', 'aol',
    '2002', 'mailto', '713', 'error', 'nbsp', 'et', 
    'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
    ]

all_stop_words = stop_words.getStopWords() + new_stop_words
stop_words_set = set(all_stop_words)
stop_words_set = list(stop_words_set)

remover = StopWordsRemover(inputCol='tokens', outputCol='words', stopWords=stop_words_set)

In [64]:
cleaned = remover.transform(regex_tokenized)

In [65]:
# cv = CountVectorizer(inputCol='words', outputCol='vectors') # to user with IDF
cv = CountVectorizer(inputCol='words', outputCol='features')

In [66]:
count_vectorizer_model = cv.fit(cleaned)

In [67]:
result = count_vectorizer_model.transform(cleaned)

In [68]:
vocab = count_vectorizer_model.vocabulary

In [51]:
# idf = IDF(inputCol='vectors', outputCol='features')

In [52]:
# idf_model = idf.fit(result)

In [53]:
# rescale_data = idf_model.transform(result)

In [69]:
# rescale_data.show()
result.show()

+-------+--------------------+---+--------------------+--------------------+--------------------+
|   user|          email_body| id|              tokens|               words|            features|
+-------+--------------------+---+--------------------+--------------------+--------------------+
|allen-p|Here is our forec...|  0|[here, is, our, f...|          [forecast]|(262144,[1921],[1...|
|allen-p|Traveling to have...|  1|[traveling, to, h...|[traveling, busin...|(262144,[6,16,29,...|
|allen-p|test successful  ...|  2|[test, successful...|[test, successful...|(262144,[60,959,1...|
|allen-p|Randy    Can you ...|  3|[randy, can, you,...|[randy, schedule,...|(262144,[31,113,1...|
|allen-p|Let s shoot for T...|  4|[let, s, shoot, f...|[shoot, tuesday, 45]|(262144,[82,407,6...|
|allen-p|Greg    How about...|  5|[greg, how, about...|[greg, tuesday, t...|(262144,[77,82,46...|
|allen-p|Please cc the fol...|  6|[please, cc, the,...|[following, distr...|(262144,[0,25,53,...|
|allen-p|any morning

### Training data

In [70]:
lda_model = LDA(k=4)

In [71]:
model = lda_model.fit(result)

In [72]:
topics = model.describeTopics()

In [73]:
topics.show()

+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[41, 45, 8, 493, ...|[0.00505194404606...|
|    1|[132, 358, 390, 1...|[0.00891474809031...|
|    2|[0, 1, 4, 5, 9, 1...|[0.04050886364638...|
|    3|[0, 2, 3, 8, 14, ...|[0.00841943508153...|
+-----+--------------------+--------------------+



In [74]:
topics_rdd = topics.rdd

In [75]:
topics_words = topics_rdd\
    .map(lambda row: row['termIndices'])\
    .map(lambda inx_list: [vocab[idx] for idx in inx_list])\
    .collect()

In [76]:
for idx, topic in enumerate(topics_words):
    print(f'Topic: {idx+1}')
    print('===============')
    for word in topic:
        print(word)
    print('\n')

Topic: 1
000
million
company
omni
gas
50
market
firm
31
database


Topic: 2
size
href
fantasy
final
class
table
face
sportsline
arial
updated


Topic: 3
enron
subject
corp
message
thanks
original
time
call
attached
mark


Topic: 4
enron
power
energy
company
state
california
market
gas
year
business




I played around with different setups trying to get coherent topics. With the IDF packages, words in topics where all over the place. This current setup gave me the most coherent topics. I'm open to learn how to fine tune passing different parameters.

### Topic: 4
I will classify this topic as very corporate conversations.

### Topic: 3
I will classify this topic as meeting conversations since have words such as call, time, attached, and mark.

### Topic: 2
I don't have any classification for this topic.

### Topic: 1
I will classify this topic as stock market


## TODO:
<hr>
- Gain more knowledge about the domain for fine tuning stop words.
- Find resources to learn how to fine tune by tweaking function's parameters.