In [0]:
import pandas as pd
import zipfile
import numpy as np
import regex as re
import nltk
from sklearn.decomposition import NMF,LatentDirichletAllocation 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from nltk.stem import WordNetLemmatizer

In [3]:
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

kaggle.json


In [4]:
!kaggle datasets download -d wcukierski/enron-email-dataset

Downloading enron-email-dataset.zip to /content
 97% 347M/358M [00:08<00:00, 52.1MB/s]
100% 358M/358M [00:08<00:00, 45.5MB/s]


In [5]:
datafolder = zipfile.ZipFile('/content/enron-email-dataset.zip')
datafolder.filelist

[<ZipInfo filename='emails.csv' compress_type=deflate file_size=1426122219 compress_size=375294799>]

In [0]:
lem = WordNetLemmatizer()
vect = TfidfVectorizer()
lda = LatentDirichletAllocation(n_components=5, random_state=0)

In [7]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
stopword = set(stopwords.words('english'))

In [0]:
df = pd.read_csv(datafolder.open('emails.csv'),header=0)

In [0]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [0]:
def pipeline(text):
  text = text.rstrip()
  stop_free = ' '.join([word for word in text.lower().split() if ((word not in stopword))])
  punc_free=re.sub('[^a-zA-Z]', " ", str(stop_free))
  text = ' '.join(lem.lemmatize(word, get_wordnet_pos(word)) for word in nltk.word_tokenize(punc_free))
  return text

In [0]:
data = df[:100]

In [20]:
data['file_pipeline'] = data['file'].apply(pipeline)
data['message_pipeline'] = data['message'].apply(pipeline)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [0]:
model = vect.fit_transform(data['message_pipeline'])

In [0]:
model = pd.DataFrame(model.toarray(), columns=vect.get_feature_names())

In [0]:
nmf_model = NMF(n_components=4, init='nndsvd', random_state=0)

In [24]:
W = nmf_model.fit_transform(model)
print(W)

[[2.08929169e-01 5.76704702e-03 0.00000000e+00 0.00000000e+00]
 [1.11168460e-01 3.81965253e-02 6.83563801e-02 0.00000000e+00]
 [2.17374614e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.09420408e-01 0.00000000e+00 9.01437583e-03 0.00000000e+00]
 [2.66301081e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.48419786e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.52461547e-01 2.38705103e-02 0.00000000e+00 0.00000000e+00]
 [2.23507998e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.76078781e-01 1.01390498e-02 0.00000000e+00 8.66425374e-02]
 [4.04256517e-02 3.86029262e-02 8.86294019e-02 1.92910786e-01]
 [1.40774562e-01 9.78846359e-03 9.72176142e-03 9.69280442e-02]
 [1.63747541e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.99837333e-01 9.52341359e-02 1.92269124e-01]
 [0.00000000e+00 1.99837333e-01 9.52341359e-02 1.92269124e-01]
 [2.23398954e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.15026410e-01 0.00000000e+00 0.00000000e+00 0.000000

In [25]:
H = nmf_model.components_
print(H)

[[0.         0.00213423 0.         ... 0.         0.         0.00144943]
 [0.02467442 0.00078329 0.         ... 0.         0.00231798 0.        ]
 [0.03550698 0.02590678 0.03549846 ... 0.03740361 0.00072485 0.01295388]
 [0.02275509 0.05077378 0.00794782 ... 0.         0.         0.        ]]


In [0]:
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])

In [27]:
lda.fit(model)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [28]:
lda.components_

array([[0.20003215, 0.20003424, 0.31409973, ..., 0.32103956, 0.20000758,
        0.20001766],
       [0.22219271, 0.50462095, 0.2119271 , ..., 0.20198485, 0.2033532 ,
        0.20531643],
       [0.20003262, 0.20003478, 0.20003099, ..., 0.20000773, 0.20000778,
        0.20001726],
       [0.27817928, 0.3225747 , 0.32723095, ..., 0.20000724, 0.21831234,
        0.20001622],
       [0.6594826 , 0.22699222, 0.20003002, ..., 0.2000073 , 0.20000751,
        0.28479743]])