## Topic Extraction with NMF and LDA: Tutorial

In [None]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

Import packages 

In [None]:
from pprint import pprint
from time import time

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold

Define variables

In [None]:
# Tunable settings shared by every cell below.
n_samples = 2_000    # documents kept from the start of the shuffled corpus
n_features = 1_000   # vocabulary cap handed to both vectorizers (max_features)
n_components = 10    # number of topics requested from NMF and LDA
n_top_words = 20     # words shown per topic by print_top_words

Define a helper function that prints the top words of each topic.

In [1]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic of a fitted model.

    Each row of ``model.components_`` is treated as one topic. For every
    topic a line ``Topic #<index>: <word> <word> ...`` is printed, listing the
    ``n_top_words`` entries of ``feature_names`` with the largest weights in
    decreasing order of weight. A single empty line is printed at the end.

    Parameters
    ----------
    model : object
        Anything exposing a ``components_`` attribute whose rows support
        ``argsort()``.
    feature_names : sequence of str
        Maps a column index of ``components_`` to its word.
    n_top_words : int
        How many words to show per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: flip it, then keep the leading positions.
        best_positions = weights.argsort()[::-1][:n_top_words]
        words = " ".join(feature_names[pos] for pos in best_positions)
        print("Topic #%d: " % topic_idx + words)
    print()

Load the 20 newsgroups dataset, stripping the 'headers', 'footers' and 'quotes' parts from each post, and keep the first `n_samples` documents. Then, print the time it took to complete this process.

In [None]:
# Fetch the 20 newsgroups corpus with headers, footers and quoted replies
# stripped from every post, then keep only the first `n_samples` documents.
# Term-level filtering (English stop words, words found in a single document
# or in nearly all documents) is applied later by the vectorizers.

print("Loading dataset...")
t0 = time()
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,  # fixed seed so the shuffled order is reproducible
    remove=('headers', 'footers', 'quotes'),
    return_X_y=True,  # (documents, targets); the targets are not needed here
)
data_samples = data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Extract tf-idf features for NMF. `TfidfVectorizer` takes a collection of raw documents and converts them into a matrix of tf-idf features.

In [None]:
# Build the tf-idf document-term matrix consumed by the NMF models.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',     # drop common English words
    min_df=2,                 # drop words seen in only one document
    max_df=0.95,              # drop words present in more than 95% of documents
    max_features=n_features,  # cap the vocabulary size
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))


Use `CountVectorizer` to convert the same collection of text documents into a matrix of token counts (tf features) for LDA.

In [None]:
# Build the raw term-count (tf) document-term matrix consumed by LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',     # drop common English words
    min_df=2,                 # drop words seen in only one document
    max_df=0.95,              # drop words present in more than 95% of documents
    max_features=n_features,  # cap the vocabulary size
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()


Fit the NMF model using tf-idf features and then print the topics.

In [None]:
# Fit the NMF model (Frobenius norm) on the tf-idf features and print the
# top words of each topic. Requires scikit-learn >= 1.0.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
# NMF's `alpha` argument was removed in scikit-learn 1.2. Its replacements are
# rescaled internally (alpha_W by the number of features, alpha_H by the
# number of samples), so the former alpha=.1 is divided by those sizes to keep
# the same penalty on W and H.
nmf = NMF(n_components=n_components, random_state=1,
          alpha_W=.1 / tfidf.shape[1], alpha_H=.1 / tfidf.shape[0],
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
# get_feature_names() was removed in scikit-learn 1.2; the *_out variant
# returns the same vocabulary as an array indexable by column position.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fit the NMF model using the Kullback-Leibler divergence method.

In [None]:
# Fit the NMF model again, this time minimising the generalized
# Kullback-Leibler divergence; the cell passes solver='mu' for this loss.
# Requires scikit-learn >= 1.0.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
# NMF's `alpha` argument was removed in scikit-learn 1.2. Its replacements are
# rescaled internally (alpha_W by the number of features, alpha_H by the
# number of samples), so the former alpha=.1 is divided by those sizes to keep
# the same penalty on W and H.
nmf = NMF(n_components=n_components, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000,
          alpha_W=.1 / tfidf.shape[1], alpha_H=.1 / tfidf.shape[0],
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

Print the topics of NMF model by Kullback-Leibler divergence.

In [None]:
# Show the top words of each topic found by the Kullback-Leibler NMF model.
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary as an array.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Fit the LDA model with the tf features and print the time it takes to complete the process.

In [None]:
# Fit Latent Dirichlet Allocation on the raw term counts and time the fit.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,  # fixed seed for reproducible topics
)
t0 = time()  # only the fit itself is timed, not the construction above
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Print the topics in the LDA model.

In [None]:
# Show the top words of each topic found by the LDA model.
print("\nTopics in LDA model:")
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary as an array.
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Homework 2: Analyzing

Define the cross validation model with 10 folds

In [None]:
def analyze_model(model=None, folds=10):
    """Cross-validate a binary classifier and summarise its errors.

    The cross-validation loop is written out by hand so that, in addition to
    the per-fold ROC AUC, the confusion matrices and the indices of the
    misclassified rows can be accumulated across folds.

    NOTE(review): ``load``, ``load_model`` and ``class_report`` are not defined
    in the visible part of this notebook; they must be provided by an earlier
    cell or an imported module before this function is called.

    Parameters
    ----------
    model : estimator, optional
        Object exposing ``fit``, ``predict`` and ``predict_proba``. When it is
        ``None`` the result of ``load_model()`` is used.
    folds : int, default=10
        Number of stratified folds.

    Returns
    -------
    scores : list of float
        ROC AUC obtained on each validation fold.
    conf_mat : numpy.ndarray of shape (2, 2)
        Confusion matrix averaged over the folds.
    errors : dict
        ``{'fp': [...], 'fn': [...]}``: sorted row indices (into the loaded
        ``X``) of the false positives and false negatives.
    """
    # Load and define the values and convert to numpy.
    # presumably X and y are pandas objects (they expose `.values`) -- TODO
    # confirm against load(); X_test is returned by load() but unused here.
    X, y, X_test = load()
    y = y.values   # to numpy
    X = X.values
    # `is None` rather than truthiness: an estimator that defines __len__
    # could otherwise be treated as "missing" or raise when tested.
    if model is None:
        model = load_model()

    # Manual cross-validation so that confusion matrices and misclassified
    # indices can be accumulated. Without shuffling the split is already
    # deterministic, so no random_state is passed (recent scikit-learn
    # versions reject a random_state when shuffle=False).
    cv_skf = StratifiedKFold(n_splits=folds, shuffle=False)
    scores = []
    conf_mat = np.zeros((2, 2))      # Binary classification
    false_pos = set()
    false_neg = set()

    # Loop over every train/validation split.
    for train_i, val_i in cv_skf.split(X, y):
        X_train, X_val = X[train_i], X[val_i]
        y_train, y_val = y[train_i], y[val_i]

        # Fit the fold.
        print("Fitting fold...")
        model.fit(X_train, y_train)

        # Predict the fold.
        print("Predicting fold...")
        y_pprobs = model.predict_proba(X_val)       # predicted probabilities
        y_plabs = np.squeeze(model.predict(X_val))  # predicted class labels

        scores.append(roc_auc_score(y_val, y_pprobs[:, 1]))
        # Fixing the label set keeps the matrix 2x2 even if a class is absent
        # from both the fold's targets and its predictions.
        confusion = confusion_matrix(y_val, y_plabs, labels=[0, 1])
        conf_mat += confusion

        # Collect the indices of false positives and false negatives,
        # mapped back from fold-local positions to rows of X.
        fp_i = np.where((y_plabs == 1) & (y_val == 0))[0]
        fn_i = np.where((y_plabs == 0) & (y_val == 1))[0]
        false_pos.update(val_i[fp_i])
        false_neg.update(val_i[fn_i])

        # Per-fold results.
        print("Fold score: ", scores[-1])
        print("Fold CM: \n", confusion, sep="")

    # Print all results.
    print("\nMean score: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))
    conf_mat /= folds
    print("Mean CM: \n", conf_mat, sep="")
    print("\nMean classification measures: \n")
    pprint(class_report(conf_mat))
    return scores, conf_mat, {'fp': sorted(false_pos), 'fn': sorted(false_neg)}