In [1]:
# List the working directory to see which data files are available to load.
!ls

#README.md#                  articles2.csv
Megatable-2014.sql           articles3.csv
Megatable-2015.sql           fletcher.ipynb
Megatable-2016.sql           keep
README.md                    mvp.md
README.md~                   mvp.md~
[34mReuters-full-data-set-master[m[m odyssey.txt
articles1.csv                uci-news-aggregator.csv


In [1]:
import pandas as pd
import pickle
import sqlite3
from sklearn.model_selection import train_test_split

In [2]:
# Load data
#
# The three "articles" CSVs are stacked into one frame and then split in half so
# that the downstream vectorization works on a smaller sample. The UCI news
# aggregator file is loaded separately.

articles1 = pd.read_csv("articles1.csv")
articles2 = pd.read_csv("articles2.csv")
articles3 = pd.read_csv("articles3.csv")
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# file contributes its own 0-based labels and label-based lookups hit duplicates.
articles = pd.concat([articles1, articles2, articles3], ignore_index=True)

# This helps to cut down the volume of data I'm working with.
# A fixed random_state keeps the split (and everything fitted on it) identical
# across kernel restarts.
articles_train, articles_test = train_test_split(
    articles, test_size=0.5, random_state=42
)

uci = pd.read_csv("uci-news-aggregator.csv")

In [3]:
# Keep only the Los Angeles Times rows of the UCI data, then tally how many
# stories carry each CATEGORY code.
is_la_times = uci["PUBLISHER"] == "Los Angeles Times"
uci_lat = uci.loc[is_la_times]
uci_lat["CATEGORY"].value_counts()

e    1001
b     468
t     363
m     144
Name: CATEGORY, dtype: int64

In [4]:
# Collect the article bodies as a plain Python list for the vectorizers.
# Reading the column directly yields the same values in the same order as indexing
# row by row with `articles_train.iloc[k].content`, without building a full row
# Series for every article.
articles_train_content = articles_train["content"].tolist()

## SVD/LSA

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Both vectorizers share one configuration: unigrams and bigrams, English stop
# words removed, tokens made of two or more lowercase letters, and max_df=0.6 to
# drop terms that appear in too large a share of the documents.
shared_vectorizer_params = dict(
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
    lowercase=True,
    max_df=0.6,
)
count_vectorizer = CountVectorizer(**shared_vectorizer_params)
tfidf_vectorizer = TfidfVectorizer(**shared_vectorizer_params)

# Fit each vectorizer on the training articles and keep the resulting matrices.
cv_data = count_vectorizer.fit_transform(articles_train_content)
tfidf_data = tfidf_vectorizer.fit_transform(articles_train_content)

In [None]:
from sklearn.decomposition import NMF, TruncatedSVD

def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each entry of ``components_`` is one topic: an array of per-feature
        weights that supports ``argsort()``.
    feature_names : sequence of str
        Feature labels, indexed by the positions used in ``components_``.
    no_top_words : int
        How many of the largest-weight features to list for each topic.
    topic_names : sequence, optional
        Labels for the topics. When missing, empty, or falsy at a given
        position, the topic is announced by its index instead.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    for ix, weights in enumerate(model.components_):
        # Only index into topic_names when a non-empty sequence was supplied.
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # Indices sorted by descending weight, truncated to the requested count.
        strongest = weights.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[i] for i in strongest))
        
        
# Number of latent topics to extract with each decomposition.
n_comp = 20
# TruncatedSVD uses a randomized solver by default and NMF accepts the same seed
# for its randomized steps, so pin random_state to make the topics repeatable
# between runs.
RANDOM_STATE = 42
lsa_tfidf = TruncatedSVD(n_components=n_comp, random_state=RANDOM_STATE)
lsa_cv = TruncatedSVD(n_components=n_comp, random_state=RANDOM_STATE)
nmf_cv = NMF(n_components=n_comp, random_state=RANDOM_STATE)

# Project the TF-IDF matrix with LSA, and the raw counts with both LSA and NMF.
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)
lsa_cv_data = lsa_cv.fit_transform(cv_data)
nmf_cv_data = nmf_cv.fit_transform(cv_data)

In [None]:
# Show the ten strongest terms of every LSA topic fitted on the TF-IDF matrix.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever this installation provides.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()
display_topics(lsa_tfidf, tfidf_terms, 10)

## Rabbit hole of LDA

In [42]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Start the LDA analysis of the articles.

# Word counter for the LDA input: unigrams and bigrams, English stop words
# removed, tokens restricted to two or more lowercase letters.
lda_vectorizer_params = {
    "ngram_range": (1, 2),
    "stop_words": 'english',
    "token_pattern": "\\b[a-z][a-z]+\\b",
}
count_vectorizer = CountVectorizer(**lda_vectorizer_params)
# Learn the vocabulary from the training articles.
count_vectorizer.fit(articles_train_content)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [44]:
# Count terms per article with the fitted vectorizer, then flip the result so
# that terms run along the rows and articles along the columns.
doc_term_counts = count_vectorizer.transform(articles_train_content)
counts = doc_term_counts.T

In [48]:
# Dimensions of the transposed matrix: terms along the rows, documents along the columns.
counts.shape

(11453912, 71285)

In [50]:
from gensim import corpora, models, similarities, matutils

# Wrap the sparse term-by-document counts as a gensim corpus. Documents sit in
# the columns of `counts`, which is what documents_columns=True (the default) expects.
corpus = matutils.Sparse2Corpus(counts, documents_columns=True)

In [51]:
# Invert the vectorizer's term -> id vocabulary into an id -> term lookup.
vocabulary = count_vectorizer.vocabulary_
id2word = dict(zip(vocabulary.values(), vocabulary.keys()))

In [65]:
# Number of id -> term entries in the inverted vocabulary mapping.
len(id2word)

11453912

In [67]:
# Fitting LDA on this corpus takes impractically long (the run was interrupted), so the call is left commented out.
# An SVD/LSA approach should be faster; see "W7D3 - Article Recommender" by Damien.

# lda = models.LdaModel(corpus=corpus, num_topics=5, minimum_probability=0.03, id2word=id2word, passes=10)

KeyboardInterrupt: 

In [80]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count and TF-IDF vectorizers over unigrams and bigrams of lowercase alphabetic
# tokens (two or more letters), with English stop words removed. No max_df
# cut-off is applied here.
# NOTE(review): `odyssey2` is not defined in any cell visible in this notebook.
# Presumably it is the text of odyssey.txt split into documents; confirm where
# it is built.
odyssey_vectorizer_params = dict(
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
    lowercase=True,
    analyzer='word',
)
count_vectorizer = CountVectorizer(**odyssey_vectorizer_params)
tfidf_vectorizer = TfidfVectorizer(**odyssey_vectorizer_params)

# Fit both vectorizers on the same documents and keep the resulting matrices.
cv_data = count_vectorizer.fit_transform(odyssey2)
tfidf_data = tfidf_vectorizer.fit_transform(odyssey2)

In [81]:
from sklearn.decomposition import NMF, TruncatedSVD

# Reduce the TF-IDF matrix to 50 latent components with LSA (truncated SVD).
n_comp = 50
# TruncatedSVD's default solver is randomized; a fixed random_state makes the
# projection repeatable between runs.
lsa_tfidf = TruncatedSVD(n_components=n_comp, random_state=42)
lsa_tfidf_data = lsa_tfidf.fit_transform(tfidf_data)

In [79]:
# Display the LSA-projected matrix (numpy abbreviates the repr of large arrays).
# NOTE(review): the all-zero last row in the saved output suggests at least one
# input document contributed no terms; confirm whether such rows should be dropped.
lsa_tfidf_data

array([[ 1.23052284e-02, -6.58876353e-05,  2.58473976e-01, ...,
         1.78939880e-03, -1.05804688e-03, -9.17837050e-03],
       [ 2.29088249e-02, -2.55092247e-04,  2.98032143e-01, ...,
         2.20314773e-02,  1.13149776e-02, -6.10096828e-03],
       [ 2.05096346e-02, -2.23758556e-04,  8.14734017e-01, ...,
        -1.59495976e-03, -3.02551279e-03,  6.13386481e-03],
       ...,
       [ 2.05096346e-02, -2.23758556e-04,  8.14734017e-01, ...,
        -1.59495976e-03, -3.02551279e-03,  6.13386481e-03],
       [ 2.49251427e-02, -2.86461955e-04,  3.51705685e-01, ...,
        -3.43923884e-03, -2.75584738e-04, -3.27308806e-02],
       [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00, -0.00000000e+00]])