In [46]:
from bs4 import BeautifulSoup
import nltk
import re
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

from sklearn.externals import joblib

%matplotlib inline

In [3]:
def HTML_TO_TEXT(html):
    """Strip the markup from an HTML string and return its text.

    Parameters
    ----------
    html : str
        HTML markup; it is parsed with the ``lxml`` parser.

    Returns
    -------
    str
        All string nodes of the parsed document, concatenated in document
        order with no separator inserted between them.
    """
    soup = BeautifulSoup(html, "lxml")
    # ``find_all(string=True)`` is the current spelling of the deprecated
    # ``findAll(text=True)`` and selects the same string nodes.
    # NOTE: the ``string`` keyword needs Beautiful Soup >= 4.4.0.
    string_nodes = soup.find_all(string=True)
    return ''.join(string_nodes)

In [15]:
def tokenize_and_stem(summary):
    """Tokenize ``summary`` and return the Snowball stem of each kept token.

    The text is split into sentences and then into words with NLTK. Tokens
    that contain no ASCII letter are discarded; every remaining token is
    stemmed with the English Snowball stemmer.

    Returns
    -------
    list
        Stems, in the order their tokens occur in ``summary``.
    """
    has_letter = re.compile('[a-zA-Z]')
    snowball = nltk.stem.snowball.SnowballStemmer("english")

    stems = []
    for sentence in nltk.sent_tokenize(summary):
        for word in nltk.word_tokenize(sentence):
            # Keep only tokens with at least one letter, then stem them.
            if has_letter.search(word):
                stems.append(snowball.stem(word))
    return stems

In [16]:
def tokenize(summary):
    """Tokenize ``summary`` into lower-cased word tokens.

    The text is split into sentences and then into words with NLTK, each
    word is lower-cased, and tokens that contain no ASCII letter are
    discarded.

    Returns
    -------
    list
        Lower-cased tokens, in the order they occur in ``summary``.
    """
    has_letter = re.compile('[a-zA-Z]')
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(summary)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered_words if has_letter.search(word)]

In [20]:
def retrieve_link(summary):
    """Find every ``http://`` / ``https://`` link-like substring in ``summary``.

    A match runs from the scheme up to the next whitespace character.
    Because the pattern contains two capture groups, each match is returned
    as a tuple ``(url, s)`` where ``s`` is ``'s'`` for an https link and
    ``''`` otherwise.

    Returns
    -------
    list of tuple
        One ``(url, s)`` tuple per match; empty when nothing matches.
    """
    link_pattern = re.compile(r'(http(s)?://[^\s]+)')
    matches = link_pattern.findall(summary)
    return matches

    

In [21]:
def process_data(data):
    """Add text-derived columns to ``data`` and return it.

    No copy is made: the frame passed in is modified in place and that same
    object is returned. Columns written:

    * ``summary`` -- ``content`` converted from HTML to text, lower-cased
    * ``links``   -- link matches found in the raw ``content``
    * ``stemmed`` -- output of ``tokenize_and_stem`` on ``summary``
    * ``tokens``  -- output of ``tokenize`` on ``summary``
    """
    # Create the four output columns up front, filled with empty strings.
    for column in ('summary', 'links', 'stemmed', 'tokens'):
        data[column] = ''

    raw_html = data['content']
    data['summary'] = raw_html.apply(HTML_TO_TEXT).str.lower()
    data['links'] = raw_html.apply(retrieve_link)

    summaries = data['summary']
    data['stemmed'] = summaries.apply(tokenize_and_stem)
    data['tokens'] = summaries.apply(tokenize)

    return data

In [29]:
def create_vocab_frame(data):
    """Build a frame of tokens indexed by their stems.

    Every entry of ``data['summary']`` is passed through both
    ``tokenize_and_stem`` and ``tokenize``; the per-summary results are
    concatenated in order.

    Returns
    -------
    pandas.DataFrame
        A single ``words`` column holding all tokens, with the list of all
        stems as the index (paired by position).
    """
    summaries = list(data['summary'])
    stems = [stem for text in summaries for stem in tokenize_and_stem(text)]
    words = [word for text in summaries for word in tokenize(text)]

    vocab_frame = pd.DataFrame({'words': words}, index=stems)
    return vocab_frame

In [35]:
#Generate tfidf_matrix

def generate_tfidf(data):
    """Vectorize documents with TF-IDF and compute pairwise cosine distances.

    Parameters
    ----------
    data : iterable of str
        Raw documents handed to ``TfidfVectorizer.fit_transform``.

    Returns
    -------
    tuple
        ``(tfidf_matrix, terms, distance)`` where ``tfidf_matrix`` is the
        fitted document-term matrix, ``terms`` is the list of feature names
        and ``distance`` is ``1 - cosine_similarity(tfidf_matrix)``.
    """
    # Terms are tokenized with ``tokenize_and_stem``; 1- to 3-grams are kept
    # subject to the min_df / max_df document-frequency bounds below.
    vectorizer = TfidfVectorizer(max_df=0.85, max_features=200000, min_df=0.20,
                                 stop_words='english', use_idf=True,
                                 tokenizer=tokenize_and_stem, ngram_range=(1, 3))

    tfidf_matrix = vectorizer.fit_transform(data)

    # ``get_feature_names`` was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement when it exists and fall back on older
    # releases; ``tolist()`` keeps ``terms`` a plain list as before.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()

    distance = 1 - cosine_similarity(tfidf_matrix)

    return tfidf_matrix, terms, distance

In [38]:
def determine_num_clusters_k(data, max_k=19, random_state=None):
    """Plot k-means inertia against the number of clusters (elbow plot).

    Parameters
    ----------
    data : array-like or sparse matrix
        Feature matrix handed to ``KMeans.fit``.
    max_k : int, optional
        Largest number of clusters to try. Defaults to 19, the value that
        used to be hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` matches the
        previous unseeded behaviour; pass an int for a repeatable plot.

    Returns
    -------
    None
        The plot is shown with ``plt.show()``.
    """
    # KMeans rejects n_clusters larger than the number of samples, so never
    # try more clusters than there are rows. Sparse matrices have no len(),
    # hence the preference for ``shape``.
    n_samples = data.shape[0] if hasattr(data, 'shape') else len(data)
    ks = range(1, min(max_k, n_samples) + 1)
    inertias = []

    for k in ks:
        kmeans_model = KMeans(n_clusters=k, random_state=random_state)
        kmeans_model.fit(data)
        inertias.append(kmeans_model.inertia_)

    plt.plot(ks, inertias, '-o')
    plt.xlabel('Number of clusters k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.show()

In [41]:
def perform_kmeans(data, num_clusters, model_path='kmeans_model.pkl', random_state=None):
    """Fit k-means on ``data``, save the fitted model and return the labels.

    Parameters
    ----------
    data : array-like or sparse matrix
        Feature matrix handed to ``KMeans.fit``.
    num_clusters : int
        Number of clusters to fit.
    model_path : str, optional
        File the fitted model is written to with ``joblib.dump``. Defaults
        to ``'kmeans_model.pkl'``, the path that used to be hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` matches the
        previous unseeded behaviour; pass an int for repeatable clusters.

    Returns
    -------
    array
        The fitted model's ``labels_`` (one cluster label per row of
        ``data``).
    """
    kmeans_model = KMeans(n_clusters=num_clusters, random_state=random_state)
    kmeans_model.fit(data)
    clusters = kmeans_model.labels_

    # Side effect: persist the fitted model so it can be reloaded later.
    joblib.dump(kmeans_model, model_path)
    return clusters

In [45]:
def print_headlines(data,num_clusters,num_headlines):
    """Print up to ``num_headlines`` headlines for each cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``cluster`` column and a ``headline`` column.
    num_clusters : int
        Clusters ``0 .. num_clusters - 1`` are printed, in that order.
    num_headlines : int
        Maximum number of headlines printed per cluster.

    For every cluster a separator line is printed, followed by the first
    matching headlines in row order, each followed by ``"\\n"``.
    """
    # A non-positive limit prints no headlines, as the counter check did.
    limit = max(num_headlines, 0)

    for n in range(num_clusters):
        print('-----------------------------------------------------------------------------')
        # Select rows by boolean mask instead of ``data[col][i]``: the latter
        # is a label lookup and raises KeyError whenever the index is not the
        # default 0..n-1 range (e.g. after filtering rows). ``.values``
        # avoids any index alignment.
        in_cluster = (data['cluster'] == n).values
        headlines = data.loc[in_cluster, 'headline']
        for headline in headlines.iloc[:limit]:
            print(headline)
            print("\n")

In [47]:
def perform_nmf(tfidf_matrix,num_components):
    """Fit an NMF model on ``tfidf_matrix`` and return both factors.

    Parameters
    ----------
    tfidf_matrix : array-like or sparse matrix
        Matrix the model is fitted on and then transformed with.
    num_components : int
        Number of NMF components.

    Returns
    -------
    tuple
        ``(nmf_features, nmf_components)``: the result of transforming
        ``tfidf_matrix`` with the fitted model, and the model's
        ``components_``.
    """
    # ``fit`` returns the estimator itself, so the calls can be chained.
    fitted = NMF(n_components=num_components).fit(tfidf_matrix)
    return fitted.transform(tfidf_matrix), fitted.components_