In [None]:
from azure.storage.blob import BlockBlobService
from time import time


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas.io.json import json_normalize

import os  # only needed here, to read the storage key from the environment

# Storage account used by the blob client and the table client in this notebook.
STORAGEACCOUNTNAME = 'knowstoredemo'
# Never paste the account key into the notebook. It is read from the
# AZURE_STORAGE_KEY environment variable; when that is unset the value is the
# same empty placeholder the notebook shipped with.
STORAGEACCOUNTKEY = os.environ.get('AZURE_STORAGE_KEY', '')
# Scratch file that every blob is downloaded to before it is parsed.
LOCALFILENAME = 'temp.json'
# Blob container whose blobs are listed and downloaded.
CONTAINERNAME = 'hotelreviews'
# Columns removed from each downloaded frame when they are present.
cols_to_del = [
    'metadata_storage_content_type', 'metadata_storage_size', 'metadata_storage_path',
    'metadata_storage_content_md5', 'metadata_storage_last_modified',
    'latitude', 'longitude', 'reviews_date', 'reviews_dateAdded',
    'reviews_username', 'metadata_storage_name',
]
# Not referenced by any code visible in this notebook; kept so existing
# references elsewhere (if any) keep working.
BLOBNAME = ''
# Download every blob in the container, lift the language / sentiment fields
# out of the nested 'pages' column, and combine all documents into `big_df`.
frames = []  # one frame per successfully processed blob; concatenated once below
count = 0    # number of blobs that were processed without error
blob_service = BlockBlobService(account_name=STORAGEACCOUNTNAME, account_key=STORAGEACCOUNTKEY)
blob_names = blob_service.list_blobs(CONTAINERNAME)

t0 = time()
for blob in blob_names:
    # Every blob is written to the same scratch file and parsed from there.
    blob_service.get_blob_to_path(CONTAINERNAME, blob.name, LOCALFILENAME)
    df = pd.read_json(LOCALFILENAME)

    # Remove the unwanted columns; names a document does not have are ignored.
    df = df.drop(columns=cols_to_del, errors='ignore')

    try:
        pages = json_normalize(df['pages'])
        df['LanguageCode'] = pages['LanguageCode']
        df['SentimentScore'] = pages['SentimentScore']
    except Exception as exc:
        # Skip documents whose 'pages' data cannot be flattened, but say why.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working,
        # and `df.get` avoids a second KeyError when 'pages' itself is missing.
        print("Exception")
        print(f'{type(exc).__name__}: {exc}')
        print(df.get('pages'))
        continue

    frames.append(df)
    count += 1
    if count % 500 == 0:
        print("done in %0.3fs." % (time() - t0))
        print(f'Processing {count} documents' )

# A single concat avoids re-copying the accumulated frame on every iteration.
big_df = pd.concat(frames, sort=False) if frames else pd.DataFrame()

print(f'Processed {count} documents' )
print(big_df.columns)
print(big_df.shape)

done in 41.168s.
Processing 500 documents
done in 81.580s.
Processing 1000 documents


In [2]:

# Hyperparameters shared by the vectorizers and topic models in this notebook.
n_top_words = 20     # terms shown / stored per topic
n_components = 10    # number of topics fitted by NMF and LDA
n_features = 5_000   # vocabulary cap passed to the vectorizers as max_features
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic, one line per topic.

    Args:
        model: object exposing ``components_``, iterated row by row (one row
            of term weights per topic).
        feature_names: sequence mapping a column index of ``components_`` to
            its term.
        n_top_words: how many terms to list for each topic.

    A blank line is printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices ordered from the largest weight down, cut to the top n.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in ranked]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
    print()

In [3]:
# Keep only the documents whose detected language is English.
x = big_df.loc[big_df['LanguageCode'] == "en"]
# The vectorizers raise on missing documents (NaN / None), so reviews without
# text are dropped before the corpus is built.
data = x['reviews_text'].dropna().tolist()
n_samples = len(data)

In [4]:
# Build raw term-count (tf) features; these are the input for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,
    min_df=2,
)
t0 = time()
tf = tf_vectorizer.fit_transform(data)

print(f"done in {time() - t0:0.3f}s.")
print()

Extracting tf features for LDA...
done in 0.047s.



In [5]:
# Build tf-idf features; these are the input for both NMF fits.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=n_features,
    max_df=0.95,
    min_df=2,
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data)

print(f"done in {time() - t0:0.3f}s.")


Extracting tf-idf features for NMF...
done in 0.065s.


In [6]:
# Fit NMF with its default (Frobenius norm) loss on the tf-idf matrix and
# report how long the fit took.
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)  # fit() returns the estimator itself, so `nmf` is the fitted model

print(f"done in {time() - t0:0.3f}s.")

Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=500 and n_features=5000...
done in 0.127s.


In [7]:
# Show the topics of the Frobenius-norm NMF model fitted in the previous cell.
print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


# Refit NMF with the generalized Kullback-Leibler loss. This rebinds `nmf`,
# so the Frobenius-norm model is no longer reachable after this cell.
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(
    n_components=n_components,
    random_state=1,
    beta_loss='kullback-leibler',
    solver='mu',
    max_iter=8000,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)  # fit() returns the estimator itself

print(f"done in {time() - t0:0.3f}s.")


Topics in NMF model (Frobenius norm):
Topic #0: room clean comfortable stayed bed breakfast did night beds check desk quiet day shower water bathroom floor coffee like old
Topic #1: nice breakfast close free experience staff freeway facility amenities pretty pleasant pet wish smoke left eggs definitely large facilities bit
Topic #2: hotel location perfect close florence best sure walk train recommend spacious convenient highly ville visit la station really venue shopping
Topic #3: great place location free breakfast staying parking lots service downtown bike inside stay time people slow bathrooms huge kinda check
Topic #4: friendly staff stay helpful clean comfortable make enjoyed easy recommend super breakfast want place located extremely highly distance attentive beds
Topic #5: excellent staff beds breakfast wedding comfortable location quiet service lovely buffet impeccable accommodating free hard guests levels sight rate party
Topic #6: beautiful rooms classic courteous helpful pl

In [8]:
# Show the topics of the Kullback-Leibler NMF model currently bound to `nmf`.
print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


# Fit LDA on the raw term counts and time the fit.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    learning_offset=50.,
    max_iter=800,
    random_state=0,
)

t0 = time()
lda.fit(tf)
print(f"done in {time() - t0:0.3f}s.")

# Show the LDA topics using the count vectorizer's vocabulary.
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)




Topics in NMF model (generalized Kullback-Leibler divergence):
Topic #0: room hotel night did bed bathroom stayed clean nice desk didn old asked like quiet coffee check checked went stay
Topic #1: nice breakfast friendly staff clean close helpful free lobby freeway eggs maintained access pretty la convenient amenities buffet easy large
Topic #2: hotel location great staff perfect florence stay best recommend close located look park restaurants walking shopping enjoyed want city amazing
Topic #3: great breakfast free place dirty internet people coffee hour bad good downtown morning cookies kind crappy don check bathrooms hot
Topic #4: clean friendly stay helpful comfortable inn stayed gardner staff super place looking mt accommodations quick awful college enjoyed greg alison
Topic #5: comfortable clean beds breakfast excellent staff good helpful friendly bed accommodating location desk service rooms parking best quality buffet pillows
Topic #6: rooms hotel good just place beautiful fin

In [25]:
from azure.cosmosdb.table.tableservice import TableService
from azure.cosmosdb.table.models import Entity
from azure.cosmosdb.table.tablebatch import TableBatch

# Table client for the storage account configured at the top of the notebook.
# The account name comes from STORAGEACCOUNTNAME (same value as the literal it
# replaces) so the name and key can no longer drift apart.
table_service = TableService(account_name=STORAGEACCOUNTNAME, account_key=STORAGEACCOUNTKEY)
# Table that receives the per-topic keywords.
table_name = 'LDAResults'
# Create the destination table before anything is written to it.
table_service.create_table(table_name)
def save_top_words(model, feature_names, n_top_words, table_name):
    """Store each topic's top keywords in an Azure table, one batch per topic.

    Uses the module-level ``table_service`` client. Every keyword becomes one
    entity with ``PartitionKey`` = ``"Topic-<index>"`` and ``RowKey`` = the
    keyword; the same two values are repeated in ``Topic`` and ``Keyword``.

    Args:
        model: object exposing ``components_``, iterated row by row (one row
            of term weights per topic).
        feature_names: sequence mapping a column index of ``components_`` to
            its term.
        n_top_words: number of highest-weighted terms stored per topic.
        table_name: table the batches are committed to.
    """
    for topic_idx, topic in enumerate(model.components_):
        batch = TableBatch()
        part_key = "Topic-%d" % topic_idx

        # NOTE(review): a single table batch is limited in size by the service
        # (100 operations per the Azure docs) - confirm before raising n_top_words.
        for i in topic.argsort()[:-n_top_words - 1:-1]:
            row_key = feature_names[i]
            rec = {'PartitionKey': part_key, 'RowKey': row_key,
                   'Topic': part_key, 'Keyword': row_key}
            # Upsert instead of insert so re-running the cell does not fail
            # on entities written by a previous run.
            batch.insert_or_replace_entity(rec)
        # Commit to the table the caller asked for, not a hard-coded name.
        table_service.commit_batch(table_name, batch)
        print("Inserting batch of rows for " + part_key)

# Persist the LDA topics' top keywords to the results table.
save_top_words(
    model=lda,
    feature_names=tf_feature_names,
    n_top_words=n_top_words,
    table_name=table_name,
)




Inserting batch of rows for Topic-0
Inserting batch of rows for Topic-1
Inserting batch of rows for Topic-2
Inserting batch of rows for Topic-3
Inserting batch of rows for Topic-4
Inserting batch of rows for Topic-5
Inserting batch of rows for Topic-6
Inserting batch of rows for Topic-7
Inserting batch of rows for Topic-8
Inserting batch of rows for Topic-9
