In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn as skvis
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

%matplotlib inline

In [5]:
# Load the dataset from the derived-data CSV (path is relative to the
# notebook's working directory).
kiva = pd.read_csv('derived_data/kiva_02.csv')
# Display (rows, columns) as a quick sanity check of the load.
kiva.shape

(6817, 31)

## Topic Modeling

#### Split Train and Test for Cleaned Text Feature

In [7]:
# Features come from the `en` text column; the target is the `status` column.
X = kiva['en']
y = kiva['status']

# Hold out 20% of the rows for testing. Stratifying on `y` keeps the
# `status` class proportions the same in both partitions, and the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#### Grid Search with Pipeline and 5-fold Cross Validation

In [10]:
# Topic-model pipeline: a document-term matrix of token counts ('dtm')
# feeding an online-learning LDA ('lda'). Tokens must be at least two word
# characters long and appear in at least 10 documents.
pipeline = Pipeline(steps=[
    ('dtm', CountVectorizer(token_pattern=r'\w{2,}', min_df=10)),
    ('lda', LatentDirichletAllocation(random_state=42, learning_method='online')),
])

# Search space: 3 n-gram ranges x 12 topic counts x 3 learning decays
# (108 candidates, each fitted once per fold).
params = {
    'dtm__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'lda__n_components': [5] + list(range(7, 16)) + [20, 25],
    'lda__learning_decay': [0.5, 0.7, 0.9],
}

# Exhaustive search with 5-fold cross-validation. No `scoring` is passed, so
# candidates are ranked by the pipeline's own `score` method.
# NOTE(review): that score is presumably LDA's approximate log-likelihood,
# which may not be comparable across different `ngram_range` settings because
# the vocabulary (and hence the data being scored) changes -- confirm before
# relying on the selected n-gram range.
cv = GridSearchCV(estimator=pipeline, param_grid=params, cv=5)
cv.fit(X_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('dtm', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...          random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'dtm__ngram_range': [(1, 1), (1, 2), (1, 3)], 'lda__n_components': [5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 25], 'lda__learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
# Report the winning hyper-parameter combination and its cross-validated score
best_params = cv.best_params_
best_score = cv.best_score_
print("Best model's parameters: ", best_params)
print("Best log likelihood score: ", best_score)

Best model's parameters:  {'dtm__ngram_range': (1, 1), 'lda__learning_decay': 0.5, 'lda__n_components': 9}
Best log likelihood score:  -920649.2271950821


#### Visualize Topics with pyLDAvis

First, let's re-build an LDA model with the best parameters.

In [116]:
# Vectorizer configured with the settings selected by the grid search
# (unigrams only), then fitted on the training text to build the
# document-term matrix.
vect = CountVectorizer(token_pattern=r'\w{2,}', min_df=10, ngram_range=(1, 1))
dtm_train = vect.fit_transform(X_train)

# LDA configured with the selected topic count and learning decay; the fixed
# seed keeps the fitted topics reproducible.
lda = LatentDirichletAllocation(
    random_state=42,
    learning_method='online',
    learning_decay=0.5,
    n_components=9,
)
lda.fit(dtm_train)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.5,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=9, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=42, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

Then, visualize the model with the pyLDAvis package, using t-SNE as the scaling technique to project the topics into two dimensions. The topic bubbles do not overlap and are reasonably sized, which suggests that our final LDA model is good to use.

In [118]:
# Build the pyLDAvis payload from the fitted LDA model, the training
# document-term matrix and the fitted vectorizer; `mds='tsne'` selects t-SNE
# for the 2-D topic layout.
# NOTE(review): newer pyLDAvis releases may expose this module under a
# different name than `pyLDAvis.sklearn` -- confirm against the installed version.
lda_display = skvis.prepare(lda, dtm_train, vect, mds='tsne')
# Render the interactive visualization inline as the cell output.
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


#### Top 10 Keywords for Each Topic

In [135]:
# Show top n keywords for each topic
def show_topics(vect=vect, model=lda, n_words=10):
    """Return the highest-weighted vocabulary terms for every topic.

    Parameters
    ----------
    vect : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``. Defaults to the notebook-level ``vect``.
    model : fitted topic model
        Must expose ``components_``, iterated as one row of term weights per
        topic. Defaults to the notebook-level ``lda``.
    n_words : int, default 10
        Number of top terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array per topic, holding its terms ordered from highest to
        lowest weight.

    Notes
    -----
    The defaults are bound when this ``def`` executes, so re-run this cell
    after re-fitting ``vect`` or ``lda`` (or pass them explicitly).
    """
    # `get_feature_names` was deprecated and later removed from scikit-learn;
    # prefer its replacement and only fall back on older versions.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
    keywords = np.asarray(get_names())
    topic_keywords = []
    for topic_weights in model.components_:
        # Negating the weights turns the ascending argsort into a
        # descending ranking; keep only the first `n_words` positions.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

# Topic - Keywords Dataframe: one row per topic, one column per keyword rank
topic_keywords = pd.DataFrame(show_topics())
n_topic_rows, n_word_cols = topic_keywords.shape

# Label both axes with one-based, human-readable names.
topic_keywords.index = ['Topic ' + str(num) for num in range(1, n_topic_rows + 1)]
topic_keywords.columns = ['Word ' + str(rank) for rank in range(1, n_word_cols + 1)]

# Transpose for display so each topic reads top-down as a column.
topic_keywords.T

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
Word 1,rice,business,business,business,school,business,rice,group,business
Word 2,salitre,year,year,sell,family,work,farmer,child,stock
Word 3,water,take,use,group,child,sell,baba,member,child
Word 4,month,child,personal,clothe,business,year,land,school,sell
Word 5,farm,start,house,child,request,home,sector,year,able
Word 6,farmer,clothe,old,woman,buy,product,area,usd,one
Word 7,many,work,sell,community,year,help,farm,woman,shop
Word 8,sector,purchase,child,small,expand,husband,bank,cow,start
Word 9,rural,care,information,entrepreneur,income,make,crop,farm,increase
Word 10,high,machine,make,life,sell,buy,part,milk,year


In [127]:
# Create topic distribution dataframe for train
# Each row is a document and each column a topic; rows keep the original
# index of the source series so they can be re-aligned later.
# NOTE(review): these column labels are zero-based ("Topic0", ...), while the
# keyword table labels topics from "Topic 1"; kept as-is because the labels
# are written to the exported CSV.
topic_names = ["Topic" + str(i) for i in range(lda.n_components)]
topic_train = pd.DataFrame(lda.transform(dtm_train), columns=topic_names, index=X_train.index)

# Create topic distribution dataframe for test
# The test text is only transformed with the vectorizer and LDA model fitted
# on the training data. A stray `lda.transform(dtm_train)` call whose result
# was discarded has been removed.
dtm_test = vect.transform(X_test)
topic_test = pd.DataFrame(lda.transform(dtm_test), columns=topic_names, index=X_test.index)

In [133]:
# Stack the train and test topic distributions and restore the original row
# order via the preserved index. `pd.concat` replaces `DataFrame.append`,
# which is deprecated and no longer available in pandas 2.x.
topics = pd.concat([topic_train, topic_test]).sort_index()
topics.shape

(6817, 9)

## Export Dataset with Topics

In [134]:
# Write the combined topic distributions to CSV; the index is written as the
# first column (default `to_csv` behavior), so rows can be matched back up.
topics.to_csv('derived_data/kiva_topics.csv')