In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os, sys
import pickle
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Put the parent directory (relative to the current working directory) on
# sys.path so the project-local `preprocess` package imported below resolves.
# The membership check keeps re-runs of this cell from appending duplicates.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from preprocess.clean_and_tokenize import clean_and_tokenize_one

[nltk_data] Downloading package punkt to /Users/sueliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Apply the vectorizer and LDA model trained on the original US data

In [3]:
# Load the pickled bundle and pull out its two entries: the topic model
# ('model') and the vectorizer ('vectorizer').
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('lda_45_topics.pkl', 'rb') as f:
    data = pickle.load(f)
lda_model, vectorizer = data['model'], data['vectorizer']

In [16]:
def predict_complaint_topics(lda_model, vectorizer, complaints_df,
                             text_field='compliant_text_cleaned',
                             n_topics=45, topic_map=None,
                             top_n=5):
    """
    Predict the top_n most probable topics for every complaint.

    Parameters
    ----------
    lda_model - Latent Dirichlet Model trained on US data (its ``transform``
                method is called on the vectorized text)
    vectorizer - CountVectorizer trained on US data (its ``transform`` method
                 is called on the cleaned text)
    complaints_df - Pandas Dataframe: all complaint info as a Dataframe.
                    Side effect: a 'cleaned' column holding the result of
                    ``clean_and_tokenize_one`` is added to (or overwritten
                    in) this frame.
    text_field - string: the field name containing complaint text

    n_topics - int: number of topics in the US data (currently unused; kept
               so existing callers keep working)
    topic_map - dict: topic index to description map (required)
    top_n - int: top N topics to be displayed.

    Returns
    -------
    list of dict, one entry per complaint in row order:
        {'Original narrative': text,
         'topics': {rank: {'topic_idx', 'topic_name', 'topic_prob'}}}
    where rank 0 is the most probable topic. Returns None (after printing a
    message) when topic_map is not supplied.

    Raises
    ------
    KeyError if a predicted topic index is missing from topic_map.
    """
    if topic_map is None:
        print('Need topic index to description mapping!!! Stop now and check!!!')
        return

    complaints_df['cleaned'] = complaints_df[text_field].apply(clean_and_tokenize_one)

    vectorized = vectorizer.transform(complaints_df['cleaned'])
    topics = lda_model.transform(vectorized)

    narratives = complaints_df[text_field]
    all_output = []
    for row_pos in range(topics.shape[0]):
        row = topics[row_pos, :]
        # Topic indices ordered from most to least probable, cut to top_n.
        ranked = np.argsort(row)[::-1][:top_n]

        topics_data = {
            rank: {'topic_idx': int(idx),
                   'topic_name': topic_map[int(idx)],
                   # Plain float so the result stays JSON-serializable
                   # whatever dtype the model returns.
                   'topic_prob': float(row[idx])}
            for rank, idx in enumerate(ranked)
        }
        all_output.append({
            # Rows of `topics` are positional, so look the text up by
            # position; label lookup breaks on a non-0..n-1 index.
            'Original narrative': narratives.iloc[row_pos],
            'topics': topics_data,
        })
    return all_output

In [5]:
def gen_topic_map(map_file):
    """
    Build a topic-ID -> topic-description lookup from an Excel sheet.

    Parameters
    ----------
    map_file - spreadsheet passed to ``pd.read_excel``; it must contain the
               columns 'ID' and 'Summarized Topic Name'

    Returns
    -------
    dict: {topic ID cast to int: summarized topic name}

    Rows with a missing value in any column of the sheet are dropped before
    the lookup is built.
    """
    id_col, name_col = 'ID', 'Summarized Topic Name'
    sheet = pd.read_excel(map_file).dropna()
    lookup = sheet[[id_col, name_col]].astype({id_col: int}).set_index(id_col)
    return lookup[name_col].to_dict()

In [6]:
# Build the topic-ID -> description lookup from the matching spreadsheet.
topic_map = gen_topic_map('topics_matching.xlsx')

In [11]:
# Load the complaints table, using the CSV's 'Unnamed: 0' column as the index.
# NOTE(review): that column looks like a previously saved index, so its labels
# may not be a contiguous 0..n-1 range; confirm before relying on label lookups.
complaints_df = pd.read_csv('Nationwide_complaints.csv', index_col='Unnamed: 0')

In [17]:
# Score every complaint with the loaded model and keep the 5 most probable
# topics per complaint, labelled through topic_map.
output = predict_complaint_topics(lda_model, vectorizer, complaints_df, 
                               text_field = 'compliant_text_cleaned',
                               n_topics=45, topic_map=topic_map,
                               top_n=5)

### Output to JSON

In [18]:
# Persist the per-complaint topic predictions as JSON.
# (`json` is imported with the other modules in the notebook's import cell.)
with open('identified_topics.json', 'w') as fout:
    json.dump(output, fout)

### Read back in

In [19]:
# Sanity check: reload the file that was just written and show the first record.
# JSON object keys are always strings, so the integer rank keys of 'topics'
# come back as '0', '1', ... after the round trip.
with open('identified_topics.json', 'r') as f:
    outputs = json.load(f)
print(outputs[0])

{'Original narrative': "Beware. Nationwide has a new policy of closing down the accounts of customers who complain a certain amount of times. Nationwide doesn't take into account whether the complaints have been upheld by themselves.", 'topics': {'0': {'topic_idx': 29, 'topic_name': 'Opening or closing account', 'topic_prob': 0.45583688482477036}, '1': {'topic_idx': 35, 'topic_name': 'Insurance and customer protections', 'topic_prob': 0.1490589012907176}, '2': {'topic_idx': 28, 'topic_name': 'Legal complaints', 'topic_prob': 0.07535485718420624}, '3': {'topic_idx': 14, 'topic_name': 'Customer service', 'topic_prob': 0.05519251344301094}, '4': {'topic_idx': 16, 'topic_name': 'Terms and conditions of accounts', 'topic_prob': 0.006455277167451151}}}
