In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [1]:
import re
import numpy as np
import pickle
import time
import pandas as pd
import matplotlib.pyplot as plt
import os, sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from preprocess.clean_and_tokenize import clean_and_tokenize_one

[nltk_data] Downloading package punkt to /Users/sueliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def gen_topic_map(map_file):
    """Read the topic-to-description Excel sheet and return it as a dictionary.

    Parameters
    ----------
    map_file - path or file-like object accepted by ``pd.read_excel``; the
               sheet must contain the columns 'ID' and 'Summarized Topic Name'.

    Returns
    -------
    dict: {topic ID (int): summarized topic name}. If an ID appears more than
          once, the last row wins.
    """
    id_col, name_col = 'ID', 'Summarized Topic Name'
    # Only the two columns used here are required to be filled in: a blank cell
    # in any other column must not silently drop a topic, because a missing
    # topic ID surfaces later as a KeyError when its description is looked up.
    mapping = pd.read_excel(map_file).dropna(subset=[id_col, name_col])
    # Excel numeric cells are read as floats; cast so keys match integer topic indices.
    topic_ids = mapping[id_col].astype(int).tolist()
    return dict(zip(topic_ids, mapping[name_col].tolist()))

In [44]:
def predict_complaint_topics(lda_model, vectorizer, complaints_df, 
                               text_field = 'compliant_text_cleaned',
                               n_topics=45, topic_map=None,
                               top_n=5):
    """
    Predict the top_n most probable topics (with probabilities) for each complaint.

    Parameters
    ----------
    lda_model - fitted topic model; its ``transform`` must return one row of
                topic weights per document
    vectorizer - fitted vectorizer; its ``transform`` output is fed to ``lda_model``
    complaints_df - Pandas Dataframe: all complaint info as a Dataframe.
                    NOTE: a 'cleaned' column holding the result of
                    ``clean_and_tokenize_one`` is added to (or overwritten in)
                    this frame in place.
    text_field - string: the field name containing complaint text

    n_topics - int: number of topics in the US data (currently unused; kept so
               existing callers keep working)
    topic_map - dict: topic index (int) to description map
    top_n - int: top N topics to be returned per complaint.

    Returns
    -------
    list of dict, one per row of ``complaints_df`` in row order:
        {'Original narrative': text,
         'topics': {rank: {'topic_name': str, 'topic_prob': float}}}
    where rank 0 is the most probable topic. Returns None (after printing a
    message) when ``topic_map`` is not supplied.

    Raises
    ------
    KeyError - if a predicted topic index is missing from ``topic_map``.
    """
    if topic_map is None:
        print('Need topic index to description mapping!!! Stop now and check!!!')
        return

    complaints_df['cleaned'] = complaints_df[text_field].apply(clean_and_tokenize_one)

    vectorized = vectorizer.transform(complaints_df['cleaned'])
    topics = np.asarray(lda_model.transform(vectorized))

    # Read narratives positionally: the rows of `topics` follow row order, while
    # the frame's index labels are not guaranteed to be 0..n-1 (e.g. a frame
    # loaded with index_col or filtered beforehand).
    narratives = complaints_df[text_field].tolist()

    all_output = []
    for narrative, topic_row in zip(narratives, topics):
        # Indices of the top_n largest weights, most probable first.
        ranked_indices = np.argsort(topic_row)[::-1][:top_n]
        topics_data = {
            rank: {'topic_name': topic_map[int(topic_idx)],
                   # plain float so the result is JSON-serializable for any dtype
                   'topic_prob': float(topic_row[topic_idx])}
            for rank, topic_idx in enumerate(ranked_indices)
        }
        all_output.append({'Original narrative': narrative,
                           'topics': topics_data})
    return all_output

In [57]:
!python LDA_identify_topics.py

[nltk_data] Downloading package punkt to /Users/sueliu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Apply the vectorizer and LDA model trained on the original US data

In [3]:
# Load the pickled bundle and pull out the entries stored under 'model' and 'vectorizer'.
# SECURITY: pickle.load can execute arbitrary code while loading; only open
# pickle files that come from a trusted source.
with open('lda_45_topics.pkl', 'rb') as f:
    data = pickle.load(f)
lda_model = data['model']
vectorizer = data['vectorizer']

# Topic ID -> summarized topic name lookup, read from the Excel sheet.
topic_map = gen_topic_map('topics_matching.xlsx')

In [45]:
# Load the complaints table, using the CSV's 'Unnamed: 0' column as the row index.
# NOTE(review): 'Unnamed: 0' is presumably a row index written out by an earlier
# DataFrame.to_csv call — confirm.
complaints_df = pd.read_csv('Nationwide_complaints.csv', index_col='Unnamed: 0')

In [46]:
# Score every complaint and keep the 5 most probable topics for each one.
# Side effect: predict_complaint_topics writes a 'cleaned' column into complaints_df.
output = predict_complaint_topics(lda_model, vectorizer, complaints_df, 
                               text_field = 'compliant_text_cleaned',
                               n_topics=45, topic_map=topic_map,
                               top_n=5)

### Output to JSON

In [51]:
# Persist the predictions as JSON.
# json.dump writes dictionary keys as strings, so the integer rank keys inside
# each 'topics' dict come back as '0', '1', ... when the file is loaded again.
import json
with open('nationwide_identified_topics.json', 'w') as fout:
    json.dump(output, fout)

### Read the JSON back in

In [58]:
# Reload the predictions from the JSON file written above.
with open('nationwide_identified_topics.json', 'r') as f:
    outputs = json.load(f)

In [60]:
# Inspect a single complaint: print its text, then show its topics as a table.
# from_dict makes one column per rank; .T flips that so each ranked topic is a
# row with topic_name / topic_prob columns.
idx = 35  # position in `outputs` of the complaint to inspect
print(outputs[idx]['Original narrative'])
pd.DataFrame.from_dict(outputs[idx]['topics']).T

Absolutely awful bank! I have moved my account back to Santander who were so much better. I only moved over to try to gain advantage from the new account offer and savings account interest.Nationwide have done absolutely nothing to help resolve an issue I had with an online fraudulent transaction, other than waste my time by telling me that I could not report the transaction for 30 days, then they wasted my time by requesting the same information which I had already submitted over and over again..and now finally they have closed the dispute because it has gone over the time they allot to investigating a fraudulent transactionTheir customer service is an absolute farce and utter disgrace! I intend to report them to all the relevant authorities.Whatever you do, do not open an account with them. They are not interested in your wellbeing at all.DISGRACEFUL!


Unnamed: 0,topic_name,topic_prob
0,Customer service,0.2694
1,Opening or closing account,0.22374
2,Fraudulent transaction,0.15004
3,Customer support,0.135509
4,Interest rates,0.0571193
