In [2]:
import pandas as pd
import numpy as np
import re, nltk, spacy, gensim


# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

# Path of the CSV export to analyse (relative to the notebook's working directory).
csv = 'PI_NEWandOLD.csv'
# low_memory=False has pandas parse the whole file in one pass instead of in chunks.
issues_df = pd.read_csv(
    csv,
    encoding='latin1',
    low_memory=False,
)

In [2]:
# Show the loaded frame as this cell's output.
issues_df

Unnamed: 0.1,Unnamed: 0,project_id,name,job_number,type,value,city,state_or_province,postal_code,issue_type,issue_subtype,title,description
0,0,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Pre-Punch List,,,Remove nails around dock angles.
1,1,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Pre-Punch List,,,"Grind in a 3/4"" chamfer at all exterior corner..."
2,2,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Architect Punch List,,,Install missing switch and data outlets
3,3,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Architect Punch List,,,Install cover plate on light switch
4,4,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Architect Punch List,,,Install base
...,...,...,...,...,...,...,...,...,...,...,...,...,...
160684,59079,69bb62b9-8abd-4b2d-8393-abb625f1023c,10104186 - Project Kodiak,10104186,Warehouse (non-manufacturing),26748673.0,Salisbury,NC,28146,Observation,Safety,Fuel stored next to Natural gas Trailer,Owned by Chewy Sub but still needs to be removed.
160685,59080,69bb62b9-8abd-4b2d-8393-abb625f1023c,10104186 - Project Kodiak,10104186,Warehouse (non-manufacturing),26748673.0,Salisbury,NC,28146,Quality,Workmanship,Plywood Enclosure needs some attention,
160686,59081,69bb62b9-8abd-4b2d-8393-abb625f1023c,10104186 - Project Kodiak,10104186,Warehouse (non-manufacturing),26748673.0,Salisbury,NC,28146,Quality,Workmanship,Replace Damaged base course asphalt,Need a map created to make sure we pick up all...
160687,59082,69bb62b9-8abd-4b2d-8393-abb625f1023c,10104186 - Project Kodiak,10104186,Warehouse (non-manufacturing),26748673.0,Salisbury,NC,28146,Quality,Quality,Concrete Acid etched,Need to come up with a plan to repair the slab...


In [3]:
# Discard rows with no description: the description text is what every later cell works on.
issues_df = issues_df.dropna(
    axis='index',
    how='any',
    subset=['description'],
)

In [4]:
# Start from the description column as a plain Python list.
data = issues_df.description.values.tolist()

# Apply each clean-up pass to the whole list, in this order.
for pattern, replacement in (
    (r'\S*@\S*\s?', ''),  # e-mail-like tokens (anything containing "@")
    (r'\s+', ' '),        # whitespace runs, including newlines, become one space
    (r"\'", ""),          # single quotes
):
    data = [re.sub(pattern, replacement, sent) for sent in data]

# Peek at the first cleaned description.
pprint(data[:1])

['Remove nails around dock angles. ']


In [5]:
def sent_to_words(sentences):
    """Lazily turn each sentence into a list of word tokens.

    Every item is coerced to ``str`` and run through gensim's
    ``simple_preprocess``. ``deacc=True`` strips accent marks from the
    tokens; punctuation is dropped by the tokenizer itself.

    Yields one token list per input sentence, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for text in sentences:
        tokens = tokenize(str(text), deacc=True)
        yield tokens


# Materialise the generator so the token lists can be indexed and reused.
data_words = list(sent_to_words(data))
print(data_words[:1])

[['remove', 'nails', 'around', 'dock', 'angles']]


In [6]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], nlp_model=None):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents. The tokens of each document are joined with
        single spaces before being handed to the pipeline.
    allowed_postags : collection of str
        Coarse part-of-speech tags (``token.pos_``) to keep. Only read,
        never mutated.
    nlp_model : spaCy-style pipeline, optional
        Object exposing ``.pipe(texts)`` that yields documents whose tokens
        have ``lemma_`` and ``pos_``. When omitted, the notebook-level
        ``nlp`` is looked up at call time, as before.

    Returns
    -------
    list of str
        One space-separated string of lemmas per input document, in input
        order. A document with no surviving token yields ``""``.
    """
    pipeline = nlp if nlp_model is None else nlp_model
    keep = frozenset(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    texts_out = []
    # pipe() processes the documents in batches instead of one call per document.
    for doc in pipeline.pipe(joined):
        # Skip the '-PRON-' placeholder lemma (emitted for pronouns by spaCy 2.x)
        # entirely; substituting '' for it left stray double spaces in the output.
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in keep and token.lemma_ != '-PRON-'
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

In [7]:
# Load spaCy's small English pipeline with the parser and NER components switched off;
# only part-of-speech tags and lemmas are needed below.
# The model has to be installed first: python -m spacy download en_core_web_sm
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

# Lemmatize every document, retaining nouns and verbs only.
data_lemmatized = lemmatization(
    data_words,
    allowed_postags=['NOUN', 'VERB'],
)
print(data_lemmatized[:2])

['remove nail dock angle', 'grind chamfer corner tilt panel corner parapet corner trucker lounge parapet detail']


In [8]:
# Bag-of-words counts over the lemmatized documents.
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=10,                        # ignore terms that occur in fewer than 10 documents
    stop_words='english',             # drop scikit-learn's built-in English stop words
    lowercase=True,
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumeric characters
    max_features=5000,                # cap the vocabulary at the 5000 most frequent terms
)
# Learn the vocabulary and build the sparse document-term matrix in one step.
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [9]:
# 10-topic LDA trained with the online (mini-batch) variational method.
lda_model = LatentDirichletAllocation(
    n_components=10,
    max_iter=10,
    learning_method='online',
    random_state=100,    # fixed seed so repeated fits give the same topics
    batch_size=50,
    evaluate_every=-1,   # do not evaluate perplexity during training
    n_jobs=-1,           # use all available processors
)

In [10]:
# Fit the LDA model on the document-term matrix and keep the transformed output it returns.
lda_output = lda_model.fit_transform(data_vectorized)

In [11]:
# Print the estimator's repr to show the hyper-parameters it is configured with.
print(lda_model)

LatentDirichletAllocation(batch_size=50, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)
The history saving thread hit an unexpected error (OperationalError('disk I/O error',)).History will not be written to the database.


In [12]:
# Approximate log-likelihood of the corpus under the fitted model (higher is better).
log_likelihood = lda_model.score(data_vectorized)
print("Log Likelihood: ", log_likelihood)

# Perplexity = exp(-1. * log-likelihood per word) (lower is better).
perplexity = lda_model.perplexity(data_vectorized)
print("Perplexity: ", perplexity)

# Full set of hyper-parameters the model was built with.
fitted_params = lda_model.get_params()
pprint(fitted_params)

Log Likelihood:  -3644956.9656807533
Perplexity:  641.0392796754181
{'batch_size': 50,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'online',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 10,
 'n_jobs': -1,
 'perp_tol': 0.1,
 'random_state': 100,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}


In [None]:
# Grid to explore: 5 topic counts x 3 learning-decay values = 15 combinations.
search_params = {
    'n_components': [10, 15, 20, 25, 30],
    'learning_decay': [.5, .7, .9],
}

# Base estimator; the search sets n_components and learning_decay from the grid.
lda = LatentDirichletAllocation(
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)

# Cross-validated exhaustive search over the grid, fitted on the document-term matrix.
model = GridSearchCV(lda, param_grid=search_params)
model.fit(data_vectorized)



In [None]:
# Estimator refit with the winning parameter combination.
best_lda_model = model.best_estimator_

# Winning parameters and their mean cross-validated score.
print("Best Model's Params: ", model.best_params_)
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity of the best model on the full document-term matrix.
best_perplexity = best_lda_model.perplexity(data_vectorized)
print("Model Perplexity: ", best_perplexity)

In [None]:
# Create the Document-Topic matrix: one row per document, one column per topic
lda_output = best_lda_model.transform(data_vectorized)
# Column names
topicnames = ['Topic' + str(i) for i in range(best_lda_model.n_components)]
# Index names, sized from the matrix itself so the labels always match its rows
docnames = ['Doc' + str(i) for i in range(lda_output.shape[0])]
# Make the pandas dataframe (weights rounded to 2 decimals for display)
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get the dominant topic from the unrounded weights: after rounding, two close
# weights can tie and argmax would then pick the first one regardless of which is larger
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
    """Return the CSS text-colour rule for a cell: green above 0.1, black otherwise."""
    if val > .1:
        color = 'green'
    else:
        color = 'black'
    return 'color: {col}'.format(col=color)
def make_bold(val):
    """Return the CSS font-weight rule for a cell: 700 above 0.1, 400 otherwise."""
    if val > .1:
        weight = 700
    else:
        weight = 400
    return 'font-weight: {weight}'.format(weight=weight)
# Apply the style to the topic-weight columns only: 'dominant_topic' holds a topic
# index, not a weight, so the > .1 threshold would highlight every index >= 1
df_document_topics = (
    df_document_topic.head(15)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

In [None]:
# Topic-Keyword Matrix: one row per topic, one column per vocabulary term
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# get_feature_names() is gone from scikit-learn >= 1.2; use its replacement when
# available and fall back to the old name on older releases
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Assign Column and Index
df_topic_keywords.columns = feature_names
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary, in the column order of ``lda_model.components_``.
    lda_model : fitted topic model exposing ``components_``
        One row of term weights per topic.
    n_words : int
        Number of terms to keep per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, ordered from highest to lowest weight.

    Note: the ``vectorizer`` and ``lda_model`` defaults are the objects those
    notebook variables referred to when this ``def`` was executed.
    """
    # get_feature_names() is gone from scikit-learn >= 1.2; use its replacement
    # when available and fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    keywords = np.array(names)
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negating the weights makes the ascending argsort list the largest first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
# Top 15 terms of every topic in the tuned model.
topic_keywords = show_topics(
    vectorizer=vectorizer,
    lda_model=best_lda_model,
    n_words=15,
)

# Topic - Keywords Dataframe: row i is topic i, column j holds its j-th ranked term.
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_top_words = df_topic_keywords.shape
df_topic_keywords.columns = ['Word ' + str(rank) for rank in range(n_top_words)]
df_topic_keywords.index = ['Topic ' + str(topic) for topic in range(n_topics)]
df_topic_keywords