In [1]:
import pandas as pd
import numpy as np

# Import the data and assign description column to process

In [11]:
csv = 'PI_NEWandOLD.csv'
issues_df = pd.read_csv(csv, encoding='latin1',low_memory=False)

In [12]:
issues_df.head()

Unnamed: 0.1,Unnamed: 0,project_id,name,job_number,type,value,city,state_or_province,postal_code,issue_type,issue_subtype,title,description
0,0,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Pre-Punch List,,,Remove nails around dock angles.
1,1,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Pre-Punch List,,,"Grind in a 3/4"" chamfer at all exterior corner..."
2,2,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Architect Punch List,,,Install missing switch and data outlets
3,3,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Architect Punch List,,,Install cover plate on light switch
4,4,01fa4f3a-9969-4069-a5b3-72fb630a0143,Gestamp,102601,Manufacturing / Factory,20549347.0,McCalla,AL,35111,Punch List : Architect Punch List,,,Install base


In [13]:
description_text = issues_df[["description"]]
description_text['index']=description_text.index
documents = description_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
print(len(documents))
print(documents[:30])

159285
                                          description  index
0                   Remove nails around dock angles.       0
1   Grind in a 3/4" chamfer at all exterior corner...      1
2             Install missing switch and data outlets      2
3                 Install cover plate on light switch      3
4                                        Install base      4
5                       Repaint north and south walls      5
6               Complete striping and traffic signage      6
7                                   Clean floor, typ.      7
8          Touch up yellow paint at railing door x118      8
9                   Fix weatherstripping at door x119      9
10  Fill nail holes in all door windows, offices a...     10
11                                    Paint east wall     11
12                                   Paint above door     12
13  Verify no base heater in women's\n\nClayco res...     13
14  Patch top of SE drive in ramp so it is level a...     14
15               

# Data Pre-processing which includes tokenization, removal of stop words and words with fewer than 3 words, lemmatization, stemming

In [15]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zigot\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
stemmer = PorterStemmer()

In [17]:
#Write a function to perform lemmatize and stem preprocessing steps on the data set

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [18]:
#Select a document to preview after preprocessing

doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Remove', 'paint', 'from', 'rubber', 'base', 'left', 'of', 'room', '606']


 tokenized and lemmatized document: 
['remov', 'paint', 'rubber', 'base', 'leav', 'room']


In [24]:
# Pre-process all of the description cells

processed_docs = documents['description'].fillna('').astype(str).map(preprocess)
processed_docs[:30]

0                             [remov, nail, dock, angl]
1     [grind, chamfer, exterior, corner, tilt, panel...
2                  [instal, miss, switch, data, outlet]
3                 [instal, cover, plate, light, switch]
4                                        [instal, base]
5                         [repaint, north, south, wall]
6                     [complet, strip, traffic, signag]
7                                        [clean, floor]
8                    [touch, yellow, paint, rail, door]
9                                                [door]
10              [nail, hole, door, window, offic, mezz]
11                                  [paint, east, wall]
12                                        [paint, door]
13    [verifi, base, heater, women, clayco, respons,...
14    [patch, drive, ramp, level, length, chamfer, c...
15                            [clean, haze, tile, wall]
16     [remov, stick, insul, drywal, deck, draw, paint]
17                [replac, thermax, electr, room

# Bag of Words on the Data Set

In [25]:
dictionary = gensim.corpora.Dictionary(processed_docs)


In [27]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 angl
1 dock
2 nail
3 remov
4 chamfer
5 corner
6 exterior
7 grind
8 insid
9 loung
10 panel
11 parapet
12 tilt
13 trucker
14 upper
15 data
16 instal
17 miss
18 outlet
19 switch
20 cover
21 light
22 plate
23 base
24 north
25 repaint
26 south
27 wall
28 complet
29 signag
30 strip


In [28]:
# filter out tokens that appear in less than 15 documents, more than .5 documents and the most 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=10000)

In [29]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 30:
        break

0 angl
1 dock
2 nail
3 remov
4 chamfer
5 corner
6 exterior
7 grind
8 insid
9 loung
10 panel
11 parapet
12 tilt
13 trucker
14 upper
15 data
16 instal
17 miss
18 outlet
19 switch
20 cover
21 light
22 plate
23 base
24 north
25 repaint
26 south
27 wall
28 complet
29 signag
30 strip


In [30]:
#For each cell, we create a dictionary reporting how many words and how many times those words appear
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(3, 1), (23, 1), (35, 1), (63, 1), (155, 1), (384, 1)]

In [31]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 3 ("remov") appears 1 time.
Word 23 ("base") appears 1 time.
Word 35 ("paint") appears 1 time.
Word 63 ("room") appears 1 time.
Word 155 ("leav") appears 1 time.
Word 384 ("rubber") appears 1 time.


In [32]:
# TF-IDF

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.575943665995411),
 (1, 0.5680507148758992),
 (2, 0.5122679097636066),
 (3, 0.28842480398740267)]


# Run LDA using Bag of Words

In [34]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=30, id2word=dictionary, passes=4, workers=2)

In [35]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.166*"open" + 0.077*"instal" + 0.061*"secur" + 0.055*"power" + 0.053*"toilet" + 0.051*"wire" + 0.050*"scratch" + 0.048*"thermostat" + 0.047*"receptacl" + 0.031*"box"
Topic: 1 
Words: 0.350*"floor" + 0.112*"concret" + 0.073*"sink" + 0.051*"hardwar" + 0.048*"instal" + 0.046*"handl" + 0.037*"core" + 0.027*"stick" + 0.022*"need" + 0.020*"faucet"
Topic: 2 
Words: 0.242*"remov" + 0.189*"patch" + 0.119*"hole" + 0.036*"need" + 0.027*"screw" + 0.023*"nail" + 0.022*"mark" + 0.022*"column" + 0.019*"wear" + 0.017*"sand"
Topic: 3 
Words: 0.336*"clean" + 0.108*"unit" + 0.047*"test" + 0.044*"final" + 0.039*"counter" + 0.035*"debri" + 0.029*"temp" + 0.027*"turn" + 0.025*"suppli" + 0.023*"sensor"
Topic: 4 
Words: 0.198*"caulk" + 0.107*"joint" + 0.091*"edg" + 0.074*"sealant" + 0.061*"elev" + 0.047*"gener" + 0.037*"view" + 0.034*"loos" + 0.032*"need" + 0.027*"barricad"
Topic: 5 
Words: 0.131*"door" + 0.111*"adjust" + 0.082*"close" + 0.071*"instal" + 0.062*"stop" + 0.041*"properli" + 0.0

# Run LDA using TF-IDF

In [37]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=30, id2word=dictionary, passes=4, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.075*"sprinkler" + 0.059*"head" + 0.043*"escutcheon" + 0.026*"instal" + 0.020*"threshold" + 0.019*"lift" + 0.017*"attach" + 0.016*"photo" + 0.015*"draw" + 0.014*"report"
Topic: 1 Word: 0.188*"door" + 0.117*"stop" + 0.086*"gasket" + 0.077*"hardwar" + 0.064*"miss" + 0.063*"instal" + 0.046*"handl" + 0.037*"break" + 0.036*"loos" + 0.035*"lock"
Topic: 2 Word: 0.076*"smoke" + 0.074*"detector" + 0.048*"clean" + 0.040*"room" + 0.038*"cap" + 0.033*"instal" + 0.032*"exist" + 0.032*"cabl" + 0.031*"pictur" + 0.031*"sensor"
Topic: 3 Word: 0.071*"rail" + 0.033*"stair" + 0.028*"instal" + 0.022*"trash" + 0.021*"materi" + 0.017*"need" + 0.016*"hang" + 0.014*"secur" + 0.013*"land" + 0.013*"temp"
Topic: 4 Word: 0.061*"water" + 0.058*"crack" + 0.031*"support" + 0.029*"pip" + 0.028*"pipe" + 0.023*"vent" + 0.021*"connect" + 0.020*"scuff" + 0.019*"valv" + 0.018*"addit"
Topic: 5 Word: 0.170*"drywal" + 0.095*"cabinet" + 0.050*"screw" + 0.038*"offic" + 0.037*"mark" + 0.029*"touchup" + 0.026*"pai

In [38]:
#Performance evaluation 
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))


Score: 0.43638038635253906	 
Topic: 0.263*"touch" + 0.235*"paint" + 0.116*"frame" + 0.081*"door" + 0.063*"wall"

Score: 0.4302833378314972	 
Topic: 0.212*"base" + 0.103*"instal" + 0.071*"floor" + 0.070*"complet" + 0.055*"final"
