In [112]:
import pandas as pd
from openpyxl import load_workbook

In [113]:
#import excel file
filename_from = "dummy_comment.xlsx"
sheetname_from = "Sheet1"

In [114]:
#put each comment into a list
df = pd.read_excel(filename_from,sheet_name = sheetname_from)
comments = df.comments.tolist()
print(comments)

['I hope the company could provide more annual leaves', 'Fuel up Friday is very useful', 'No email beyond working hours']


In [115]:
#Tokenize each comments
'''
import nltk
nltk.download('punkt')
'''
from nltk.tokenize import word_tokenize
tokenized = [word_tokenize(comment) for comment in comments]
print(tokenized)

[['I', 'hope', 'the', 'company', 'could', 'provide', 'more', 'annual', 'leaves'], ['Fuel', 'up', 'Friday', 'is', 'very', 'useful'], ['No', 'email', 'beyond', 'working', 'hours']]


In [116]:
from collections import Counter

# function that pulls chunks out of chunked sentence and finds the most common chunks
def np_chunk_counter(chunked_sentences):

    # create a list to hold chunks
    chunks = list()

    # for-loop through each chunked sentence to extract noun phrase chunks
    for chunked_sentence in chunked_sentences:
        for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'NP'):
            chunks.append(tuple(subtree))

    # create a Counter object
    chunk_counter = Counter()

    # for-loop through the list of chunks
    for chunk in chunks:
        # increase counter of specific chunk by 1
        chunk_counter[chunk] += 1

    # return 30 most frequent chunks
    return chunk_counter.most_common(30)


In [117]:
import re
#import nltk
#nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
pos_tag_comments = [pos_tag(comment) for comment in tokenized]
print(pos_tag_comments)

[[('I', 'PRP'), ('hope', 'VBP'), ('the', 'DT'), ('company', 'NN'), ('could', 'MD'), ('provide', 'VB'), ('more', 'JJR'), ('annual', 'JJ'), ('leaves', 'NNS')], [('Fuel', 'NNP'), ('up', 'RB'), ('Friday', 'NNP'), ('is', 'VBZ'), ('very', 'RB'), ('useful', 'JJ')], [('No', 'DT'), ('email', 'NN'), ('beyond', 'IN'), ('working', 'VBG'), ('hours', 'NNS')]]


In [118]:
#find verb pharse from each comment
from nltk import RegexpParser, Tree
#vp_chunk_grammar = "VP: {<VB.*><JJ*>*<NN><RB.?>?}"
vp_chunk_grammar = "VP: {<VB.*><JJ.*>*<NN.*><RB.?>?}"
# create RegexpParser object here
vp_chunk_parser = RegexpParser(vp_chunk_grammar)


#Tree.fromstring(str(first_vp)).pretty_print()
vp_chunked_comments = list()
for i in range(len(pos_tag_comments)):
    tree = vp_chunk_parser.parse(pos_tag_comments[i])
    vp_chunked_comments.append((tuple(tree.subtrees(filter=lambda t: t.label() == 'VP'))))

print(vp_chunked_comments)

[(Tree('VP', [('provide', 'VB'), ('more', 'JJR'), ('annual', 'JJ'), ('leaves', 'NNS')]),), (), (Tree('VP', [('working', 'VBG'), ('hours', 'NNS')]),)]


In [119]:
# find all noun pharse
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN*>}"
np_chunk_parser = RegexpParser(np_chunk_grammar)
np_chunked_comments = []

# chunk each sentence and append to chunked_oz
np_chunked_comments = list()
for i in range(len(pos_tag_comments)):
    tree = np_chunk_parser.parse(pos_tag_comments[i])
    np_chunked_comments.append((tuple(tree.subtrees(filter=lambda t: t.label() == 'NP'))))

print(np_chunked_comments)

[(Tree('NP', [('the', 'DT'), ('company', 'NN')]),), (), (Tree('NP', [('No', 'DT'), ('email', 'NN')]),)]


In [120]:
vp_column = []
for x in range(len(vp_chunked_comments)):
    token = []
    if len(vp_chunked_comments[x])==0:
        vp_column.append("")
        continue
    for y in range(len(vp_chunked_comments[x][0])):
        token.append(vp_chunked_comments[x][0][y][0])
    main_idea = " ".join(token)
    vp_column.append(main_idea)
    
df['main_idea_verb_pharse'] = vp_column

np_column = []
for x in range(len(np_chunked_comments)):
    token = []
    if len(np_chunked_comments[x])==0:
        np_column.append("")
        continue
    for y in range(len(np_chunked_comments[x][0])):
        token.append(np_chunked_comments[x][0][y][0])
    main_idea = " ".join(token)
    np_column.append(main_idea)

df['main_idea_noun_pharse'] = np_column
print(df)

                                            comments  \
0  I hope the company could provide more annual l...   
1                      Fuel up Friday is very useful   
2                      No email beyond working hours   

        main_idea_verb_pharse main_idea_noun_pharse  
0  provide more annual leaves           the company  
1                                                    
2               working hours              No email  


In [121]:
#Export to excel
filename_save_to = "dummy_comment2.xlsx"
sheetname_save_to = "nlp_comment"
with pd.ExcelWriter(filename_save_to,
                    mode='w', engine='openpyxl') as writer:  
    df.to_excel(writer, sheet_name=sheetname_save_to, index = False)

In [168]:
#Check NP tree
'''comment_index = 0
tree = np_chunk_parser.parse(pos_tag_comments[comment_index])
Tree.fromstring(str(tree)).pretty_print()'''

                                      S                                               
   ___________________________________|__________________________________              
  |      |        |         |         |         |         |              NP           
  |      |        |         |         |         |         |         _____|______       
I/PRP hope/VBP could/MD provide/VB more/JJR annual/JJ leaves/NNS the/DT     company/NN



In [61]:
# Remove stop words
'''#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
filtered_word = []
for comment in tokenized:
    for token in comment:
        temp_cell = []
        if token not in stop_words:
            temp_cell.append(token)
    filtered_word.append(temp_cell)
print(filtered_word)'''

[['leaves'], ['useful'], ['hours']]
