In [1]:
import pandas as pd
from rake_nltk import Rake

In [2]:
# Load the incident records from the Excel workbook into a DataFrame.
incidents = pd.read_excel('incidents_with_problems.xlsx')

### Show example of keyword extraction

In [4]:
# Show the raw Description text of the first incident; it is the sample
# input for the keyword-extraction example.
incidents['Description'].iloc[0]

"<EXECUTIVE NOTIFY: FEDEX-SEV4>\n06/17/18 01:04 MOC / GOC and PIT Phone Turret Phone issue INCIDENT START MOC (Maintenance Operation Center), GOC (Global Operations Center) and PITT users are experiencing issues with their new Turret phones. The issue is impacting PIT customers ability to place orders and GOC's ability to place or receive calls out of the Control Center. Per MOC some ACARS dependencies will are being impacted as well. VOIP support teams have been working to resolve the issue and a high level case has been opened with CISCO-TAC. The issue began at 22:00 CDT. The IT Command Center was informed at 22:46 CDT.\n"

In [5]:
# Run RAKE on the first incident's Description, limiting candidate phrases
# to between 2 and 8 words.
r = Rake(min_length=2, max_length=8)
r.extract_keywords_from_text(incidents.iloc[0]['Description'])
# extract_keywords_from_text() stores its results on `r` and returns None, so
# the ranked phrases are read back explicitly rather than keeping that return
# value (which would leave `keywords` bound to None).
keywords = r.get_ranked_phrases()

In [6]:
# Display the ranked phrases held by `r` after the extraction step.
r.get_ranked_phrases()

['pit phone turret phone issue incident start moc',
 'maintenance operation center ), goc',
 'impacting pit customers ability',
 'new turret phones',
 'voip support teams',
 'high level case',
 'global operations center',
 'issue began',
 'per moc',
 '04 moc',
 'control center',
 'command center',
 'receive calls',
 'place orders',
 'pitt users',
 'experiencing issues',
 'executive notify',
 'acars dependencies',
 '46 cdt',
 '18 01',
 '00 cdt']

#### Define function to map Description feature to Keywords
After some experimentation, `min_length=2` and `max_length=8` for the keyword phrases seemed to work best with the Descriptions in this dataset.
I also chose to keep the top 4 ranked phrases: for many descriptions the top-ranked phrase was not a desirable keyword, but in most of the cases I looked at, a desirable phrase appeared within the first 4.
Both choices may need to be adjusted later, and other parameters (such as the stop-word list) can also be tuned.

In [7]:
def extract_keywords(text, top_n=4, min_length=2, max_length=8):
    """Return the highest-ranked RAKE keyword phrases found in ``text``.

    Parameters
    ----------
    text : str
        Free-text description to extract phrases from. Anything that is not
        a non-blank string (``None``, NaN from an empty spreadsheet cell, a
        number, ``''``) yields an empty list instead of raising.
    top_n : int, default 4
        Maximum number of ranked phrases to return.
    min_length : int, default 2
        Passed to ``Rake`` as the minimum phrase length.
    max_length : int, default 8
        Passed to ``Rake`` as the maximum phrase length.

    Returns
    -------
    list of str
        Up to ``top_n`` phrases in the order returned by
        ``Rake.get_ranked_phrases()``; may be shorter, or empty.
    """
    # Missing cells arrive as float NaN when the column is read from Excel;
    # handing those to Rake would raise, so short-circuit here.
    if not isinstance(text, str) or not text.strip():
        return []

    rake = Rake(min_length=min_length, max_length=max_length)
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()[:top_n]

### Add Keywords feature extracting from Description

In [8]:
# Derive one list of keyword phrases per incident from its Description.
keyword_lists = incidents['Description'].apply(extract_keywords)

if 'Keywords' in incidents.columns:
    # The cell has been run before: DataFrame.insert would raise ValueError
    # on the duplicate column, so refresh the existing column instead.
    incidents['Keywords'] = keyword_lists
else:
    # First run: place the new column at position 2.
    incidents.insert(loc=2, column='Keywords', value=keyword_lists)

incidents.head()

Unnamed: 0,Number,Problem,Keywords,Active,Activity due,Additional assignee list,Approval,Approval history,Approval set,Assigned to,...,Awareness (Min),CHECKOUT (MIN),ESCALATION (MIN),REMEDIATION (MIN),TIMELINE DURATION (MIN),TRIAGE (MIN),Acknowledge (Min),BSD call,New Call,Vendor Exception
0,INC010068680,PRB0060003,[pit phone turret phone issue incident start m...,False,NaT,,Not Yet Requested,,,,...,,,,,,,,,,
1,INC010000197,PRB0060003,[pit phone turret phone issue incident start m...,False,NaT,,Not Yet Requested,,,Kevin Moore,...,66.0,0.0,0.0,14.0,362.0,282.0,66.0,,,
2,INC010003952,PRB0060019,[multiple locations reporting user login issue...,False,NaT,,Not Yet Requested,,,Kevin Mathis,...,1.0,0.0,60.0,0.0,101.0,40.0,1.0,,,
3,INC010269074,PRB0060026,[multiple customer service systems including c...,False,NaT,,Not Yet Requested,,,Rochelle Coleman,...,,,,,,,,,,
4,INC010269078,PRB0060026,[multiple customer service systems including c...,False,NaT,,Not Yet Requested,,,Rochelle Coleman,...,,,,,,,,,,


In [9]:
# Keep only the incident number and its linked problem ID for the counts below.
num_problems = incidents[['Number', 'Problem']]

In [10]:
# Count how many incidents reference each problem ID.
prob_count = num_problems['Problem'].value_counts()

# Keep only the rows whose problem ID is shared by more than one incident.
shared_problem_ids = prob_count.loc[prob_count.gt(1)].index
has_shared_problem = num_problems['Problem'].isin(shared_problem_ids)
num_problems = num_problems.loc[has_shared_problem]

# Show the incident count per remaining problem ID.
num_problems['Problem'].value_counts()

PRB0060187    24
PRB0062159    15
PRB0060026    13
PRB0060758    10
PRB0060146    10
PRB0060985     9
PRB0060120     9
PRB0060147     6
PRB0062412     5
PRB0060990     5
PRB0061073     4
PRB0061956     4
PRB0062104     4
PRB0062432     4
PRB0062207     4
PRB0062358     4
PRB0061373     4
PRB0060194     4
PRB0062384     4
PRB0062320     4
PRB0060188     3
PRB0061448     3
PRB0061283     3
PRB0060718     3
PRB0060075     3
PRB0061315     3
PRB0062186     3
PRB0063084     3
PRB0062329     3
PRB0062222     3
              ..
PRB0061590     2
PRB0062744     2
PRB0061386     2
PRB0061163     2
PRB0061539     2
PRB0062125     2
PRB0060414     2
PRB0061547     2
PRB0061774     2
PRB0061387     2
PRB0062011     2
PRB0061454     2
PRB0063184     2
PRB0062901     2
PRB0061856     2
PRB0060251     2
PRB0062467     2
PRB0061853     2
PRB0061940     2
PRB0061450     2
PRB0061593     2
PRB0061988     2
PRB0061460     2
PRB0061453     2
PRB0061270     2
PRB0060003     2
PRB0062590     2
PRB0061035    