In [1]:
# Import libraries
import pandas as pd
import numpy as np
#import math as m
#import scipy as sc
import re

import nltk
from nltk.corpus import stopwords, words

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from textblob import TextBlob, Word
# NLTK stemmers
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
#import string
from sklearn.metrics.pairwise import cosine_similarity
#from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
# create an object of class TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

In [2]:
# English stop-word list from NLTK, used by preprocess_list to drop stop words.
# NOTE: NLTK needs its 'stopwords' corpus available locally for this lookup
# (e.g. after nltk.download('stopwords')).
stop_words = nltk.corpus.stopwords.words('english')

In [3]:
# Define function to lemmatize each word with its POS tag
def lemmatize_with_postag(sentence):
    """Lemmatize every tagged word of ``sentence`` and re-join them with spaces.

    The text is wrapped in a ``TextBlob``; each (word, tag) pair from its
    ``.tags`` is lemmatized with a one-letter POS code looked up from the
    first character of the tag.

    Parameters
    ----------
    sentence : str
        Text to lemmatize (a full sentence or a single word).

    Returns
    -------
    str
        The lemmas, in order, separated by single spaces.
    """
    # First letter of the tag -> POS code handed to Word.lemmatize().
    pos_code_by_initial = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}

    lemmas = []
    for token, pos_tag in TextBlob(sentence).tags:
        # Tags outside the four mapped initials fall back to 'n'.
        pos_code = pos_code_by_initial.get(pos_tag[0], 'n')
        lemmas.append(token.lemmatize(pos_code))
    return " ".join(lemmas)

# # Lemmatize
# sentence = "The striped bats are hanging on their feet for best"
# lemmatize_with_postag(sentence)
# #> 'The striped bat be hang on their foot for best'

In [4]:
# A lot of acronyms will be presumably captured by n=3
# longer ngrams, such as n=6, retain more of the word
def ngramsX(string, n):
    """Return all character n-grams of ``string`` after light punctuation clean-up.

    Parameters
    ----------
    string : str
        Text to slice into n-grams.
    n : int
        Length of each n-gram.

    Returns
    -------
    list of str
        Every run of ``n`` consecutive characters of the cleaned text, left to
        right. Empty when ``n`` is not positive or exceeds the cleaned length.
    """
    # The class [,-./] is the range ','..'.' plus '/', i.e. it strips
    # ',', '-', '.' and '/'. The alternative removes a whitespace + 'BD' run.
    cleaned = re.sub(r'[,-./]|\sBD', r'', string)

    if n <= 0:
        return []

    window_count = len(cleaned) - n + 1
    return [cleaned[start:start + n] for start in range(window_count)]

In [5]:
# Porter stemmer instance; preprocess_list calls porter.stem() on each lemma.
porter = PorterStemmer()

# Lancaster stemmer is currently disabled:
# lancaster = LancasterStemmer()

# Snowball stemmer for English; preprocess_list calls snowball.stem() on each lemma.
snowball = SnowballStemmer(language='english')

In [6]:
def _clean_tokens(sentence):
    """Tokenize one sentence and apply the regex-style clean-up (stage A)."""
    # Lower-case up front; the lemmatization applied later is case-sensitive.
    tokens = [token.lower() for token in nltk.word_tokenize(sentence)]

    # Split compound words such as 'cancer/colostomy' on '/', but leave short
    # tokens ('r/t', 'f/u', 'pt', 'dx', ...) intact. This must be a
    # conditional, not a comprehension filter: a filter on length would
    # discard every short token instead of passing it through unsplit.
    pieces = []
    for token in tokens:
        if len(token) > 3:
            pieces.extend(token.split("/"))
        else:
            pieces.append(token)

    # Drop single characters (and empty pieces left by a trailing '/')
    # together with stop words.
    pieces = [piece for piece in pieces
              if len(piece) > 1 and piece not in stop_words]

    # Replace each run of non-ASCII characters with a single space.
    pieces = [re.sub(r'[^\x00-\x7F]+', ' ', piece) for piece in pieces]

    # Discard tokens that begin with a digit (e.g. '2017').
    return [piece for piece in pieces if not re.search('^[0-9]', piece)]


def _feature_string(tokens):
    """Build the space-joined feature string (stage B) from cleaned tokens."""
    # Lemmatize token by token, then stem the lemmas with both stemmers.
    lemmas = [lemmatize_with_postag(token) for token in tokens]
    porter_stems = [porter.stem(lemma) for lemma in lemmas]
    snowball_stems = [snowball.stem(lemma) for lemma in lemmas]

    # Character n-grams are taken from the cleaned (un-lemmatized) tokens.
    joined_tokens = ' '.join(tokens)

    features = lemmas + porter_stems + snowball_stems
    for size in (3, 4, 6):
        features.extend(ngramsX(joined_tokens, size))
    return ' '.join(features)


def preprocess_list(mylist):
    """Turn each sentence of ``mylist`` into a feature string for vectorization.

    For every sentence the text is tokenized, lower-cased, split on '/' for
    tokens longer than three characters, stripped of single characters, stop
    words, non-ASCII runs and tokens starting with a digit. The surviving
    tokens are then expanded into lemmas, Porter stems, Snowball stems and
    character 3-, 4- and 6-grams, all joined by single spaces.

    Tokens of three characters or fewer (abbreviations such as 'r/t', 'pt',
    'dx') are kept whole rather than split or dropped.

    Parameters
    ----------
    mylist : list of str
        Sentences to preprocess. Must support ``.copy()`` and positional
        item assignment; the input itself is not modified.

    Returns
    -------
    list of str
        A copy of ``mylist`` with each sentence replaced by its feature string.
    """
    sentences = mylist.copy()
    for position, sentence in enumerate(mylist):
        sentences[position] = _feature_string(_clean_tokens(sentence))
    return sentences

In [7]:
#get matches for problems with threshold 0.4
def get_matches_oppo(search, data, action_list, threshold=0.4):
    """Match ``search`` strings against ``data`` strings by TF-IDF cosine similarity.

    Both inputs are run through ``preprocess_list`` and vectorized together
    with the module-level ``tfidf_vectorizer`` (re-fitted on every call).
    Every (search, data) pair whose similarity, rounded to 3 decimals, is
    strictly greater than ``threshold`` is reported.

    Parameters
    ----------
    search : list of str
        Problem/deficit descriptions; one result group per entry.
    data : list of str
        Candidate strings to match against. A string that occurs more than
        once is reported once.
    action_list : sequence
        ``action_list[i]`` is attached to every match of ``search[i]``;
        positions beyond its length yield NaN.
    threshold : float, default 0.4
        Minimum (exclusive) similarity for a pair to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``'index'`` (position in ``search``), ``'Opportunities'``
        (the matched ``data`` string), ``'Score'`` and ``'actions'``, sorted
        by ``'Score'`` in descending order. Empty when either input is empty
        or nothing exceeds the threshold.
    """
    result_columns = ['index', 'Opportunities', 'Score', 'actions']

    # Nothing to compare: return an empty, correctly-shaped frame instead of
    # failing further down.
    if len(search) == 0 or len(data) == 0:
        return pd.DataFrame(columns=result_columns)

    search_docs = preprocess_list(search)
    data_docs = preprocess_list(data)
    n_search = len(search_docs)

    # Fit on the combined corpus so both sides share one vocabulary.
    tfidf_matrix = tfidf_vectorizer.fit_transform(list(search_docs) + list(data_docs))

    # Only the search-vs-data block is needed, so compute just that block
    # rather than the full square matrix.
    scores = cosine_similarity(tfidf_matrix[:n_search], tfidf_matrix[n_search:]).round(3)

    # Collapse repeated data strings to one column, keeping the position of
    # the first occurrence (identical strings produce identical scores).
    first_column = {}
    for column, label in enumerate(data):
        first_column.setdefault(label, column)
    labels = list(first_column)
    scores = scores[:, list(first_column.values())]

    # np.nonzero walks the mask row by row, so matches come out grouped by
    # search entry and, within each, in data order.
    row_pos, col_pos = np.nonzero(scores > threshold)

    final_data = pd.DataFrame({
        'index': row_pos,
        'Opportunities': [labels[column] for column in col_pos],
        'Score': scores[row_pos, col_pos],
    })
    final_data['actions'] = final_data['index'].map(dict(enumerate(action_list)))
    return final_data.sort_values(by=['Score'], ascending=False)