In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
data = pd.read_excel('NLP_Data.xlsx')

In [3]:
data.head()

Unnamed: 0,Description of the Grievance,Grievance Category,Grievance SubCategory
0,concerns regarding laboratory tests being bill...,Billing/Financial Dispute,Provider Claim Issues
1,dassatifaction with provider,Quality Of Service,Not Satisfied With Provider Services
2,Dissatisafaction with delay in care.,Access And Availability,Pharmacy
3,Dissatisafaction with Dental provider way of c...,Quality Of Service,Not Satisfied With Provider Services
4,Dissatisfaction for not being informed he had ...,Billing/Financial Dispute,Balance Billing


In [4]:
#checking for the null values

In [5]:
data['Grievance Category'].isnull().sum()
data['Grievance SubCategory'].isnull().sum()

23

In [6]:
# Dropping any NA values in the in the Description

In [7]:
data.dropna(inplace = True)
data.reset_index(drop = True, inplace = True)

In [12]:
# assigning a new column to for the grivenace with encoded values

In [32]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
data['encoded_Category'] = label_encoder.fit_transform(data['Grievance Category'])

In [33]:
data

Unnamed: 0,Description of the Grievance,Grievance Category,Grievance SubCategory,encoded_Category
0,concern regard laboratori test bill twice,Billing/Financial Dispute,Provider Claim Issues,2
1,dassatifact provid,Quality Of Service,Not Satisfied With Provider Services,8
2,dissatisafact delay care,Access And Availability,Pharmacy,0
3,dissatisafact dental provid way conduct busi,Quality Of Service,Not Satisfied With Provider Services,8
4,dissatisfact inform dental benefit coverag mem...,Billing/Financial Dispute,Balance Billing,2
...,...,...,...,...
494,member appeal charg show offic visit charg tot...,Quality Of Service,Other,8
495,member cost member rx osphena mg tab member pa...,Benefit Package,Other,1
496,mbr mbr dissatisfi say sr horribl commun mbr p...,Quality Of Care,Pcp,7
497,member request appeal member dissatisfi member...,Quality Of Care,Other,7


In [34]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [35]:
def preprocess_text(text):
    #remove special charcters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # tokenization
    tokens = word_tokenize(text.lower())
    
    #stop words removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    
    # lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    #POS tagging
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = []
    for word, tag in tagged_tokens:
        if tag.startswith('N'):  # Noun
            lemma = lemmatizer.lemmatize(word, pos='n')
        elif tag.startswith('V'):  # Verb
            lemma = lemmatizer.lemmatize(word, pos='v')
        else:
            lemma = word
        lemmatized_tokens.append(lemma)
    return " ".join(lemmatized_tokens)

In [36]:
data["Description of the Grievance"] = data["Description of the Grievance"].apply(preprocess_text)

In [37]:
data

Unnamed: 0,Description of the Grievance,Grievance Category,Grievance SubCategory,encoded_Category
0,concern regard laboratori test bill twice,Billing/Financial Dispute,Provider Claim Issues,2
1,dassatifact provid,Quality Of Service,Not Satisfied With Provider Services,8
2,dissatisafact delay care,Access And Availability,Pharmacy,0
3,dissatisafact dental provid way conduct busi,Quality Of Service,Not Satisfied With Provider Services,8
4,dissatisfact inform dental benefit coverag mem...,Billing/Financial Dispute,Balance Billing,2
...,...,...,...,...
494,member appeal charg show offic visit charg tot...,Quality Of Service,Other,8
495,member cost member rx osphena mg tab member pa...,Benefit Package,Other,1
496,mbr mbr dissatisfi say sr horribl commun mbr p...,Quality Of Care,Pcp,7
497,member request appeal member dissatisfi member...,Quality Of Care,Other,7


In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [84]:
X_train, y_train = data['Description of the Grievance'].tolist(), data['encoded_Category'].tolist()
#X_val_norm, y_val = data['Description of the Grievance'].tolist(), data['encoded_Category'].tolist()
X_test, y_test = data['Description of the Grievance'].tolist(), data['encoded_Category'].tolist()

In [90]:
# vectorizing the data
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVec = TfidfVectorizer(ngram_range = (1, 1))
X_train_tfidf = TfidfVec.fit_transform(X_train)
X_test_tfidf = TfidfVec.transform(X_test)

In [91]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn import svm

In [92]:
names = [
    "Logistic Regression",
    "KNN Classifier",
    "Decision Tree",
    "Random Forest",
    "Linear SVM",
]

models = [
    LogisticRegression(max_iter = 1000),
    KNeighborsClassifier(n_neighbors = 149, n_jobs = -1),
    DecisionTreeClassifier(),
    RandomForestClassifier(max_depth=100,max_features=100),
    svm.SVC(kernel = 'linear'),
]

In [95]:
def score(X_train, y_train, X_val, y_val, names = names, models = models):
    score_df, score_train, score_val = pd.DataFrame(), [], []
    x = time.time()
    for model in models:
        model.fit(X_train, y_train)
        y_train_pred, y_val_pred = model.predict(X_train), model.predict(X_val)
        score_train.append(accuracy_score(y_train, y_train_pred))
        score_val.append(accuracy_score(y_val, y_val_pred))
    
    score_df["Classifier"], score_df["Testing accuracy"] = names, score_val
    return score_df

In [96]:
score(X_train_tfidf, y_train, X_test_tfidf, y_test, names = names, models = models)

Unnamed: 0,Classifier,Testing accuracy
0,Logistic Regression,0.807615
1,KNN Classifier,0.46493
2,Decision Tree,0.995992
3,Random Forest,0.995992
4,Linear SVM,0.887776
