### NATURAL LANGUAGE PROCESSING

#### Load the reviews in the form of HTML files

In [1]:
import glob

In [2]:
html_files = glob.glob(r".\BankBazaarData\*.html")

In [3]:
import codecs

In [4]:
S = " "

In [5]:
html_array = []

In [6]:
for file in html_files:
    f = codecs.open(file, "r")
    html_array.append(f.read())

In [7]:
html = S.join(html_array)

In [8]:
html.count("text_here review-desc-more")

496

#### Parse the reviews using BeautifulSoup

In [9]:
from bs4 import BeautifulSoup

In [10]:
parsed_html = BeautifulSoup(html, "html.parser")

In [11]:
reviews = parsed_html.find_all('div', attrs = {'class':"text_here review-desc-more", 'itemprop':"description"})

In [12]:
rating_div = parsed_html.find_all('div', attrs = {'class':"medium-rating rating review-score-container"})
rating = []
for i in range(0, len(rating_div)):
    rating.append(rating_div[i].find('input')['value'])

#### Another way to find rating
rating_li = parsed_html.find_all('li', attrs = {'class':"review-box"}) 

rating=[]

for i in range(0, len(rating_li)):
    rating.append(rating_li[i].select_one('div:nth-of-type(3) > span').select_one('span:nth-of-type(2)').text)

In [13]:
import re

In [14]:
def cleanhtml(raw_html):
    clean = re.compile('<.*?>')
    cleantext = re.sub(clean, '', raw_html)
    return cleantext

In [15]:
review = []
for i in range(0, len(reviews)):
    review.append(cleanhtml(reviews[i].text))

#### Convert data into dataframes

In [16]:
import pandas as pd

In [17]:
Reviews_df = pd.DataFrame({'Reviews':review, 'Ratings':rating})

In [18]:
Reviews_df = Reviews_df.replace({r'\s+$':'', r'^\s+':''}, regex = True).replace(r'\n','',regex=True)

In [19]:
Reviews_df.drop_duplicates(inplace=True)

In [20]:
Reviews_df.head(10)

Unnamed: 0,Reviews,Ratings
0,My kotak mahindra personal loan was taken thro...,4.0
1,VIJAYA Bank offered me a personal loan and the...,5.0
2,I have taken Incred personal loan through onli...,5.0
3,I did not face any issues with Incred. The rat...,4.0
4,Since I'm in need of money so i have taken th...,5.0
5,My personal loan was sanctioned on time by Ta...,4.0
6,I got a personal loan offer from Bajaj finance...,2.0
7,I have applied the personal loan through onlin...,4.0
8,ICICI is taking much more percentage on the in...,2.0
9,HDFC personal loan services are quite good whe...,5.0


In [21]:
Reviews_df.count()

Reviews    483
Ratings    483
dtype: int64

In [22]:
Reviews_df.dtypes

Reviews    object
Ratings    object
dtype: object

In [23]:
Reviews_df['Ratings'] = Reviews_df['Ratings'].astype(float)

In [24]:
Reviews_df.dtypes

Reviews     object
Ratings    float64
dtype: object

#### Calculate Sentiment

In [25]:
def calc_sentiment(reviews):
    if reviews['Ratings']<4.0:
        return 0 #Negative Sentiment
    else:
        return 1 #Positive Sentiment

In [26]:
Reviews_df['Sentiment']= Reviews_df.apply(calc_sentiment, axis=1)

In [27]:
Reviews_df.head(10)

Unnamed: 0,Reviews,Ratings,Sentiment
0,My kotak mahindra personal loan was taken thro...,4.0,1
1,VIJAYA Bank offered me a personal loan and the...,5.0,1
2,I have taken Incred personal loan through onli...,5.0,1
3,I did not face any issues with Incred. The rat...,4.0,1
4,Since I'm in need of money so i have taken th...,5.0,1
5,My personal loan was sanctioned on time by Ta...,4.0,1
6,I got a personal loan offer from Bajaj finance...,2.0,0
7,I have applied the personal loan through onlin...,4.0,1
8,ICICI is taking much more percentage on the in...,2.0,0
9,HDFC personal loan services are quite good whe...,5.0,1


#### Save data in form  of csv

In [28]:
Reviews_df.to_csv("BankBazaar_Reviews.csv")

#### Split the data into training and test set

In [29]:
text, y = Reviews_df.Reviews, Reviews_df.Sentiment

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
text_train, text_test, y_train, y_test = train_test_split(text, y, test_size=0.33, random_state=42)

#### Explore training and test data

In [32]:
import numpy as np

In [33]:
np.unique(y_train)

array([0, 1], dtype=int64)

In [34]:
np.unique(y_test)

array([0, 1], dtype=int64)

In [35]:
print("Samples per Sentiment (training): {}".format(np.bincount(y_train)))

Samples per Sentiment (training): [ 83 240]


In [36]:
print("Samples per Sentiment (testing): {}".format(np.bincount(y_test)))

Samples per Sentiment (testing): [ 30 130]


#### Bag of Words

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
def tfidf_extractor(corpus, ngram_range=(1,1)):
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

In [39]:
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(text_train)

In [40]:
tfidf_test_features = tfidf_vectorizer.transform(text_test)

In [41]:
tfidf_train_features

<323x1094 sparse matrix of type '<class 'numpy.float64'>'
	with 14093 stored elements in Compressed Sparse Row format>

In [42]:
tfidf_test_features

<160x1094 sparse matrix of type '<class 'numpy.float64'>'
	with 6724 stored elements in Compressed Sparse Row format>

#### Print the metrics

In [43]:
from sklearn import metrics

In [44]:
def get_metrics(true_labels, predicted_labels):
    print("Accuracy :", np.round(metrics.accuracy_score(true_labels, predicted_labels),2))
    print("Precision:", np.round(metrics.precision_score(true_labels, predicted_labels,average='weighted'),2))
    print("Recall:", np.round(metrics.recall_score(true_labels, predicted_labels, average='weighted'),2))
    print("F1 Score:", np.round(metrics.f1_score(true_labels, predicted_labels, average='weighted'),2))

In [45]:
def train_predict_evaluate_model(classifier, train_features, train_labels, test_features, test_labels):
    classifier.fit(train_features, train_labels)
    predictions =  classifier.predict(test_features)
    get_metrics(true_labels=test_labels, predicted_labels=predictions)
    return predictions

#### Multinomial Naive Bayes with TFIDF Features

In [46]:
from sklearn.naive_bayes import MultinomialNB

In [47]:
mnb_best = MultinomialNB(alpha=0.001, fit_prior=True)

In [48]:
mnb_tfidf_predictions = train_predict_evaluate_model(classifier = mnb_best, train_features=tfidf_train_features, train_labels=y_train, test_features=tfidf_test_features, test_labels = y_test)

Accuracy : 0.76
Precision: 0.72
Recall: 0.76
F1 Score: 0.73


#### SVM with TFIDF Features

In [49]:
from sklearn.svm import SVC

In [50]:
svm_best = SVC(probability=True, kernel='rbf')

In [51]:
svm_tfidf_predictions = train_predict_evaluate_model(classifier = svm_best, train_features=tfidf_train_features, train_labels=y_train, test_features=tfidf_test_features, test_labels = y_test)



Accuracy : 0.81
Precision: 0.66
Recall: 0.81
F1 Score: 0.73


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Confusion Matrix

In [52]:
cm = metrics.confusion_matrix(y_test, mnb_tfidf_predictions)

In [53]:
pd.DataFrame(cm, index=range(0,2), columns = range(0,2))

Unnamed: 0,0,1
0,5,25
1,14,116


In [54]:
cm_svm = metrics.confusion_matrix(y_test, svm_tfidf_predictions)

In [55]:
pd.DataFrame(cm, index=range(0,2), columns = range(0,2))

Unnamed: 0,0,1
0,5,25
1,14,116


#### Incorrect Predictions

In [56]:
print("*****[0 - Negative, 1 - Positive]*****")
k = 0
for document, label, predicted_label in zip(text_test, y_test, mnb_tfidf_predictions):
    for i in range(0,2):
        if label == i and predicted_label != i:
            print("Actual Label:", +label)
            print("Predicted Label:", +predicted_label)
            print("Review:", re.sub('\n','', document))

*****[0 - Negative, 1 - Positive]*****
Actual Label: 1
Predicted Label: 0
Review: I have applied for HDFC Bank personal loan through online .I was unable to avail this loan since don't have current address proof with me .I have not submitted any of my documents to the agent. The service was really good but due to documentation this loan was not processed.
Actual Label: 1
Predicted Label: 0
Review: I was applied a personal loan from HDFC Bank through online. The rate of interest was very high,  according to monthly installment od  17% which is high,  so they  need to decrease it,  that's is the reason I closed this application.  The customer service and responsiveness was good.
Actual Label: 1
Predicted Label: 0
Review: I have approached HDFC Bank for my personal loan through online .I have not availed this loan since don't have current address proof. The rate of interest  and the processing fee was not an issue. I have not submitted any of my documents. There was proper notification fr