In [6]:
# Import necessary libraries
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

# Read data from an Excel file into a DataFrame
df = pd.read_excel(r'/Users/vishesh/Desktop/ML - Python/project/Git Repository/Bank Reviews-Complaints Analysis/BankReviews.xlsx')

# Drop duplicates based on selected columns
df = df.drop_duplicates(['Stars', 'Reviews', 'BankName'])

# Clean and preprocess text data
df['Reviews'] = df['Reviews'].dropna().apply(lambda x: "".join([c for c in x if c not in string.punctuation]))
df['Reviews'] = df['Reviews'].apply(lambda x: x.lower())
df['Reviews'] = df['Reviews'].apply(lambda x: x.strip().replace("  ", ' ').replace('\r', ' ').replace('\n', ' ').replace('"', ''))
df['Reviews'] = df['Reviews'].apply(lambda x: re.sub(r'[0-9]+|\S+[0-9]\S+\ss', "", x))

# Tokenize text using NLTK
df['Reviews'] = df['Reviews'].apply(lambda x: nltk.word_tokenize(x))

# Remove stopwords using NLTK
df['Reviews'] = df['Reviews'].apply(lambda x: [y for y in x if y not in set(stopwords.words('english'))])

# Lemmatize words using NLTK and WordNet
def get_pos(word):
    w_synsets = wordnet.synsets(word)
    pos_counts = Counter()
    pos_counts["n"] = len([item for item in w_synsets if item.pos() == "n"])
    pos_counts["v"] = len([item for item in w_synsets if item.pos() == "v"])
    pos_counts["a"] = len([item for item in w_synsets if item.pos() == "a"])
    pos_counts["r"] = len([item for item in w_synsets if item.pos() == "r"])
    most_common_pos_list = pos_counts.most_common(3)
    return most_common_pos_list[0][0]

wnl = WordNetLemmatizer()
df['Reviews'] = df['Reviews'].apply(lambda x: [wnl.lemmatize(word, get_pos(word)) for word in x])

# Create bigrams from the tokenized text
bgm_Reviews = df['Reviews'].apply(lambda x: list(nltk.bigrams(x)))

# Join tokenized words back into a string
df['Reviews'] = df['Reviews'].apply(lambda x: ' '.join([y for y in x]))

# Sentiment analysis using TextBlob
def get_bank_sentiment(review):
    analysis = TextBlob(review)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

df['Sentiment'] = df['Reviews'].apply(lambda x: get_bank_sentiment(x))

# Analyze the data
positive_percentage = len(df[df['Sentiment'] == 'positive']) * 100 / len(df)
negative_percentage = len(df[df['Sentiment'] == 'negative']) * 100 / len(df)
neutral_percentage = len(df[df['Sentiment'] == 'neutral']) * 100 / len(df)
df_neg = df[df['Sentiment'] == 'negative']

# Bag of words (BoW) representation
def bow_extractor(corpus):
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

bow_vectorizer, bow_features = bow_extractor(df_neg['Reviews'])
features = bow_features.todense()
feature_names =  bow_vectorizer.get_feature_names()


# TF-IDF representation
def tfidf_extractor(corpus):
    vectorizer = TfidfVectorizer(min_df=1, use_idf=True, norm='l2')
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

tfidf_vectorizer, tfidf_features = tfidf_extractor(df_neg['Reviews'])

# Apply Latent Dirichlet Allocation (LDA)
total_topics = 3
lda = LatentDirichletAllocation(
    n_components=total_topics,
    max_iter=100,
    learning_method='online',
    learning_offset=50,
    random_state=42
)

def display_features(features,features_names):
    data = pd.DataFrame(data=features,columns=feature_names)
    return(data)
    
display_features(features,feature_names)


data1 = display_features(np.round(tfidf_features.todense(),2),feature_names)
lda.fit(data1)
weights = lda.components_

# Define functions for topic extraction and display
def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array([list(row[::-1]) for row in np.argsort(np.abs(weights))])
    sorted_weights = np.array([list(wt[index]) for wt, index in zip(weights, sorted_indices)])
    sorted_terms = np.array([list(feature_names[row]) for row in sorted_indices])
    
    topics = [np.vstack((terms.T, term_weights.T)).T for terms, term_weights in zip(sorted_terms, sorted_weights)]
    return topics

def print_topics_udf(topics, total_topics=1, weight_threshold=0.0001, display_weights=False, num_terms=None):
    for index in range(total_topics):
        topic = topics[index]
        topic = [(term, float(wt)) for term, wt in topic]
        topic = [(word, round(wt, 2)) for word, wt in topic if abs(wt) >= weight_threshold]
        
        if display_weights:
            print('Topic #' + str(index + 1) + ' with weights')
            print(topic[:num_terms] if num_terms else topic)
        else:
            print('Topic #' + str(index + 1) + ' without weights')
            tw = [term for term, wt in topic]
            print(tw[:num_terms] if num_terms else tw)

topics = get_topics_terms_weights(weights, feature_names)

# Display the topics
print_topics_udf(topics=topics, total_topics=total_topics, num_terms=15, display_weights=True)

# Split data into features and labels
bow_vectorizer, bow_features = bow_extractor(df['Reviews'])
features = bow_features.todense()
feature_names = bow_vectorizer.get_feature_names()
tfidf_vectorizer, tfidf_features = tfidf_extractor(df['Reviews'])
data1 = display_features(np.round(tfidf_features.todense(), 2), feature_names)
x = data1
y = df['Stars']

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=123, test_size=0.3)

# Support Vector Machine (SVM) Classifier
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)

# Evaluate the SVM Classifier
accuracy = metrics.accuracy_score(Y_test, y_pred)
confusion_matrix = metrics.confusion_matrix(Y_test, y_pred)
roc_auc = metrics.roc_auc_score(Y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", confusion_matrix)
print("ROC AUC Score:", roc_auc)




Topic #1 with weights
[('process', 0.9), ('ask', 0.87), ('sale', 0.82), ('go', 0.82), ('offer', 0.82), ('xd', 0.75), ('would', 0.74), ('without', 0.73), ('rate', 0.72), ('disorganize', 0.71), ('communicate', 0.71), ('recommendxd', 0.71), ('adam', 0.7), ('apparently', 0.7), ('send', 0.7)]
Topic #2 with weights
[('xd', 1.29), ('recommend', 1.26), ('lender', 1.19), ('close', 1.14), ('home', 1.13), ('would', 1.04), ('register', 1.03), ('site', 1.03), ('new', 1.0), ('previous', 0.98), ('loan', 0.98), ('number', 0.98), ('everything', 0.95), ('mortgage', 0.95), ('unprofessional', 0.94)]
Topic #3 with weights
[('call', 1.47), ('get', 1.25), ('information', 1.17), ('xd', 1.15), ('would', 1.13), ('credit', 1.03), ('close', 1.02), ('say', 1.01), ('document', 0.98), ('send', 0.95), ('situation', 0.93), ('give', 0.9), ('online', 0.89), ('make', 0.87), ('work', 0.83)]
Accuracy: 0.925
Confusion Matrix:
 [[17  7]
 [ 2 94]]
ROC AUC Score: 0.8437499999999998


