In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('processed_dataset.csv')
df['Customer Feedback']

0                  My Credit Card is not generating OTP.
1              The Current Account charges are too high.
2              The loan prepayment charges are too high.
3      The Savings Account opening process was very s...
4      The loan documentation process is very complic...
                             ...                        
994    The Current Account customer service is very p...
995    The Current Account opening process was very e...
996     The branch staff are very courteous and helpful.
997                 How do I activate my new Debit Card?
998    The branch is not accessible for disabled cust...
Name: Customer Feedback, Length: 999, dtype: object

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Text cleaning
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Stopwords removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

df['Cleaned Feedback'] = df['Customer Feedback'].apply(preprocess_text)

# Vectorization using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['Cleaned Feedback'])


[nltk_data] Downloading package punkt to /Users/mohan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /Users/mohan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/mohan/nltk_data...


In [5]:
df.head()

Unnamed: 0,Gender,Senior Citizen,Marital Status,Dependents,tenure in months,Priority Account,Credit Cards,Loan Account,Netbanking,Debit Card,...,FDs,Interest Deposited,Paperless Banking,Monthly Average Balance (USD),Yearly Average Balance (USD),Churn,Customer Feedback,Category,Recommendation,Cleaned Feedback
0,1,0,0,1.0,1.0,1.0,0.0,-1.0,0.0,0,...,0.0,2,0.0,44.0,44.0,0.0,My Credit Card is not generating OTP.,0.0,,credit card generating otp
1,1,0,1,1.0,72.0,0.0,0.0,-1.0,1.0,1,...,1.0,0,1.0,64.8,4719.75,0.0,The Current Account charges are too high.,1.0,,current account charge high
2,0,1,0,0.0,20.0,1.0,1.0,-1.0,0.0,1,...,1.0,2,1.0,94.1,1782.4,1.0,The loan prepayment charges are too high.,2.0,,loan prepayment charge high
3,0,0,0,0.0,47.0,1.0,1.0,-1.0,1.0,0,...,0.0,0,1.0,65.0,2879.9,0.0,The Savings Account opening process was very s...,3.0,,saving account opening process smooth
4,0,0,1,1.0,54.0,1.0,1.0,-1.0,1.0,1,...,1.0,2,1.0,104.1,5645.8,0.0,The loan documentation process is very complic...,2.0,,loan documentation process complicated


In [6]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']

df['Sentiment Score'] = df['Customer Feedback'].apply(analyze_sentiment)
df['Sentiment'] = df['Sentiment Score'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mohan/nltk_data...


In [7]:
df.head()

Unnamed: 0,Gender,Senior Citizen,Marital Status,Dependents,tenure in months,Priority Account,Credit Cards,Loan Account,Netbanking,Debit Card,...,Paperless Banking,Monthly Average Balance (USD),Yearly Average Balance (USD),Churn,Customer Feedback,Category,Recommendation,Cleaned Feedback,Sentiment Score,Sentiment
0,1,0,0,1.0,1.0,1.0,0.0,-1.0,0.0,0,...,0.0,44.0,44.0,0.0,My Credit Card is not generating OTP.,0.0,,credit card generating otp,0.3818,positive
1,1,0,1,1.0,72.0,0.0,0.0,-1.0,1.0,1,...,1.0,64.8,4719.75,0.0,The Current Account charges are too high.,1.0,,current account charge high,-0.2732,negative
2,0,1,0,0.0,20.0,1.0,1.0,-1.0,0.0,1,...,1.0,94.1,1782.4,1.0,The loan prepayment charges are too high.,2.0,,loan prepayment charge high,-0.2732,negative
3,0,0,0,0.0,47.0,1.0,1.0,-1.0,1.0,0,...,1.0,65.0,2879.9,0.0,The Savings Account opening process was very s...,3.0,,saving account opening process smooth,0.0,neutral
4,0,0,1,1.0,54.0,1.0,1.0,-1.0,1.0,1,...,1.0,104.1,5645.8,0.0,The loan documentation process is very complic...,2.0,,loan documentation process complicated,0.0,neutral


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Keyword Extraction using CountVectorizer
vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
X_counts = vectorizer.fit_transform(df['Cleaned Feedback'])

# Topic Modeling using LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X_counts)

# Display top words in each topic
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(lda, vectorizer.get_feature_names_out(), 10)

Topic 0:
account saving saving account current current account high service online balance atm
Topic 1:
card debit debit card process high branch loan pin card pin confusing
Topic 2:
account loan process rate interest rate interest current current account deposit saving
Topic 3:
card credit credit card banking mobile app support banking app mobile banking customer
Topic 4:
account process loan saving account saving loan documentation documentation process documentation statement account statement
