In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from textblob import TextBlob
import matplotlib.pyplot as plt

In [2]:
# Import the pandas library as pd
import pandas as pd

# Load the filtered complaints dataset into the working DataFrame 'df'
df = pd.read_csv('filtered_data_new.csv')

# Build the 'combined_text' column from 'Consumer complaint narrative'.
# Only one column feeds this text, so "joining the non-null values of each row" reduces to:
# keep the narrative as-is, and use an empty string where the narrative is missing.
# fillna('') + astype(str) does exactly that in a single vectorized pass, instead of
# running a Python lambda once per row, and guarantees every entry is a str.
df['combined_text'] = df['Consumer complaint narrative'].fillna('').astype(str)

In [3]:
def extract_keywords(data, num_features=50):
    """
    Extract keyword features from the given text data using TF-IDF vectorization.

    Parameters:
    data (pandas.Series): The text documents to vectorize. They are handed to
        TfidfVectorizer.fit_transform unchanged.
    num_features (int): Passed to the vectorizer as 'max_features', i.e. the maximum
        number of keywords (columns) to keep. Default is 50.

    Returns:
    df_tfidf (pandas.DataFrame): One row per document (in input order, with a default
        integer index) and one column per keyword. Each cell holds the TF-IDF score of
        the keyword in the corresponding document.
    """

    # Vectorizer limited to 'num_features' terms, ignoring NLTK's English stop words
    tfidf_vectorizer = TfidfVectorizer(max_features=num_features, stop_words=stopwords.words('english'))

    # Learn the vocabulary and compute the sparse document-term TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(data)

    # Column labels: the keywords selected by the vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # toarray() yields a plain 2-D ndarray that pandas can wrap directly; this avoids the
    # np.matrix returned by todense() and the intermediate Python list-of-lists
    df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

    return df_tfidf

In [4]:
# Build the TF-IDF keyword table for the combined complaint text.
# 'tfidf_keywords' gets one row per entry of df['combined_text'] and one column per keyword;
# each cell is the TF-IDF score of that keyword in that document.
# 50 keywords are requested, which is also the function's default.
tfidf_keywords = extract_keywords(df['combined_text'], num_features=50)

In [5]:
def analyze_sentiment(text):
    """
    Score the sentiment polarity of a piece of text with the TextBlob library.

    Parameters:
    text (str): The text to score. The value is converted with str() before analysis,
        so non-string inputs are scored on their string form.

    Returns:
    float: The polarity reported by TextBlob, a value between -1 (negative sentiment)
        and 1 (positive sentiment).
    """
    # Wrap the stringified input, then read the polarity off its sentiment result
    blob = TextBlob(str(text))
    polarity = blob.sentiment.polarity
    return polarity

In [6]:
# Score every complaint narrative with 'analyze_sentiment' and keep the polarity
# (between -1 for negative and 1 for positive) in the new 'sentiment' column
df['sentiment'] = df['Consumer complaint narrative'].map(analyze_sentiment)

# Average the sentiment per product: the result is a Series indexed by the unique
# 'Product' values, holding the mean sentiment score of each product
product_sentiment = df.groupby('Product')['sentiment'].agg('mean')

# Order the products from the lowest mean sentiment to the highest
# and store the sorted Series in 'worst_products'
worst_products = product_sentiment.sort_values(ascending=True)

In [7]:
def categorize_sentiment(text):
    """
    Label the sentiment of a piece of text as positive, negative or neutral using TextBlob.

    Parameters:
    text (str): The text to label. It is converted with str() before analysis.

    Returns:
    str: 'Positive' when the polarity is above 0, 'Negative' when it is below 0,
        and 'Neutral' otherwise.
    """
    polarity = TextBlob(str(text)).sentiment.polarity

    # Guard clauses: strictly negative / strictly positive first, everything else is neutral
    if polarity < 0:
        return 'Negative'
    if polarity > 0:
        return 'Positive'
    return 'Neutral'

# Label each complaint narrative with 'categorize_sentiment'.
# Every row receives one of 'Positive', 'Negative' or 'Neutral',
# stored in the new 'sentiment_category' column of 'df'.
df['sentiment_category'] = df['Consumer complaint narrative'].map(categorize_sentiment)

In [8]:
from textblob import TextBlob

# Function to calculate sentiment score using TextBlob
def calculate_textblob_sentiment(text):
    """
    Calculate the sentiment polarity of the given text using the TextBlob library.

    Parameters:
    text (str): The text for which to calculate the sentiment polarity. Missing values
        (None / NaN) are accepted and scored as neutral; any other non-string value is
        converted with str() before analysis.

    Returns:
    float: The sentiment polarity of the text, a value between -1 (negative sentiment)
        and 1 (positive sentiment). 0.0 is returned for missing values.
    """
    if not isinstance(text, str):
        # TextBlob only accepts strings. An empty cell (None / NaN) would otherwise
        # abort a column-wide apply, so it is reported as neutral instead.
        if pd.isna(text):
            return 0.0
        text = str(text)

    # TextBlob sentiment analysis
    testimonial = TextBlob(text)
    return testimonial.sentiment.polarity  # Returns a value between -1 and 1

# Score each 'Company public response' value with 'calculate_textblob_sentiment'
# and keep the polarity in the new 'company_public_response_sentiment' column
df['company_public_response_sentiment'] = df['Company public response'].map(calculate_textblob_sentiment)

In [9]:
def identify_company_responses_for_each_sentiment(df):
    """
    Print the unique company public responses observed for each sentiment score.

    Parameters:
    df (pandas.DataFrame): The DataFrame containing the sentiment scores in
        'company_public_response_sentiment' and the response texts in
        'Company public response'.

    Returns:
    None. For every distinct score, in order of first appearance, a header line and the
    array of unique responses that received that score are printed. Rows whose score is
    missing (NaN) form their own group rather than being silently reported as empty.
    """
    # One grouping pass instead of one boolean-mask scan of the frame per score.
    # sort=False keeps first-appearance order; dropna=False keeps rows with a NaN score,
    # which an equality filter can never match because NaN != NaN.
    responses_by_score = df.groupby(
        'company_public_response_sentiment', sort=False, dropna=False
    )['Company public response']

    for score, responses in responses_by_score:
        # Distinct response texts that were assigned this score
        unique_company_responses = responses.unique()

        # Print the unique company responses
        print(f"Unique company responses with a sentiment score of {score}:\n", unique_company_responses)


# Print, for every public-response sentiment score present in 'df', the response texts that received it
identify_company_responses_for_each_sentiment(df=df)

Unique company responses with a sentiment score of 0.0:
 ['Company has responded to the consumer and the CFPB and chooses not to provide a public response']
Unique company responses with a sentiment score of 0.5:
 ['Company believes it acted appropriately as authorized by contract or law']
Unique company responses with a sentiment score of -0.3:
 ['Company believes complaint is the result of an isolated error'
 "Company can't verify or dispute the facts in the complaint"
 'Company believes the complaint is the result of a misunderstanding'
 'Company disputes the facts presented in the complaint'
 "Company believes the complaint provided an opportunity to answer consumer's questions"
 'Company believes complaint relates to a discontinued policy or procedure']
Unique company responses with a sentiment score of 0.1:
 ['Company believes complaint represents an opportunity for improvement to better serve consumers']
Unique company responses with a sentiment score of -0.09999999999999999:
 [

In [10]:
# Sentiment analysis of the company's response to the consumer
# Score each 'Company response to consumer' value with 'calculate_textblob_sentiment'
# and keep the polarity in the new 'company_consumer_response_sentiment' column
df['company_consumer_response_sentiment'] = df['Company response to consumer'].map(calculate_textblob_sentiment)



In [11]:
'''Because TextBlob gives the same sentiment score for all the responses, we will manually assign binary values when creating the function to calculate the satisfaction score.'''

# Count how many complaints fall under each 'Company response to consumer' value
response_outcome_counts = df['Company response to consumer'].value_counts()
response_outcome_counts

Company response to consumer
Closed with explanation            37300
Closed with non-monetary relief    14784
Closed with monetary relief         3415
Name: count, dtype: int64

# Calculating Satisfaction Score

In [12]:

# Binary scoring for "Company response to consumer"
# 1 when the value is 'Closed with monetary relief'; 0 for every other value
# The comparison is done on the whole column at once (no per-element lambda) and the
# resulting booleans are converted to 64-bit integers
# The result is stored in a new column 'response_to_consumer_score' in the DataFrame
df['response_to_consumer_score'] = df['Company response to consumer'].eq('Closed with monetary relief').astype('int64')

# Binary scoring for "Timely response?"
# 1 when the value is 'Yes'; 0 for every other value
# The result is stored in a new column 'timely_response_score' in the DataFrame
df['timely_response_score'] = df['Timely response?'].eq('Yes').astype('int64')

def calculate_satisfaction_score(row):
    """
    Combine the four component scores of a complaint into one weighted satisfaction score.

    Parameters:
    row (pandas.Series): A DataFrame row providing 'sentiment',
        'company_public_response_sentiment', 'response_to_consumer_score' and
        'timely_response_score'.

    Returns:
    float: 0.4 * sentiment + 0.15 * company public response sentiment
        + 0.15 * response to consumer score + 0.3 * timely response score.
    """
    # Scale each component by its weight; the four weights add up to 1
    narrative_part = 0.4 * row['sentiment']
    public_response_part = 0.15 * row['company_public_response_sentiment']
    relief_part = 0.15 * row['response_to_consumer_score']
    timeliness_part = 0.3 * row['timely_response_score']

    return narrative_part + public_response_part + relief_part + timeliness_part

# Compute the overall satisfaction score row by row with 'calculate_satisfaction_score'
# and store it in the new 'satisfaction_score' column of 'df'
df['satisfaction_score'] = df.apply(calculate_satisfaction_score, axis='columns')

In [13]:
# Display the DataFrame, now including the sentiment, response-score and satisfaction-score columns
df

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,...,Consumer disputed?,Complaint ID,combined_text,sentiment,sentiment_category,company_public_response_sentiment,company_consumer_response_sentiment,response_to_consumer_score,timely_response_score,satisfaction_score
0,0,18/02/2024,Credit reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else,I recently reviewed a copy of my credit report...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,TX,...,,8370453,I recently reviewed a copy of my credit report...,0.000000,Neutral,0.0,-0.1,0,1,0.300000
1,1,22/01/2024,Checking or savings account,Savings account,Problem caused by your funds being low,Non-sufficient funds and associated fees,I have had the Wells Fargo Bank for more than ...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,NC,...,,8203156,I have had the Wells Fargo Bank for more than ...,0.000000,Neutral,0.0,-0.1,0,1,0.300000
2,2,09/02/2024,Credit reporting,Credit reporting,Incorrect information on your report,Account status incorrect,A company called XXXX XXXX reported a delinque...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,VA,...,,8314624,A company called XXXX XXXX reported a delinque...,0.000000,Neutral,0.0,-0.1,0,1,0.300000
3,3,05/04/2024,Credit card or prepaid card,General-purpose credit card or charge card,Fees or interest,Problem with fees,I was charged a late fee in XX/XX/2024 of {$29...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,PA,...,,8706247,I was charged a late fee in XX/XX/2024 of {$29...,-0.071053,Negative,0.0,-0.1,1,1,0.421579
4,4,18/02/2024,Credit reporting,Credit reporting,Credit monitoring or identity theft protection...,Problem with product or service terms changing,I have written before to the credit bureau abo...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,DE,...,,8364886,I have written before to the credit bureau abo...,-0.187500,Negative,0.0,-0.1,0,1,0.225000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55494,55494,03/02/2021,Loan,Loan,Managing the loan or lease,Billing problem,We have paid our auto loan with 6 deferred pay...,Company has responded to the consumer and the ...,TRUIST FINANCIAL CORPORATION,CA,...,,4112772,We have paid our auto loan with 6 deferred pay...,0.000000,Neutral,0.0,-0.1,0,1,0.300000
55495,55495,01/04/2021,Credit reporting,Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...,This is yet my recurring complaint as Transuni...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",FL,...,,4262930,This is yet my recurring complaint as Transuni...,-0.065296,Negative,0.0,-0.1,0,1,0.273881
55496,55496,05/06/2021,Debt collection,Other debt,Attempts to collect debt not owed,Debt is not yours,In 2018 I became XXXX XXXX a few months after ...,Company believes it acted appropriately as aut...,"Blakely-Witt and Associates, Inc.",TX,...,,4433168,In 2018 I became XXXX XXXX a few months after ...,0.141809,Positive,0.5,-0.1,0,1,0.431724
55497,55497,07/06/2021,Credit reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else,Noticed an account on my report that does not ...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",NJ,...,,4439371,Noticed an account on my report that does not ...,0.000000,Neutral,0.0,-0.1,0,1,0.300000


In [14]:
# Save the scored DataFrame to 'satisfaction_scores.csv'; index=False leaves the row index out of the file
df.to_csv(path_or_buf='satisfaction_scores.csv', index=False)