In [1]:
import pandas as pd

# Load the data from the JSON file
complaints_data = pd.read_json("complaints-2023-08-25_18_02.json")

# Display the first few rows of the data
complaints_data.head()

Unnamed: 0,_index,_type,_id,_score,_source,sort
0,complaint-public-v2,_doc,5471601,,"{'product': 'Checking or savings account', 'co...",[16]
1,complaint-public-v2,_doc,5020019,,"{'product': 'Checking or savings account', 'co...",[108]
2,complaint-public-v2,_doc,7203230,,"{'product': 'Vehicle loan or lease', 'complain...",[136]
3,complaint-public-v2,_doc,3743284,,"{'product': 'Checking or savings account', 'co...",[156]
4,complaint-public-v2,_doc,2927362,,"{'product': 'Credit reporting, credit repair s...",[188]


In [2]:
# Extract the relevant details from the _source column
complaints_details = complaints_data['_source'].apply(pd.Series)

# Filter entries where product is "Vehicle loan or lease"
vehicle_complaints = complaints_details[complaints_details['product'] == 'Vehicle loan or lease']

# Filter out entries without narratives
vehicle_complaints_with_narrative = vehicle_complaints[vehicle_complaints['complaint_what_happened'].notnull()]
vehicle_complaints_with_narrative

# Retrieve only the issues and the related narratives
issues_and_narratives = vehicle_complaints_with_narrative[['issue', 'complaint_what_happened']]
issues_and_narratives.head()

# Filter out entries with empty narratives
cleaned_issues_and_narratives = issues_and_narratives[issues_and_narratives['complaint_what_happened'].str.strip() != ""]
cleaned_issues_and_narratives.head()

Unnamed: 0,issue,complaint_what_happened
8,Managing the loan or lease,Yes I called ally bank about getting exstentio...
13,Managing the loan or lease,This issue has persisted for multiple years de...
17,Getting a loan or lease,I received notice dated XX/XX/2019 from Ally B...
19,Managing the loan or lease,Shortly after purchasing the vehicle I cancele...
23,Problems at the end of the loan or lease,Ally Financial was the lender used to lease a ...


In [3]:
# Recount the issues after filtering out empty narratives and rank them in descending order
cleaned_issue_counts = cleaned_issues_and_narratives['issue'].value_counts()
cleaned_issue_counts

Managing the loan or lease                                                          612
Problems at the end of the loan or lease                                            467
Struggling to pay your loan                                                         312
Getting a loan or lease                                                             230
Incorrect information on your report                                                142
Problem with a credit reporting company's investigation into an existing problem     90
Improper use of your report                                                          31
Credit monitoring or identity theft protection services                               5
Unable to get your credit report or credit score                                      2
Problem with fraud alerts or security freezes                                         1
Name: issue, dtype: int64

In [4]:
# 1.1 Select the top 5 issues
top_5_issues = cleaned_issue_counts.head(5).index
filtered_data = cleaned_issues_and_narratives[cleaned_issues_and_narratives['issue'].isin(top_5_issues)]
filtered_data

Unnamed: 0,issue,complaint_what_happened
8,Managing the loan or lease,Yes I called ally bank about getting exstentio...
13,Managing the loan or lease,This issue has persisted for multiple years de...
17,Getting a loan or lease,I received notice dated XX/XX/2019 from Ally B...
19,Managing the loan or lease,Shortly after purchasing the vehicle I cancele...
23,Problems at the end of the loan or lease,Ally Financial was the lender used to lease a ...
...,...,...
14950,Managing the loan or lease,"Major, FRAUDULENT Billing Errors, with : ALLY ..."
14958,Managing the loan or lease,My car in XXXX was Fanancial by ally they call...
14968,Managing the loan or lease,Purchased a XXXX XXXX XXXX in XXXX of XX/XX/XX...
14985,Struggling to pay your loan,"Ally Financial contacted my neighbors, to whom..."


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the data into training and test sets
X = filtered_data['complaint_what_happened']
y = filtered_data['issue']

In [6]:
from sklearn.model_selection import train_test_split

# Split the data into train, validation, and test sets
# First, let's split the data into training (60%) and temp (40%)
X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Next, we'll split the temp data into validation (20%) and test (20%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_temp)

# Transform the validation and test data
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Show the shape of resulting TF-IDF matrices
X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape


((1057, 5000), (353, 5000), (353, 5000))

In [8]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
le = LabelEncoder()

# Fit and transform the labels
y_train_temp_encoded = le.fit_transform(y_train_temp)
y_val_encoded = le.transform(y_val)
y_test_encoded = le.transform(y_test)


In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for XGBoost
xgb_param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.3, 0.5, 0.7]
}

# Initialize the XGBClassifier
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# Initialize GridSearchCV
xgb_grid_search = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, 
                               cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model
xgb_grid_search.fit(X_train_tfidf, y_train_temp_encoded)

# Get the best parameters
xgb_best_params = xgb_grid_search.best_params_
xgb_best_params




Fitting 3 folds for each of 243 candidates, totalling 729 fits


{'colsample_bytree': 0.5,
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 100,
 'subsample': 1.0}

In [10]:
best_xgb = XGBClassifier(
    colsample_bytree=0.5,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=100,
    subsample=1.0,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

In [11]:
best_xgb.fit(X_train_tfidf, y_train_temp_encoded)

In [12]:
y_val_pred = best_xgb.predict(X_val_tfidf)

In [14]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_val = accuracy_score(y_val_encoded, y_val_pred)
classification_rep = classification_report(y_val_encoded, y_val_pred)

print(f"Validation Accuracy: {accuracy_val}")
print(f"Classification Report: \n{classification_rep}")

Validation Accuracy: 0.6345609065155807
Classification Report: 
              precision    recall  f1-score   support

           0       0.62      0.57      0.59        46
           1       0.56      0.31      0.40        29
           2       0.61      0.76      0.67       123
           3       0.70      0.74      0.72        93
           4       0.63      0.44      0.51        62

    accuracy                           0.63       353
   macro avg       0.62      0.56      0.58       353
weighted avg       0.63      0.63      0.62       353



In [15]:
y_test_pred = best_xgb.predict(X_test_tfidf)
accuracy_test = accuracy_score(y_test_encoded, y_test_pred)
classification_rep_test = classification_report(y_test_encoded, y_test_pred)

print(f"Test Accuracy: {accuracy_test}")
print(f"Test Classification Report: \n{classification_rep_test}")


Test Accuracy: 0.5892351274787535
Test Classification Report: 
              precision    recall  f1-score   support

           0       0.59      0.50      0.54        46
           1       0.31      0.18      0.23        28
           2       0.55      0.66      0.60       122
           3       0.64      0.67      0.66        94
           4       0.69      0.59      0.63        63

    accuracy                           0.59       353
   macro avg       0.56      0.52      0.53       353
weighted avg       0.58      0.59      0.58       353

