In [1]:
# intalling depandances 
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation
import joblib

In [17]:
# Downloading stopwords
nltk.download('stopwords')

# Loading the dataset
df = pd.read_csv('/kaggle/input/complains/complaints_processed.csv')

# Displaying the first few rows of the dataframe
df.head()


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [18]:
# Defining stop words
stop_words = set(stopwords.words('english'))

# Preprocessing text data
def preprocess_text(text):
    if isinstance(text, str):
        tokens = [word for word in text.split() if word.lower() not in stop_words]
        return ' '.join(tokens)
    else:
        return ''

# Applying preprocessing
df['processed_complaint'] = df['narrative'].apply(preprocess_text)

In [19]:
# Spliting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_complaint'], df['product'], test_size=0.2, random_state=42)

# Creating a pipeline with TF-IDF vectorizer, StandardScaler, and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),  # Add StandardScaler
    ('clf', LogisticRegression(max_iter=1000))  # Increased max_iter
])

In [20]:
# Training the model
pipeline.fit(X_train, y_train)

# Predicting on the test set
y_pred = pipeline.predict(X_test)


print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.7785747267969833
Classification Report:
                     precision    recall  f1-score   support

        credit_card       0.58      0.57      0.58      3212
   credit_reporting       0.87      0.90      0.89     18130
    debt_collection       0.66      0.63      0.65      4619
mortgages_and_loans       0.66      0.64      0.65      3738
     retail_banking       0.70      0.67      0.68      2786

           accuracy                           0.78     32485
          macro avg       0.69      0.68      0.69     32485
       weighted avg       0.78      0.78      0.78     32485



In [21]:
# Saving the classification model
joblib.dump(pipeline, 'text_classification_model.pkl')

# Vectorize the text data using TF-IDF for LDA
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df['processed_complaint'])

# Defining and train the LDA model
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(tfidf)

# Displaying the topics
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))

display_topics(lda, tfidf_vectorizer.get_feature_names_out(), 10)

# Saving the LDA model and the TF-IDF vectorizer
joblib.dump(lda, 'lda_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

Topic #0:
debt credit reporting information report consumer letter law alleged collection
Topic #1:
debt collection company credit paid car account loan vehicle report
Topic #2:
loan mortgage payment forbearance escrow home month paid company time
Topic #3:
credit account report reporting information bureau inquiry inaccurate dispute removed
Topic #4:
payment card account credit bank late time told called month
Topic #5:
debt account credit collection report company inquiry creditor original reporting
Topic #6:
item report identity account credit remove theft unknown pulled fraudulent
Topic #7:
acct charge opened account balance act fraudulent dispute response writing
Topic #8:
account card bank money fraud number credit transaction fund phone
Topic #9:
pnc consumer link account block connect shall section agency information


['tfidf_vectorizer.pkl']