In [34]:
# Mount Google Drive at /content/drive so files stored there can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# One-time setup: uncomment and run once to extract the dataset archive on Drive.
# !unzip /content/drive/MyDrive/datasets/data.zip -d /content/drive/MyDrive/datasets/data

Archive:  /content/drive/MyDrive/datasets/data.zip
  inflating: /content/drive/MyDrive/datasets/data/complaints_processed.csv  


In [35]:
import pandas as pd

# Read the complaints CSV from Drive and use its 'Unnamed: 0' column as the index,
# then preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/datasets/data/complaints_processed.csv').set_index('Unnamed: 0')
print(df.head())

                     product  \
Unnamed: 0                     
0                credit_card   
1                credit_card   
2             retail_banking   
3           credit_reporting   
4           credit_reporting   

                                                    narrative  
Unnamed: 0                                                     
0           purchase order day shipping amount receive pro...  
1           forwarded message date tue subject please inve...  
2           forwarded message cc sent friday pdt subject f...  
3           payment history missing credit report speciali...  
4           payment history missing credit report made mis...  


In [36]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162421 entries, 0 to 162420
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   product    162421 non-null  object
 1   narrative  162411 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB


In [37]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
product,0
narrative,10


In [38]:
# Drop every row that contains a missing value. This mutates `df` in place,
# so the frame shown by earlier cells no longer reflects its current contents.
df.dropna(inplace=True)

In [39]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py

--2024-09-12 07:08:28--  https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10246 (10K) [text/plain]
Saving to: ‘helper_functions.py.1’


2024-09-12 07:08:28 (84.0 MB/s) - ‘helper_functions.py.1’ saved [10246/10246]



In [40]:
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys
from sklearn.model_selection import train_test_split

In [41]:
from sklearn.preprocessing import LabelEncoder

In [42]:
# Learn the product-name -> integer-id mapping, then store the encoded ids as a new column.
# `le` is kept so the ids can be mapped back to product names later.
le = LabelEncoder().fit(df['product'])
df['y_numeric'] = le.transform(df['product'])

In [43]:
# Distinct encoded class ids present in the data.
df['y_numeric'].unique()

array([0, 4, 1, 3, 2])

In [44]:
# Features are the complaint narratives; targets are the encoded product ids.
X, y = df["narrative"], df['y_numeric']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [45]:
# Sanity check: features and labels should have matching lengths within each split.
print(*(len(split) for split in (X_train, y_train, X_test, y_test)))

108815 108815 53596 53596


In [46]:
# Mean number of whitespace-separated tokens per training narrative.
# Each value is passed through str() first so split() is always available.
token_counts = (len(str(text).split()) for text in X_train)
round(sum(token_counts) / len(X_train))

81

In [47]:
import tensorflow as tf

In [48]:
from tensorflow.keras.layers import TextVectorization

For max_tokens (the number of words in the vocabulary), multiples of 10,000 (10,000, 20,000, 30,000) or the exact number of unique words in your text (e.g. 32,179) are common values.

In [49]:
# Layer that maps raw text to fixed-length sequences of integer token ids.
text_vectorizer = TextVectorization(max_tokens=10000, # upper bound on the vocabulary size
                                    standardize="lower_and_strip_punctuation",
                                    split="whitespace",
                                    ngrams=None, # no n-gram grouping; each token is a single word
                                    output_mode="int", # emit one integer id per token
                                    output_sequence_length=80, # fixed output length; close to the mean token count (81) computed above
                                    pad_to_max_tokens=True) # NOTE(review): Keras documents this flag for multi_hot/count/tf_idf modes only - confirm it has any effect with output_mode="int"


In [50]:
# Check the dtype of the training text before converting it to a list of strings.
X_train.dtype

dtype('O')

In [51]:
# Coerce every training narrative to a plain Python str and hold them in a list.
# A comprehension (instead of Series.apply(str).tolist()) works whether X_train is still a
# pandas Series or already a list, so re-running this cell no longer raises AttributeError.
X_train = [str(text) for text in X_train]

In [52]:
# Fit the text vectorizer to the training text only (the test split is not passed in here)
text_vectorizer.adapt(X_train)

In [55]:
# Create sample sentence and tokenize it.
# The sentence is wrapped in a list so it is passed as a batch of one.
sample_sentence = "purchase order day shipping amount receive"
text_vectorizer([sample_sentence])

<tf.Tensor: shape=(1, 80), dtype=int64, numpy=
array([[ 227,  261,   11, 2023,   39,  122,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0]])>

No matter how long the sequence we pass to text_vectorizer is, it always returns a sequence of length 80 (shorter inputs are padded with 0s, as in the output above).

In [56]:
# Inspect the vocabulary learned by the text vectorizer.
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # first five entries: most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # last five entries: least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

Number of words in vocab: 10000
Top 5 most common words: ['', '[UNK]', 'account', 'credit', 'report']
Bottom 5 least common words: ['obfuscation', 'obfuscate', 'nvr', 'nonprofit', 'noi']


We've got a way to map our text to numbers. How about we go a step further and turn those numbers into an embedding?

The powerful thing about an embedding is it can be learned during training. This means rather than just being static (e.g. 1 = I, 2 = love, 3 = TensorFlow), a word's numeric representation can be improved as a model goes through data samples.

We can see what an embedding of a word looks like by using the tf.keras.layers.Embedding layer.

The main parameters we're concerned about here are:

input_dim - The size of the vocabulary (e.g. len(text_vectorizer.get_vocabulary())).
output_dim - The size of the output embedding vector, for example, a value of 100 outputs a feature vector of size 100 for each word.
embeddings_initializer - How to initialize the embeddings matrix; the default is "uniform", which randomly initializes the embedding matrix from a uniform distribution. This can be changed to use pre-learned embeddings.
input_length - Length of sequences being passed to embedding layer.
Knowing these, let's make an embedding layer.


Notice how embedding is a TensorFlow layer? This is important because we can use it as part of a model, meaning its parameters (word representations) can be updated and improved as the model learns.

In [57]:
# Seed TensorFlow's random generator before creating the randomly initialised embedding layer.
tf.random.set_seed(42)

embedding = tf.keras.layers.Embedding(input_dim=10000, # vocabulary size; same value as max_tokens of text_vectorizer
                             output_dim=128, # set size of embedding vector
                             embeddings_initializer="uniform", # default, initialize randomly
                             input_length=80, # length of each input sequence; same value as output_sequence_length of text_vectorizer. NOTE(review): newer Keras releases reportedly deprecate this argument - confirm against the installed version
                             name="embedding_1")



In [58]:
# Get a random sentence from training set
# NOTE(review): no random.seed(...) call is visible in this part of the notebook, so the
# chosen sentence presumably differs from run to run.
import random

random_sentence = random.choice(X_train)
print(f"Original text:\n{random_sentence}\
      \n\nEmbedded version:")

# Embed the random sentence (turn it into numerical representation):
# text -> token ids via text_vectorizer -> one embedding vector per token id
sample_embed = embedding(text_vectorizer([random_sentence]))
sample_embed

Original text:
loan originated sold specialized loan servicing original loan call loan payoff quote fee however applied refinance another lender loan pay fee charged statement called specialized loan servicing company got rude csr told charge anything want charge pay told loan purchased old loan contract longer governed asked supervisor denied speak anyone else feel company cheating people charging fee allowed charge least allowed constraint original contract purchased wonder many people cheated like company      

Embedded version:


<tf.Tensor: shape=(1, 80, 128), dtype=float32, numpy=
array([[[ 0.03447274, -0.03356683, -0.02829382, ...,  0.00927123,
         -0.04236375, -0.00734476],
        [ 0.01083285, -0.03804957,  0.018349  , ..., -0.03762488,
         -0.00852275,  0.04419606],
        [-0.04976238, -0.03998647, -0.01509665, ..., -0.0254895 ,
          0.03162172,  0.04935999],
        ...,
        [ 0.00226654,  0.03263614,  0.03409865, ...,  0.03011138,
          0.03977481, -0.01545439],
        [ 0.00226654,  0.03263614,  0.03409865, ...,  0.03011138,
          0.03977481, -0.01545439],
        [ 0.00226654,  0.03263614,  0.03409865, ...,  0.03011138,
          0.03977481, -0.01545439]]], dtype=float32)>

Naive Bayes is the simplest way to get a baseline, which we'll expect each of the deeper models to beat.

Each experiment will go through the following steps:

Construct the model
Train the model
Make predictions with the model
Track prediction evaluation metrics for later comparison

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline model: TF-IDF features feeding a multinomial Naive Bayes classifier
baseline_steps = [
    ("tfidf", TfidfVectorizer()),  # turn each narrative into TF-IDF weighted term counts
    ("clf", MultinomialNB()),      # classify from those features
]
model_0 = Pipeline(steps=baseline_steps)

# Train the whole pipeline on the training narratives and their encoded labels
model_0.fit(X_train, y_train)

In [62]:
# Score the baseline pipeline on the held-out test split
# (the printed value matches the accuracy reported by calculate_results further down).
baseline_score = model_0.score(X_test,y_test)
print(baseline_score)

0.8054145831778491


In [63]:
# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Calculates accuracy, precision, recall and f1 score of a classification model.

  Works for binary and multi-class labels alike: precision, recall and f1 are
  combined across classes with the "weighted" average.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  Note the scales differ: "accuracy" is a percentage (0-100), whereas
  "precision", "recall" and "f1" are left as returned by scikit-learn.
  """
  # Accuracy is scaled to a percentage
  model_accuracy = accuracy_score(y_true, y_pred) * 100
  # Precision, recall and f1 use the "weighted" average; the fourth return value is not needed
  model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(
      y_true, y_pred, average="weighted")
  return {"accuracy": model_accuracy,
          "precision": model_precision,
          "recall": model_recall,
          "f1": model_f1}

In [65]:
# Get baseline results: predict on the test split, then compute the metric dictionary
# (accuracy as a percentage; precision, recall and f1 weighted-averaged).
baseline_preds = model_0.predict(X_test)
baseline_results = calculate_results(y_true=y_test,
                                     y_pred=baseline_preds)
baseline_results

{'accuracy': 80.54145831778492,
 'precision': 0.8141066586900144,
 'recall': 0.8054145831778491,
 'f1': 0.79019313406408}

In [None]:
# Uncomment the next line only if joblib is not already installed in the runtime.
# !pip install joblib

import joblib
# Save the fitted baseline pipeline to 'model_0.pkl' (a relative path).
# NOTE(review): on Colab the working directory is presumably runtime-local - copy the file
# to Drive if it must outlive the session.
joblib.dump(model_0,'model_0.pkl')