In [7]:
from nltk import SnowballStemmer
import pandas as pd
import re

# Initialize SnowballStemmer with English language
snow_stemmer = SnowballStemmer(language='english')

# Import necessary libraries for machine learning and text processing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
import spacy

def stemming_tokenizer(str_input):
    """
    This function tokenizes the input string and applies stemming to each token.

    Parameters:
    str_input (str): The input string to be tokenized and stemmed.

    Returns:
    list: The list of stemmed tokens.
    """
    words = re.sub(r"[^A-Za-z]", " ", str_input).lower().split()
    words = [snow_stemmer.stem(word) for word in words]
    return words

def TAB_dfm(text, ngrams_range = (1,2), stop_words = 'english', min_prop = .01, max_features=None):
    """
    This function applies CountVectorizer to the input text and returns a DataFrame and a matrix representation of the text.

    Parameters:
    text (str): The input text to be vectorized.
    ngrams_range (tuple): The range of n-values for different n-grams to be extracted.
    stop_words (str): 'english' if English stop words are to be removed, else False.
    min_prop (float): The minimum proportion of documents a word must be present in for it to be kept.
    max_features (int): The maximum number of features to be kept, based on term frequency.

    Returns:
    pandas.DataFrame: DataFrame representation of the text.
    numpy.matrix: Matrix representation of the text.
    """
    if stop_words == 'english':
        vec = CountVectorizer(
            tokenizer = stemming_tokenizer,
            stop_words = stop_words,
            ngram_range=ngrams_range,
            min_df=min_prop,
            max_features=max_features,
            token_pattern='(?u)\\b\\w+\\b'
            )
    else:
        vec = CountVectorizer(
            tokenizer = stemming_tokenizer,
            ngram_range=ngrams_range,
            min_df=min_prop,
            max_features=max_features,
            token_pattern='(?u)\\b\\w+\\b'
        )

    mtx = vec.fit_transform(text).todense()
    df = round(pd.DataFrame(mtx, columns=vec.get_feature_names_out()),2)
    return df, mtx

def kendall_acc(x,y,percentage = True):
    """
    This function calculates the Kendall's tau-a correlation coefficient between two lists.

    Parameters:
    x, y (list): The two lists for which to calculate the correlation coefficient.
    percentage (bool): If True, the result is returned as a percentage.

    Returns:
    pandas.DataFrame: A DataFrame containing the correlation coefficient, lower and upper confidence intervals.
    """
    tau, p_value = stats.kendalltau(x, y)
    tau_acc = .5+tau/2
    tau_se = np.sqrt((tau_acc*(1 - tau_acc))/len(x))
    report = pd.DataFrame([tau_acc, tau_acc - 1.96 * tau_se, tau_acc + 1.96 * tau_se],
                          index = ['acc', 'lower', 'upper']).T
    report = round(report,4)

    if percentage is True:
        report = report * 100

    return report

def jaccard_sim(str1, str2):
    """
    This function calculates the Jaccard similarity between two strings.

    Parameters:
    str1, str2 (str): The two strings for which to calculate the Jaccard similarity.

    Returns:
    float: The Jaccard similarity between the two strings.
    """
    a = set(stemming_tokenizer(str1))
    b = set(stemming_tokenizer(str2))
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def euclidian_dist(docs, y = 0):
    """
    This function calculates the Euclidean distance between the vectors of a list of documents and a specific document.

    Parameters:
    docs (list): The list of documents.
    y (int): The index of the specific document.

    Returns:
    list: The list of Euclidean distances.
    """
    _, features = np.asarray(TAB_dfm(docs))
    distances = [round(float(euclidean_distances([features[y]], [f])),2) for f in features]
    return distances

def cosine_sim(docs, y = 0):
    """
    This function calculates the cosine similarity between the vectors of a list of documents and a specific document.

    Parameters:
    docs (list): The list of documents.
    y (int): The index of the specific document.

    Returns:
    list: The list of cosine similarities.
    """
    _, features = np.asarray(TAB_dfm(docs, stop_words = False))
    distances = [round(float(cosine_similarity([features[y]], [f])),2) for f in features]
    return distances

def spacy_parse(text):
    """
    This function parses a text using the Spacy library and returns a DataFrame containing the parsed information.

    Parameters:
    text (str): The text to be parsed.

    Returns:
    pandas.DataFrame: A DataFrame containing the parsed information.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    rows = [[t.text, t.lemma_, t.pos_, t.tag_, t.dep_, spacy.explain(t.pos_), t.is_stop] for t in doc]
    cols = ("text", "lemma", "POS", "Tag","Dep","explain", "stopword")
    df = pd.DataFrame(rows, columns=cols)
    return df

def lemmas_parse(text):
    """
    This function parses a text and returns a string of lemmas that are not pronouns, numbers, symbols, stopwords, spaces or punctuations.

    Parameters:
    text (str): The text to be parsed.

    Returns:
    str: A string of lemmas.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    lemmas = [t.lemma_ for t in doc
              if t.pos_ not in ('SPACE', 'PRON', 'PUNCT', 'NUM', 'SYM')
              if t.is_stop == False]
    return ' '.join(lemmas)

def lemmas_dfm(texts):
    """
    This function applies the lemmas_parse function to a list of texts and returns a DataFrame of the results.

    Parameters:
    texts (list): The list of texts.

    Returns:
    pandas.DataFrame: A DataFrame of the results.
    """
    dfms_joined = pd.DataFrame()
    for text in texts:
        text = [lemmas_parse(text)]
        if len(text[0]) > 1:
            dfm, _ = TAB_dfm(text, ngrams_range=(0,1), stop_words = False)
            dfms_joined = dfms_joined.append(dfm)
    return dfms_joined

def ner_parse(text):
    """
    This function parses a text using the Spacy library and returns a DataFrame containing the named entities.

    Parameters:
    text (str): The text to be parsed.

    Returns:
    pandas.DataFrame: A DataFrame containing the named entities.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    rows = [[ent.text, ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
    cols = ("Text", "Start", "End", "Label")
    df = pd.DataFrame(rows, columns=cols)
    return df

def ner_filter_parse(text):
    """
    This function parses a text using the Spacy library and returns a string of unique named entities that are geographical places.

    Parameters:
    text (str): The text to be parsed.

    Returns:
    str: A string of unique named entities that are geographical places.
    """
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    ents = [ent.text for ent in doc.ents
            if ent.label_ == 'GPE']
    return ' '.join(list(set(ents)))

def ner_dfm(texts):
    """
    This function applies the ner_filter_parse function to a list of texts and returns a DataFrame of the results.

    Parameters:
    texts (list): The list of texts.

    Returns:
    pandas.DataFrame: A DataFrame of the results.
    """
    dfms_joined = pd.DataFrame()
    for text in texts:
        text = [ner_filter_parse(text)]
        if len(text[0]) > 1:
            dfm, _ = TAB_dfm(text, ngrams_range=(0,1), stop_words = False)
            dfms_joined = dfms_joined.append(dfm)
    return dfms_joined

def tokenizer(str_input):
    """
    This function tokenizes the input string.

    Parameters:
    str_input (str): The input string to be tokenized.

    Returns:
    list: The list of tokens.
    """
    words = re.sub(r"[^A-Za-z]", " ", str_input).lower().split()
    return words

def dfm_lookup(text, dict_as_list, ngrams_range = (1,1), min_prop = .01, max_features=None):
    """
    This function applies CountVectorizer to the input text and returns a DataFrame and a matrix representation of the text.

    Parameters:
    text (str): The input text to be vectorized.
    dict_as_list (list): The list of words to be used as the dictionary.
    ngrams_range (tuple): The range of n-values for different n-grams to be extracted.
    min_prop (float): The minimum proportion of documents a word must be present in for it to be kept.
    max_features (int): The maximum number of features to be kept, based on term frequency.

    Returns:
    pandas.DataFrame: DataFrame representation of the text.
    numpy.matrix: Matrix representation of the text.
    """
    vec = CountVectorizer(
        tokenizer = tokenizer,
        stop_words = 'english',
        ngram_range=ngrams_range,
        min_df=min_prop,
        max_features=max_features,
        token_pattern='(?u)\\b\\w+\\b'
        )

    mtx = vec.fit_transform(text).todense()
    df = round(pd.DataFrame(mtx, columns=vec.get_feature_names_out()),2)
    df = df[df.columns.intersection(dict_as_list)]
    row_sums = df.sum(axis=1)
    return row_sums

In [8]:
# Import the pandas library as pd
import pandas as pd

# Read the 'vecSmall.csv' file
# The first column of the file is used as the index of the DataFrame
# The resulting DataFrame is stored in the variable 'vecSmall'
vecSmall = pd.read_csv('vecSmall.csv', index_col= 0)

# Read the 'wfFile.csv' file
# The first column of the file is used as the index of the DataFrame
# The resulting DataFrame is stored in the variable 'wfFile'
wfFile = pd.read_csv('wfFile.csv', index_col= 0)

# Read the 'filtered_data_new.csv' file
# The first column of the file is used as the index of the DataFrame
# The 'low_memory' parameter is set to False to silence a warning about column types
# The resulting DataFrame is stored in the variable 'data'
data = pd.read_csv('filtered_dataset.csv', index_col= 0, low_memory=False)

In [4]:
from sklearn.model_selection import ShuffleSplit

# Instantiate the ShuffleSplit class with 1 split, a test size of 40%, and a random state of 42 for reproducibility
# This will be used to create a random split of the data into training and testing sets
sss = ShuffleSplit(n_splits=1, test_size=0.4, random_state = 42)

# Get the number of splitting iterations in the cross-validator
# This is not necessary for the split but can be used to check the number of splits
sss.get_n_splits(data)

# Generate indices to split data into training and test set
# next() is used to get the next item from the iterator
train_index, test_index = next(sss.split(data))

# Use the generated indices to create the training set
# iloc is used for indexing via integers
data_train = data.iloc[train_index]

# Use the generated indices to create the test set
data_test = data.iloc[test_index]

In [5]:
# Import the lowercase function from the pyodbc library
from pyodbc import lowercase

# The following code is a Python equivalent to the vecCheck function in the vectorFunctions.R script

# Define a pipeline for projecting data into embedding space
# The pipeline consists of two steps:
# 1. TfidfVectorizer: This is used to convert the text data into a matrix of TF-IDF features.
#    The vocabulary is set to the index of the wfFile DataFrame and the lowercase parameter is set to False to keep uppercase characters.
# 2. TruncatedSVD: This is used for dimensionality reduction. It transforms the data to have the same number of dimensions as the pre-trained model.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(vocabulary=wfFile.index, lowercase=False)),  
    ('lsa', TruncatedSVD(n_components=vecSmall.shape[1])),  
])

In [6]:
# Fit the pipeline to the 'Consumer complaint narrative' column of the data DataFrame
# This step involves transforming the text data into a matrix of TF-IDF features and then reducing the dimensionality of the data
pipeline.fit(data['Consumer complaint narrative'])

# Transform the 'Consumer complaint narrative' column of the data DataFrame using the fitted pipeline
# This step involves projecting the data into the embedding space
vdat = pipeline.transform(data['Consumer complaint narrative'])

# Convert the embedded data into a DataFrame
# The column names are generated dynamically based on the number of dimensions in the embedded data
# Each column represents a dimension in the embedding space
vdat = pd.DataFrame(vdat, columns=[f'vec{i+1}' for i in range(vdat.shape[1])])

# Print the first 10 rows of the DataFrame
# This is used to check the transformed data
print(vdat.head(10))

KeyboardInterrupt: 

In [None]:
# Select the training data from the transformed DataFrame 'vdat' using the training indices
vdat_train = vdat.iloc[train_index]

# Select the testing data from the transformed DataFrame 'vdat' using the testing indices
vdat_test = vdat.iloc[test_index]

In [None]:
vdat

#############################################
# Train a vector classifier
#############################################

In [None]:
# Calculate the frequency of each category in 'Product'
frequency = data_train['Product'].value_counts(normalize=True)

# Map the frequencies to the training data
data_train['Product'] = data_train['Product'].map(frequency)

# Map the frequencies to the testing data
# Note: categories in the testing data that are not found in the training data will be replaced with NaN
data_test['Product'] = data_test['Product'].map(frequency)

# Now you can fit the Lasso model and make predictions
Lasso_vec = Lasso(alpha = 0.001)
Lasso_vec.fit(vdat_train,  data_train['Product'])
test_predict = Lasso_vec.predict(vdat_test)

# Estimate accuracy
vec_acc = kendall_acc(test_predict, data_test['Product'])

In [None]:
vec_acc

#############################################
# vector embeddings + ngrams
#############################################

In [None]:
# Apply the TAB_dfm function to the 'Consumer complaint narrative' column of the training and testing data
# The function returns a DataFrame and a matrix representation of the text
# Only the DataFrame is used, so the matrix is discarded by assigning it to _
data_dfm_train, _ = TAB_dfm(data_train['Consumer complaint narrative'])
data_dfm_test, _ = TAB_dfm(data_test['Consumer complaint narrative'], min_prop = 0)

# Create a list of lists containing the column names of the training and testing DataFrames
d = [list(data_dfm_train), list(data_dfm_test)]

# Find the intersection of the column names of the training and testing DataFrames
# This is done to ensure that both DataFrames have the same columns
col_heads = list(set.intersection(*map(set,d)))

# Reset the indices of the training and testing DataFrames
# This is done to ensure that the indices are consistent after concatenating the DataFrames
data_dfm_train= data_dfm_train[col_heads].reset_index(drop = True)
data_dfm_test = data_dfm_test[col_heads].reset_index(drop = True)
vdat_train = vdat_train.reset_index(drop = True)
vdat_test = vdat_test.reset_index(drop = True)

# Concatenate the training DataFrames along the columns
# This results in a DataFrame that includes both the vector embeddings and the n-grams
combined_x_train = pd.concat([vdat_train, data_dfm_train], axis = 1)

# Concatenate the testing DataFrames along the columns
# This results in a DataFrame that includes both the vector embeddings and the n-grams
combined_x_test = pd.concat([vdat_test, data_dfm_test], axis = 1)

In [None]:
print(vdat_train.shape)
print(data_dfm_train.shape)
print(combined_x_train.shape)

In [None]:
# Instantiate the Lasso model with an alpha of 0.001
# The alpha parameter controls the degree of regularization (higher values mean more regularization and simpler models)
lasso_all = Lasso(alpha = 0.001)

# Fit the Lasso model to the training data
# The independent variables are the combined vector embeddings and n-grams (combined_x_train)
# The dependent variable is the 'Product' column of the training data
lasso_all.fit(combined_x_train, data_train['Product'])

# Use the fitted model to make predictions on the testing data
# The independent variables are the combined vector embeddings and n-grams of the testing data (combined_x_test)
# The predictions are stored in the variable 'test_all_predict'
test_all_predict = lasso_all.predict(combined_x_test)

# Estimate the accuracy of the predictions

# The accuracy estimate is stored in the variable 'ngram_vec_acc'
ngram_vec_acc = kendall_acc(test_all_predict, data_test['Product'])

# Print the accuracy estimate
# This is used to check the performance of the model
print(ngram_vec_acc)

#############################################
# ngrams alone
#############################################

In [None]:

lasso_dfm = Lasso(alpha = 0.001)

# Fit the Lasso model to the training data
# The independent variables are the n-grams (data_dfm_train)
# The dependent variable is the 'Product' column of the training data
lasso_all.fit(data_dfm_train, data_train['Product'])

# Use the fitted model to make predictions on the testing data
# The independent variables are the n-grams of the testing data (data_dfm_test)
# The predictions are stored in the variable 'test_dfm_predict'
test_dfm_predict = lasso_all.predict(data_dfm_test)

# Estimate the accuracy of the predictions
# The accuracy estimate is stored in the variable 'ngram_acc'
ngram_acc = kendall_acc(test_dfm_predict, data_test['Product'])


# This is used to check the performance of the model
print(ngram_acc)

#############################################
# Logistic Regression
#############################################

In [9]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize TF-IDF Vectorizer
# The stop_words parameter is set to 'english' to remove English stop words from the text
# The max_features parameter is set to 1000 to only consider the top 1000 terms ordered by term frequency
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

# Fit the TF-IDF Vectorizer to the 'Consumer complaint narrative' column of the data DataFrame
# This step involves learning the vocabulary of the text and calculating the inverse document frequencies
# Then, transform the text into a matrix of TF-IDF features
X_tfidf = tfidf.fit_transform(data['Consumer complaint narrative'])

# Split the TF-IDF features and the 'Product' column of the data DataFrame into training and testing sets
# The test_size parameter is set to 0.6, meaning that 60% of the data will be used for testing and 40% for training
# The random_state parameter is set to 42 for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data['Product'], test_size=0.6, random_state=42)

In [10]:
# Initialize logistic regression model
model = LogisticRegression(max_iter=10000)

# Train the model
model.fit(X_train, y_train)

In [11]:
# Predict on the test data
y_pred = model.predict(X_test)

# Print classification report
# Estimate the accuracy of the predictions

logistic_acc = kendall_acc(y_pred, y_test)

# Print the accuracy estimate
# This is used to check the performance of the model
print(logistic_acc)

     acc  lower  upper
0  88.19  87.85  88.54


########################################
# Benchmarks
########################################

In [None]:
# Add a new column 'wdct' to the testing data DataFrame
# The values of this column are the word counts of the 'Consumer complaint narrative' column
# The word count is calculated by splitting the narrative into words and counting the number of words
data_test['wdct'] = data_test['Consumer complaint narrative'].apply(lambda x: len(str(x).split()))

# Add a new column 'sentiment' to the testing data DataFrame
# The values of this column are the sentiment scores of the 'Consumer complaint narrative' column
# The sentiment score is calculated using the TextBlob library, which returns a polarity score between -1 (negative) and 1 (positive)
data_test['sentiment'] = data_test['Consumer complaint narrative'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)


# The result is stored in the variable 'wdct_acc'
wdct_acc = kendall_acc(data_test['wdct'], data_test['Product'])


# The result is stored in the variable 'sentiment_acc'
sentiment_acc = kendall_acc(data_test['sentiment'], data_test['Product'])


# These are used to check the performance of the word count and sentiment score as predictors of the product
print(wdct_acc)
print(sentiment_acc)

########################################
# Combine accuracy estimates for a plot
########################################

In [None]:
# Concatenate the accuracy estimates from different models into a single DataFrame
# The models include: ngram model, vector model, combined ngram and vector model, word count model, and sentiment model
plot_dat = pd.concat([ngram_acc, vec_acc, ngram_vec_acc, wdct_acc, sentiment_acc, logistic_acc])

# Add a new column 'features' to the DataFrame
# The values of this column are the names of the feature sets used in the models
plot_dat['features'] = ['ngrams', 'w2v', 'ngrams+w2v', 'word count', 'sentiment', 'logistic']

# Add a new column 'err' to the DataFrame
# The values of this column are the differences between the accuracy estimates and the lower confidence intervals
# This is used to calculate the error bars for the plot
plot_dat['err'] = plot_dat['acc'] - plot_dat['lower']

# Print the DataFrame
# This is used to check the accuracy estimates and error bars
print(plot_dat)

In [None]:

from matplotlib import pyplot as plt

fig = plt.figure()
plt.errorbar(y = plot_dat['features'], x = plot_dat['acc'], xerr=plot_dat['err'], fmt="o", color="b", elinewidth=.9, markersize=8, capsize=10)

plt.grid(False)
plt.axvline(x=50, color='lightgrey', linestyle='-')
# add axis labels
plt.xlabel('Accuracy', fontsize=18)
plt.ylabel('Feature set', fontsize=18)
# Makes the margins a bit wider (useful when there's only 2 points)
plt.margins(0.1, tight=True)

# set the height of the yaxis to be proportional to the data
plt.xlim(right=((max(plot_dat['acc']) + max(plot_dat['err']))) * 1.02,
         left=((min(plot_dat['acc']) - min(plot_dat['err']))) * 0.98)

plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.show()
