## Packages for keyword extraction
- gensim: A library for topic modeling, which can also be used for extracting key phrases using the TextRank algorithm.
- nltk: The Natural Language Toolkit provides various tools and algorithms for natural language processing, including key phrase extraction.
- spaCy: A powerful library for natural language processing, which includes functionality for extracting key phrases.
- summa: A library specifically designed for text summarization, but it also includes a keyword extraction module that can be used for extracting key phrases.
- pytextrank: A Python implementation of the TextRank algorithm, which can be used for key phrase extraction.
- yake: 
- kpminer:  
- pke: 

##### Unsupervised graph based Keyword Extraction Models 
- TextRank
- SingleRank
- TopicRank
- TopicalPageRank
- PositionRank
- MultipartiteRank

##### statistical keyword extraction models 
- TF-IDF
- KPMiner
- YAKE!

##### data processing packages used. 
- pandas, polars , contractions, symspellpy

In [1]:
import pandas as pd
import polars as pl
from spello.model import SpellCorrectionModel
import io, re, string, spello,requests , zipfile, os, nltk, spacy , pytextrank
from pathlib import Path
from bs4 import BeautifulSoup # For removing HTML
import contractions # For expanding contractions
from unidecode import unidecode # For handling accented words

/home/zjc1002/envs/key_phrase_extraction/lib/python3.11/site-packages


In [2]:
def download_and_unzip(url: str, destination_folder: str):
    
    """
    Downloads a file from the given URL and extracts its contents to the specified destination folder.

    Args:
        url (str): The URL of the file to download.
        destination_folder (str): The path to the folder where the contents of the zip file will be extracted.

    Returns:
        None

    Raises:
        None
    """

    # Send a GET request to download the file
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:

        # Read the content of the response
        content = response.content

        # Create a file-like object from the response content
        file = io.BytesIO(content)

        # Extract the contents of the zip file
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
    else:
        print("Failed to download the file.")

def correct_spelling( model: SpellCorrectionModel,text: str) -> str:
    """
    Corrects the spelling of the given text using the specified spell correction model.

    Args:
        text (str): The text to correct.
        model (SpellCorrectionModel): The spell correction model to use.

    Returns:
        str: The corrected text.

    Raises:
        None
    """
    # Correct the spelling of the text
    corrected_text = model.spell_correct(text)['spell_corrected_text']

    return corrected_text

def preprocess_text(text: str):
    def remove_html(text):
        soup = BeautifulSoup(text)
        text = soup.get_text()
        return text

    def remove_urls(text):
        pattern = re.compile(r'https?://(www\.)?(\w+)(\.\w+)(/\w*)?')
        text = re.sub(pattern, "", text)
        return text

    def remove_emails(text):
        pattern = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+")
        text = re.sub(pattern, "", text)
        return text

    def handle_accents(text):
        text = unidecode(text)
        return text

    def remove_unicode_chars(text):
        text = text.encode("ascii", "ignore").decode()
        return text

    def remove_punctuations(text):
        text = re.sub('[%s]' % re.escape(string.punctuation), " ",text)
        return text

    def remove_digits(text):
        pattern = re.compile("\w*\d+\w*")
        text = re.sub(pattern, "",text)
        return text

    def remove_stopwords(text):
        return " ".join([word for word in str(text).split() if word not in stop_words])

    def remove_extra_spaces(text):
        text = re.sub(' +', ' ', text).strip()
        return text
    
    #return remove_html(contractions.fix(remove_urls(remove_emails(handle_accents(remove_unicode_chars(remove_punctuations(remove_digits(remove_stopwords(remove_extra_spaces(correct_spelling(sp_model,text)))))))))))
    return remove_html(contractions.fix(remove_urls(remove_emails(handle_accents(remove_unicode_chars(remove_punctuations(remove_digits(remove_stopwords(remove_extra_spaces(text))))))))))


In [50]:

cfg = {'data_path':'/home/zjc1002/Mounts/data/cfpb/cfpb_complaints.csv'
       , 'incols':['Date received', 'Product','Consumer complaint narrative','Company public response']
       , 'text_cols':['Consumer complaint narrative','Company public response']
       , 'spell_correct_model_download_dir':"/home/zjc1002/Mounts/temp/"
       , 'spell_correct_model_url': "https://haptik-website-images.haptik.ai/spello_models/en_large.pkl.zip"
       }


#manually download spacy model to disk for use in future 
cache_dir="/home/zjc1002/Mounts/temp/"
model_path="en_core_web_lg"

# URL of the file to download
url = "https://haptik-website-images.haptik.ai/spello_models/en_large.pkl.zip"
destination_folder = "/home/zjc1002/Mounts/temp/"

incols = ['Date received', 'Product','Consumer complaint narrative','Company public response']
text_cols = ['Consumer complaint narrative','Company public response']
n_samp = 1000 

In [4]:

#download spacy model 
if not os.path.exists(Path(cache_dir,model_path).as_posix()):
    spacy.cli.download(model_path)

#load spacy model 
nlp = spacy.load(model_path)
nlp.to_disk(os.path.join(cache_dir,model_path))
nlp = spacy.load(os.path.join(cache_dir,model_path))

#download stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords # For removing stopwords
stop_words = set(stopwords.words('english'))

#download the spell correction model (just copy in paste if behind firewall) 
if os.path.exists(destination_folder):
    print("The spell check model folder exists.")
else:
    download_and_unzip(url, destination_folder)

#spell correction model, we dont use it 
#sp = SpellCorrectionModel(language='en')
#sp.load(Path(destination_folder,url.split('/')[-1].replace('.zip','')))

The spell check model folder exists.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/zjc1002/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
# Read the CSV file into a polars DataFrame
df = (pl.read_csv(cfg['data_path'], has_header=True)[incols]).drop_nulls(subset=text_cols)

# Create a new column 'input_txt' by concatenating 'Consumer complaint narrative' and 'Company public response'
df = (df.with_columns(pl.concat_str([pl.col('Consumer complaint narrative')
                                    , pl.col('Company public response')]).alias('input_col'))
                                    ).select(pl.col("*").exclude(text_cols))


#preprocess text
#df = df.map_rows(lambda t: preprocess_text(t[2]))
df = df.with_columns(pl.col("input_col").map_elements(preprocess_text, return_dtype=str, strategy= 'thread_local'))

#Show the resulting DataFrame
print(df.head())

shape: (5, 3)
┌───────────────┬───────────────────────────────────┬───────────────────────────────────┐
│ Date received ┆ Product                           ┆ input_col                         │
│ ---           ┆ ---                               ┆ ---                               │
│ str           ┆ str                               ┆ str                               │
╞═══════════════╪═══════════════════════════════════╪═══════════════════════════════════╡
│ 03/03/2018    ┆ Credit reporting, credit repair … ┆ On XX XX  item showed credit rep… │
│ 01/02/2019    ┆ Debt collection                   ┆ Saw credit report collection    … │
│ 01/08/2019    ┆ Credit reporting, credit repair … ┆ I public record   Child Support … │
│ 12/19/2018    ┆ Credit reporting, credit repair … ┆ XXXX furnished ficticious deroga… │
│ 07/09/2018    ┆ Mortgage                          ┆ Complaint SUNTRUST MORTGAGE rega… │
└───────────────┴───────────────────────────────────┴─────────────────────────────────

In [40]:
df.columns

['Date received', 'Product', 'input_col', 'keywords']

### Option 1: Spacy (unigram monitoring)

In [52]:
stopwords = nlp.Defaults.stop_words

# Customize your stopwords list
stopwords.add('new_stopword'),
stopwords |= {"Afham","Farden"}

stopwords.remove('new_stopword'),
stopwords -= {"Afham","Farden"}

def get_keywords_using_spacy(text,    pos_tag = ['PROPN', 'ADJ', 'NOUN'] ):
    
    doc = nlp(text)

    # Set the hot words as the words with pos tag “PROPN“, “ADJ“, or “NOUN“. (POS tag list is customizable)
    keywords = ([token.text for token in doc if token not in stopwords if not token.is_punct if token.pos_ in pos_tag])
    
    return keywords

#idientify keywords per group
df = df.sample(n_samp).with_columns(keywords = pl.col('input_col').map_elements(get_keywords_using_spacy, strategy= 'thread_local'))
df = df.explode('keywords')

#format dates 
df = df.with_columns(date_recieved = pl.col('Date received').str.to_datetime("%m/%d/%Y"))
df = df.with_columns(year_month=pl.col('date_recieved').dt.strftime("%Y/%m"))

#generate datafarme of ngrams by date and product to plot / derive monitoring rules to define EMERINGING
plot_df = df.groupby(['year_month','Product','keywords']).count()
plot_df = plot_df.sort(by=['year_month','Product'])

#calculate %chagne in each group / ngram
plot_df = plot_df.sort(['year_month', 'Product']).with_columns([pl.col('count').pct_change().over(['year_month','Product']).alias('pct_chg')])
plot_df.head()

year_month,Product,keywords,count
str,str,str,u32
"""2015/04""","""Consumer Loan""","""practices""",2
"""2015/04""","""Consumer Loan""","""equity""",2
"""2015/04""","""Consumer Loan""","""re""",2
"""2015/04""","""Consumer Loan""","""WE""",2
"""2015/04""","""Consumer Loan""","""applicatin""",2
"""2015/04""","""Consumer Loan""","""call""",2
"""2015/04""","""Consumer Loan""","""USBank""",2
"""2015/04""","""Consumer Loan""","""loan""",8
"""2015/04""","""Consumer Loan""","""Unfair""",2
"""2015/04""","""Consumer Loan""","""House""",2


In [65]:
plot_df.filter((pl.col('Product')=='Consumer Loan'))

year_month,Product,keywords,count,pct_chg
str,str,str,u32,f64
"""2015/04""","""Consumer Loan""","""practices""",2,
"""2015/04""","""Consumer Loan""","""equity""",2,0.0
"""2015/04""","""Consumer Loan""","""re""",2,0.0
"""2015/04""","""Consumer Loan""","""WE""",2,0.0
"""2015/04""","""Consumer Loan""","""applicatin""",2,0.0
…,…,…,…,…
"""2016/12""","""Consumer Loan""","""CFPB""",8,1.0
"""2016/12""","""Consumer Loan""","""number""",4,-0.5
"""2016/12""","""Consumer Loan""","""due""",4,0.0
"""2016/12""","""Consumer Loan""","""settlement""",4,0.0


In [53]:
### Option 2: PyTextRank

#calculate standard deviation on frequency of each term by group
# Group by 'year_month' and 'Product' and calculate the percent change in 'count'

percent_change_df = plot_df.groupby(['year_month', 'Product']).pct_change('count').alias('percent_change')

# Show the resulting DataFrame
print(percent_change_df)
#failed attempt
#aggregate text from each group into a singel string to identify keyphrases 
# group_text = (df.group_by('Product').agg(pl.col('input_col').alias('product_text'))
#               ).with_columns(final_text = pl.col('product_text').list.join('.'))

### Option 2: PyTextRank