# Text Data Pre-Processing

In [74]:
# Loading required libraries 
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yolandapan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yolandapan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yolandapan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yolandapan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Processing the labeled data

In [75]:
# Load in the data
labeled = pd.read_excel('comments_final.xlsx', usecols=['Date', 'Text', 'final'])
labeled

Unnamed: 0,Date,Text,final
0,2024-11-25_143,ABSOLUTELY a SCAM I am at the doctor s of...,-1
1,2024-11-23_108,It s not special I promise you,-99
2,2024-11-22_129,I noticed Forward is currently hiring on Linke...,-99
3,2024-12-08_321,Yes gross taxable pretax investments lower gr...,-99
4,2024-12-17_211,Regardless this isn t a sign of a bad company...,-99
...,...,...,...
995,2024-12-08_227,And if you had jumped over the border illegall...,1
996,2024-11-20_126,I once passed on the sidewalk Ambulance was c...,1
997,2024-12-14_149,This is the opposite of the truth Subsidies ...,1
998,2024-12-08_268,Not really My mother was on this plan when sh...,1


In [76]:
labeled.dtypes

Date     object
Text     object
final     int64
dtype: object

In [77]:
# special characters have already been removed, need to handle remaining pieces of urls
handles = r'_\w+'
labeled['Text'] = labeled['Text'].str.replace(handles, '', regex=True)
one_letter_words = r'\b\w\b'
labeled['Text'] = labeled['Text'].str.replace(one_letter_words, '', regex=True)
words_of_one_letter = r'\b(\w)\1*\b'
labeled['Text'] = labeled['Text'].str.replace(words_of_one_letter, '', regex=True)
urls = r'\b(?:https?)\s\S*'  # Match http/https/www followed by non-space characters
labeled['Text'] = labeled['Text'].str.replace(urls, '', regex=True)
website = r'com'
labeled['Text'] = labeled['Text'].str.replace(website, '', regex=True)

In [78]:
flip_map = {
    'ǝ': 'e',
    'ʇ': 't',
    'ı': 'i',
    'ɹ': 'r',
    'ɔ': 'c',
    'ɥ': 'l',
    'ʌ': 'v',
    'ǝ':'e',
    'noʎ': 'y'}

# Function to replace flipped characters
def replace_flipped_chars(text):
    for flipped, normal in flip_map.items():
        text = text.replace(flipped, normal)
    return text
labeled['Text'] = labeled['Text'].apply(replace_flipped_chars)

In [79]:
# Remove stopwords and tokenize
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Filter out stopwords
    filtered_words = [word for word in words if word.lower() not in stop_words]
    
    # Join words back into a sentence
    return ' '.join(filtered_words)

# Apply the function to a DataFrame column (or individual text)
labeled['Text'] = labeled['Text'].apply(remove_stopwords)
labeled.iloc[0]['Text']

'ABSOLUTELY SCAM doctor office right even show credible gave ins Card'

In [80]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

# lemmatize words
lemmatizer = WordNetLemmatizer()

# Function to get the POS tag
def get_pos_tag(word):
    tag = pos_tag([word])[0][1]
    if tag.startswith('N'):
        return 'n'  # Noun
    elif tag.startswith('V'):
        return 'v'  # Verb
    elif tag.startswith('J'):
        return 'a'  # Adjective
    elif tag.startswith('R'):
        return 'r'  # Adverb
    else:
        return 'n'  # Default to noun if no match

# Function to lemmatize text with POS tagging
def lemmatize_text(text):
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Lemmatize each word with POS tagging
    lemmatized_words = [
        lemmatizer.lemmatize(word, get_pos_tag(word)) for word in words
    ]
    
    # Join words back into a sentence
    return ' '.join(lemmatized_words)

# Example DataFrame

# Apply the lemmatization function
labeled['Text'] = labeled['Text'].apply(lemmatize_text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/yolandapan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [81]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(labeled['Text'])
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,ab,abandon,abd,abdicate,ability,abit,able,abroad,absolute,absolutely,...,youre,yourhealthidaho,youtu,yr,zenni,zero,zip,zofram,zone,ǝs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.328537,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.135178,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.196500,0.0,0.0,0.0,0.0


In [82]:
tfidf_df.to_csv("cleaned_labeled.csv", index=False)

In [83]:
# Check feature matrix of vectorized corpus
tfidf_df.shape
# 1000 posts, 3412 unique features

(1000, 3416)

In [84]:
# Check feature names
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['ab', 'abandon', 'abd', ..., 'zofram', 'zone', 'ǝs'], dtype=object)

In [85]:
import numpy as np

In [86]:
# Order features by frequency
def get_topn_features(X, feature_names, topn=10):
    """
    Inputs:
        X: feature matrix
        feature_names: extracted features during vectorization
        topn: the number of most frequent features to return
    Outputs:
        topn most frequent features and their frequency
    """
    feature_ct = np.asarray(np.sum(X, axis=0)).reshape(-1)

    feature_freq = []
    
    for i in np.argsort(feature_ct)[::-1][:topn]:
        feature_freq.append({'feature':feature_names[i], 'frequency':feature_ct[i]})
    
    return pd.DataFrame(feature_freq)

get_topn_features(tfidf_df, feature_names, topn=10)

Unnamed: 0,feature,frequency
0,please,41.853787
1,insurance,39.863921
2,plan,31.729651
3,get,26.718189
4,post,22.598333
5,thank,22.466427
6,question,21.735177
7,state,21.608627
8,go,21.577287
9,reddit,21.326011


In [87]:
# Order features by tfidf score

# Get the sum of the TF-IDF scores for each word (across all documents)
tfidf_sum = X.sum(axis=0).A1  # Flatten the matrix to a 1D array

# Create a DataFrame to easily inspect the words and their total TF-IDF scores
df = pd.DataFrame({'Word': feature_names, 'TF-IDF Sum': tfidf_sum})

# Sort by the sum of the TF-IDF scores (descending order)
df_sorted_by_tfidf = df.sort_values(by='TF-IDF Sum', ascending=False)

print(df_sorted_by_tfidf)

            Word  TF-IDF Sum
2246      please   41.853787
1513   insurance   39.863921
2234        plan   31.729651
1235         get   26.718189
2286        post   22.598333
...          ...         ...
764   descendant    0.046968
1203    fulltime    0.046968
2519      relies    0.046968
55      adoption    0.046968
3361    withheld    0.046968

[3416 rows x 2 columns]


### Processing the remaining data

In [88]:
# Load in the data
unlabeled = pd.read_csv('filtered_comments.csv')
unlabeled

Unnamed: 0,ID,Text
0,2024-11-20_1,If your previous providers used epic software ...
1,2024-11-20_2,So here s the thing If you claim your fiancee...
2,2024-11-20_3,I really don t have much advice to add as I ve...
3,2024-11-20_4,Well they made around k year but my dad is...
4,2024-11-20_5,I signed up for them too without doing researc...
...,...,...
6013,2024-12-17_277,No there s no legal requirement for an unpaid...
6014,2024-12-17_278,You are very alone in that opinion which is wh...
6015,2024-12-17_279,The same as any other post obamacare health in...
6016,2024-12-17_280,Thank you for your submission u AnythingNext...


In [89]:
unlabeled.size
# 12,036 comments

12036

In [21]:
unlabeled.dtypes

ID      object
Text    object
dtype: object

In [22]:
# special characters have already been removed, need to handle remaining pieces of urls
handles = r'_\w+'
unlabeled['Text'] = unlabeled['Text'].str.replace(handles, '', regex=True)
one_letter_words = r'\b\w\b'
unlabeled['Text'] = unlabeled['Text'].str.replace(one_letter_words, '', regex=True)
words_of_one_letter = r'\b(\w)\1*\b'
unlabeled['Text'] = unlabeled['Text'].str.replace(words_of_one_letter, '', regex=True)
urls = r'\b(?:https?)\s\S*'  # Match http/https/www followed by non-space characters
unlabeled['Text'] = unlabeled['Text'].str.replace(urls, '', regex=True)
website = r'com'
unlabeled['Text'] = unlabeled['Text'].str.replace(website, '', regex=True)

In [23]:
unlabeled['Text'] = unlabeled['Text'].apply(replace_flipped_chars)

In [24]:
import unicodedata

def normalize_text(text):
    return ''.join([unicodedata.normalize('NFKD', char) for char in text])
unlabeled['Text'] = unlabeled['Text'].apply(normalize_text)

In [25]:
unlabeled['Text'] = unlabeled['Text'].apply(remove_stopwords)

In [None]:
unlabeled['Text'] = unlabeled['Text'].apply(lemmatize_text)

In [None]:
vect = TfidfVectorizer()
Y = vect.fit_transform(unlabeled['Text'])
tfidf_df2 = pd.DataFrame(Y.toarray(), columns=vect.get_feature_names_out())
tfidf_df2

Unnamed: 0,aad,aarp,ab,aba,abalone,abandon,abb,abc,abcbb,abd,...,zipcode,zirconium,znome,zodiac,zolpidem,zoo,zucchini,zuchuu,zumbaand,ǝs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
tfidf_df2.to_csv("cleaned_dataset.csv", index=False)

In [None]:
feature_names = vect.get_feature_names_out()
feature_names

array(['aad', 'aarp', 'ab', ..., 'zuchuu', 'zumbaand', 'ǝs'], dtype=object)

In [None]:
get_topn_features(tfidf_df2, feature_names, topn=10)

Unnamed: 0,feature,frequency
0,please,263.743131
1,insurance,223.342509
2,plan,178.73211
3,post,143.006348
4,question,142.123183
5,get,141.50761
6,state,135.64081
7,reddit,134.709868
8,healthinsurance,133.142612
9,solicitation,131.722852


## Preprocessing as a whole

In [90]:
labeled['Date'] = labeled['Date'].str.split('_').str[0]  # Keep YYYY-MM-DD
labeled = labeled.drop('ID', axis=1, errors='ignore')  # Drop ID if exists

unlabeled['Date'] = unlabeled['ID'].str.split('_').str[0]  # Keep YYYY-MM-DD
unlabeled = unlabeled.drop('ID', axis=1)


In [91]:
all_raw_texts = pd.concat([labeled, unlabeled])

In [92]:
all_raw_texts.head()

Unnamed: 0,Date,Text,final
0,2024-11-25,ABSOLUTELY SCAM doctor office right even show ...,-1.0
1,2024-11-23,special promise,-99.0
2,2024-11-22,notice Forward currently hire Linkedin make ze...,-99.0
3,2024-12-08,Yes gross taxable pretax investment low gross ...,-99.0
4,2024-12-17,Regardless sign bad pany could state require,-99.0


In [93]:
handles = r'_\w+'
all_raw_texts['Text'] = all_raw_texts['Text'].str.replace(handles, '', regex=True)
one_letter_words = r'\b\w\b'
all_raw_texts['Text'] = all_raw_texts['Text'].str.replace(one_letter_words, '', regex=True)
words_of_one_letter = r'\b(\w)\1*\b'
all_raw_texts['Text'] = all_raw_texts['Text'].str.replace(words_of_one_letter, '', regex=True)
urls = r'\b(?:https?)\s\S*'  # Match http/https/www followed by non-space characters
all_raw_texts['Text'] = all_raw_texts['Text'].str.replace(urls, '', regex=True)
website = r'com'
all_raw_texts['Text'] = all_raw_texts['Text'].str.replace(website, '', regex=True)

In [94]:
all_raw_texts['Text'] = all_raw_texts['Text'].apply(replace_flipped_chars)

In [95]:
all_raw_texts['Text'] = all_raw_texts['Text'].apply(remove_stopwords)

In [96]:
all_raw_texts['Text'] = all_raw_texts['Text'].apply(lemmatize_text)

In [97]:
vectorizer = TfidfVectorizer(min_df= 0.002,
                             max_df= 0.95,    
                             stop_words='english',
                             ngram_range=(1, 2))
X = vectorizer.fit_transform(all_raw_texts['Text'])
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,ability,able,absolutely,aca,aca marketplace,aca plan,aca pliant,aca subsidy,accept,accept medicaid,...,year ago,year old,year work,year year,yearly,yep,yes,york,young,zero
0,0.0,0.0,0.406282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
1,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
2,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.184591
3,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.441754,0.0,0.0,0.000000
4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7013,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
7014,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
7015,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
7016,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000


In [71]:
tfidf_df.to_csv("cleaned_total_v2.csv", index=False)

In [72]:
len(tfidf_df)

7018

In [98]:
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['ability', 'able', 'absolutely', ..., 'york', 'young', 'zero'],
      dtype=object)

In [100]:
# Order features by frequency
def get_topn_features(X, feature_names, topn=10):
    """
    Inputs:
        X: feature matrix
        feature_names: extracted features during vectorization
        topn: the number of most frequent features to return
    Outputs:
        topn most frequent features and their frequency
    """
    feature_ct = np.asarray(np.sum(X, axis=0)).reshape(-1)

    feature_freq = []
    
    for i in np.argsort(feature_ct)[::-1][:topn]:
        feature_freq.append({'feature':feature_names[i], 'frequency':feature_ct[i]})
    
    return pd.DataFrame(feature_freq)

get_topn_features(tfidf_df, feature_names, topn=10)

Unnamed: 0,feature,frequency
0,insurance,286.359295
1,plan,225.630043
2,pay,161.106092
3,state,158.277136
4,post,157.722957
5,question,151.148457
6,reddit,142.961872
7,healthinsurance,138.359111
8,thank,138.218248
9,solicitation,136.587912


In [101]:
# Order features by tfidf score

# Get the sum of the TF-IDF scores for each word (across all documents)
tfidf_sum = X.sum(axis=0).A1  # Flatten the matrix to a 1D array

# Create a DataFrame to easily inspect the words and their total TF-IDF scores
df = pd.DataFrame({'Word': feature_names, 'TF-IDF Sum': tfidf_sum})

# Sort by the sum of the TF-IDF scores (descending order)
df_sorted_by_tfidf = df.sort_values(by='TF-IDF Sum', ascending=False)

print(df_sorted_by_tfidf)

                   Word  TF-IDF Sum
671           insurance  286.359295
1043               plan  225.630043
994                 pay  161.106092
1379              state  158.277136
1095               post  157.722957
...                 ...         ...
355      different plan    2.868763
834        medical plan    2.815636
685   insurance network    2.719281
1404    submission read    2.520426
1568          year work    2.453891

[1576 rows x 2 columns]
