In [3]:
import pandas as pd
import re
import emoji
import contractions
import json
import nltk
from nltk.corpus import words

# Prepare external resources
# Fetch the NLTK word corpus quietly, then keep its entries in a set so the
# `in` / `not in` checks done during slang normalisation are cheap.
nltk.download('words', quiet=True)
standard_words = {*words.words()}

# Load slang dictionary
# The JSON file is read as UTF-8 text and parsed into `slang_dict`.
with open('../resource/slang.json', 'r', encoding='utf-8') as f:
    slang_dict = json.loads(f.read())

def preprocess_text_to_df_with_ohe(text: str) -> pd.DataFrame:
    """Clean one raw post and return it as a single-row DataFrame.

    The steps run in this order: strip URLs and @mentions (remembering whether
    any were present), pull out hashtags, turn emojis into words, expand
    contractions, drop punctuation, lowercase, and finally replace slang for
    every word that is not in ``standard_words``.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    pandas.DataFrame
        One row with ``post_text`` (the cleaned text), ``URLs`` and
        ``Mentions`` (1 if the raw text contained one, else 0), plus one
        column per distinct hashtag (lowercased, leading ``#`` kept) set to 1.
    """
    # Step 1: Remove URLs
    url_pattern = r'http\S+|www\S+'
    has_url = int(bool(re.search(url_pattern, text)))
    text = re.sub(url_pattern, '', text)

    # Step 2: Remove Mentions
    mention_pattern = r'@\w+'
    has_mention = int(bool(re.search(mention_pattern, text)))
    text = re.sub(mention_pattern, '', text)

    # Step 3: Extract Hashtags
    hashtag_pattern = r'#\w+'
    hashtags = re.findall(hashtag_pattern, text)
    text = re.sub(hashtag_pattern, '', text)

    # Step 4: Convert Emojis
    # Ask demojize for space delimiters so that every emoji name is separated
    # from its neighbours. The ':name:' regex below only matches letters and
    # underscores, so names such as '1st_place_medal' used to stay glued to
    # the adjacent word once punctuation was stripped.
    text = emoji.demojize(text, delimiters=(' ', ' '), language='en')
    # Literal ':shortcode:' text typed in the post gets the same spacing.
    text = re.sub(r':([a-zA-Z_]+):', r' \1 ', text)
    text = text.replace('_', ' ')
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 5: Expand Contractions (before punctuation removal, which would
    # otherwise delete the apostrophes)
    text = contractions.fix(text)

    # Step 6: Remove special characters
    text = re.sub(r'[^\w\s]', '', text)

    # Step 7: Normalize case
    text = text.lower().strip()

    # Step 8: Normalize slang (only for words that are not standard words)
    words_list = text.split()
    normalized = [slang_dict.get(w, w) if w not in standard_words else w for w in words_list]
    text = ' '.join(normalized)

    # Step 9: One-hot encode hashtags (duplicates collapse to one column)
    hashtag_ohe = {tag.lower(): 1 for tag in hashtags}

    # Base columns first, then the hashtag indicator columns
    final_data = {
        'post_text': text,
        'URLs': has_url,
        'Mentions': has_mention,
        **hashtag_ohe,
    }
    return pd.DataFrame([final_data])


In [4]:
# Try the preprocessing on one sample post that contains a hashtag and a URL.
text = "It is Sunday I need a break so I am planning to go out. #A14 http://example.com"
df = preprocess_text_to_df_with_ohe(text)
# Printed below: one row with the cleaned text, URLs=1, Mentions=0 and a '#a14' column.
print(df)


                                           post_text  URLs  Mentions  #a14
0  it is sunday i need a break so i am planning t...     1         0     1


In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Ensure necessary resources are downloaded.
# Recent NLTK releases look up 'punkt_tab' (word_tokenize) and
# 'averaged_perceptron_tagger_eng' (pos_tag); the legacy 'punkt' and
# 'averaged_perceptron_tagger' ids are kept so older installs still work.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'stopwords',
):
    nltk.download(resource, quiet=True)

# Shared by lemmatize_tokens() and remove_stopword() respectively.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# POS tag converter
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching ``wordnet`` POS constant.

    The first letter decides: 'J' -> ADJ, 'V' -> VERB, 'N' -> NOUN,
    'R' -> ADV. Any other tag falls back to NOUN.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    # No known prefix matched: default to noun.
    return wordnet.NOUN

def remove_stopword(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def lemmatize_tokens(tokens):
    """POS-tag the tokens and lemmatize each one with its mapped WordNet POS.

    Returns a list of lemmas in the same order as the tagged tokens.
    """
    lemmas = []
    for token, treebank_tag in pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

def tokenize_and_process_text(df):
    """Replace the ``post_text`` column with a ``processed_tokens`` column.

    Each post is lowercased and tokenized, stopwords are removed and the
    remaining tokens are lemmatized. The input frame is left untouched; the
    returned frame keeps all other columns and has no ``post_text`` or
    ``tokens`` column.
    """
    # Three passes over the text column: tokenize -> drop stopwords -> lemmatize
    lemma_lists = (
        df['post_text']
        .apply(lambda post: word_tokenize(post.lower()))
        .apply(remove_stopword)
        .apply(lemmatize_tokens)
    )

    # drop() returns a new frame, so the caller's DataFrame is not modified
    return (
        df
        .drop(columns=['post_text', 'tokens'], errors='ignore')
        .assign(processed_tokens=lemma_lists)
    )


In [17]:
# Stage 1: clean the raw post with preprocess_text_to_df_with_ohe(), giving a
# one-row frame (cleaned text, URL/mention flags, hashtag indicator columns).
df_preprocessed = preprocess_text_to_df_with_ohe("It is Sunday I need a break so I am planning to go out. #A14 http://example.com")

# Stage 2: tokenize, drop stopwords and lemmatize; 'post_text' is replaced by
# a 'processed_tokens' column.
df_final = tokenize_and_process_text(df_preprocessed)

print(df_final)


   URLs  Mentions  #a14                 processed_tokens
0     1         0     1  [sunday, need, break, plan, go]
