In [1]:
import pandas as pd
import re
import emoji
import contractions
import json
import nltk
from nltk.corpus import words

nltk.download('words')
standard_words = set(words.words())

# Load slang dictionary
with open('../resource/slang.json', 'r', encoding='utf-8') as f:
    slang_dict = json.load(f)

def preprocess_twitter_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Drop unnecessary columns
    drop_cols = ['Unnamed: 0', 'post_id', 'post_created', 'user_id', 
                 'followers', 'friends', 'favourites', 'statuses', 'retweets']
    df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors='ignore')

    # Drop duplicates
    df = df.drop_duplicates(keep=False)

    # Step 1: Remove URLs and mark presence
    def remove_urls(text):
        pattern = r'http\S+|www\S+'
        return re.sub(pattern, '', text), int(bool(re.search(pattern, text)))
    df[['post_text', 'URLs']] = df['post_text'].apply(lambda x: pd.Series(remove_urls(x)))

    # Step 2: Remove Mentions and mark presence
    def remove_mentions(text):
        pattern = r'@\w+'
        return re.sub(pattern, '', text), int(bool(re.search(pattern, text)))
    df[['post_text', 'Mentions']] = df['post_text'].apply(lambda x: pd.Series(remove_mentions(x)))

    # Step 3: Extract Hashtags and remove from text
    def extract_hashtags(text):
        return re.findall(r'#\w+', text)
    df['Hashtags'] = df['post_text'].apply(extract_hashtags)
    df['post_text'] = df['post_text'].apply(lambda x: re.sub(r'#\w+', '', x))

    # Step 4: Convert emojis
    def convert_emojis(text):
        text = emoji.demojize(text)
        text = re.sub(r':([a-zA-Z_]+):', r' \1 ', text)
        return re.sub(r'\s+', ' ', text.replace('_', ' ')).strip()
    df['post_text'] = df['post_text'].apply(convert_emojis)

    # Step 5: Expand contractions
    df['post_text'] = df['post_text'].apply(contractions.fix)

    # Step 6: Remove special characters and lowercase
    df['post_text'] = df['post_text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

    # Step 7: Normalize slang
    def normalize_slang(text):
        return ' '.join([slang_dict.get(w, w) if w.lower() not in standard_words else w 
                         for w in text.split()])
    df['post_text'] = df['post_text'].apply(normalize_slang)

    # Step 8: One-hot encode hashtags
    all_hashtags = list(set(h for tags in df['Hashtags'] for h in tags))
    for tag in all_hashtags:
        clean_tag = tag[1:]  # remove '#'
        df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))

    # Drop Hashtags column
    df = df.drop(columns=['Hashtags'])

    # Ensure label is last column if it exists
    if 'label' in df.columns:
        label_col = df.pop('label')
        df['label'] = label_col

    return df


[nltk_data] Downloading package words to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [24]:
df = pd.read_csv('../resource/Mental-Health-Twitter.csv')
df_cleaned = preprocess_twitter_df(df)
# df_cleaned.to_csv('../resource/Mental-Health-Twitter-Preprocessed.csv', index=False)
df_cleaned

  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df[f'hashtag_{clean_tag}'] = df['Hashtags'].apply(lambda tags: int(tag in tags))
  df

Unnamed: 0,post_text,URLs,Mentions,hashtag_XA,hashtag_larrystylinson,hashtag_bookclubs,hashtag_lonely,hashtag_almostfriday,hashtag_WorldCatDay,hashtag_CBT,...,hashtag_5SOSonElvis,hashtag_therapists,hashtag_CoronationStreet,hashtag_shakeology,hashtag_MIA,hashtag_Hap,hashtag_coffe,hashtag_DejaVu,hashtag_encouraging,label
0,it is just over 2 years since i was diagnosed ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,it is sunday i need a break so i am planning t...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,awake but tired i need to sleep but my brain h...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,retweet real time bears make perfect gifts and...,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,it is hard to say whether packing lists are ma...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,a day without sunshine is like night,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,borens laws 1 when in charge ponder 2 when in ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,the flow chart is a most thoroughly oversold p...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19998,ships are safe in harbor but they were never m...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Ensure required resources are available
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Step 1: Tokenize
    df['tokens'] = df['post_text'].apply(lambda x: word_tokenize(str(x).lower()))

    # Step 2: Remove stopwords
    # df['processed_tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

    # Step 3: Lemmatize with POS tagging
    def get_wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def lemmatize_tokens(tokens):
        tagged = pos_tag(tokens)
        return [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged]

    df['processed_tokens'] = df['processed_tokens'].apply(lemmatize_tokens)
    
    # Drop columns 'post_text' and 'tokens'
    df = df.drop(['post_text', 'tokens'], axis=1)
    # print(df)

    # Reorder label to last column if exists
    if 'label' in df.columns:
        label = df.pop('label')
        df['label'] = label

    return df


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
df_tokenized = tokenize_and_lemmatize(df_cleaned)
df_tokenized.head()

Unnamed: 0,URLs,Mentions,hashtag_XA,hashtag_larrystylinson,hashtag_bookclubs,hashtag_lonely,hashtag_almostfriday,hashtag_WorldCatDay,hashtag_CBT,hashtag_mentally,...,hashtag_therapists,hashtag_CoronationStreet,hashtag_shakeology,hashtag_MIA,hashtag_Hap,hashtag_coffe,hashtag_DejaVu,hashtag_encouraging,processed_tokens,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[2, year, since, diagnose, today, take, moment...",1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[sunday, need, break, plan, spend, little, tim...",1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[awake, tire, need, sleep, brain, idea]",1
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[retweet, real, time, bear, make, perfect, gif...",1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[hard, say, whether, pack, list, make, life, e...",1


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def train_model(df_tokenized):
    df = df_tokenized.copy()

    # 1. Prepare features and target
    X = df['processed_tokens'].apply(lambda x: ' '.join(eval(x) if isinstance(x, str) else x))
    y = df['label']

    # 2. Split into train/val/test (80/10/10 split)
    df_train, df_temp = train_test_split(df, test_size=0.2, stratify=y, random_state=42)
    df_val, df_test = train_test_split(df_temp, test_size=0.5, stratify=df_temp['label'], random_state=42)

    X_train = df_train['processed_tokens'].apply(lambda x: ' '.join(eval(x) if isinstance(x, str) else x))
    X_val = df_val['processed_tokens'].apply(lambda x: ' '.join(eval(x) if isinstance(x, str) else x))
    X_test = df_test['processed_tokens'].apply(lambda x: ' '.join(eval(x) if isinstance(x, str) else x))

    y_train = df_train['label']
    y_val = df_val['label']
    y_test = df_test['label']

    # 3. TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)

    # 4. Train the model (no numeric features)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_tfidf, y_train)

    # 5. Validation report
    print("Validation Set Evaluation:")
    y_val_pred = model.predict(X_val_tfidf)
    print(classification_report(y_val, y_val_pred))

    # 6. Test report
    print("Test Set Evaluation:")
    y_test_pred = model.predict(X_test_tfidf)
    print(classification_report(y_test, y_test_pred))

    return model, vectorizer


In [11]:
df_tokenized

Unnamed: 0,URLs,Mentions,hashtag_XA,hashtag_larrystylinson,hashtag_bookclubs,hashtag_lonely,hashtag_almostfriday,hashtag_WorldCatDay,hashtag_CBT,hashtag_mentally,...,hashtag_therapists,hashtag_CoronationStreet,hashtag_shakeology,hashtag_MIA,hashtag_Hap,hashtag_coffe,hashtag_DejaVu,hashtag_encouraging,processed_tokens,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[2, year, since, diagnose, today, take, moment...",1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[sunday, need, break, plan, spend, little, tim...",1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[awake, tire, need, sleep, brain, idea]",1
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[retweet, real, time, bear, make, perfect, gif...",1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[hard, say, whether, pack, list, make, life, e...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[day, without, sunshine, like, night]",0
19996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[borens, law, 1, charge, ponder, 2, trouble, d...",0
19997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[flow, chart, thoroughly, oversold, piece, pro...",0
19998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[ship, safe, harbor, never, mean, stay]",0


In [12]:
model, vectorizer = train_model(df_tokenized)


Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.74      0.69      0.72       945
           1       0.72      0.77      0.74       985

    accuracy                           0.73      1930
   macro avg       0.73      0.73      0.73      1930
weighted avg       0.73      0.73      0.73      1930

Test Set Evaluation:
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       946
           1       0.70      0.74      0.72       985

    accuracy                           0.71      1931
   macro avg       0.71      0.71      0.71      1931
weighted avg       0.71      0.71      0.71      1931



In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

# Make sure these are downloaded once
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
# stop_words = set(stopwords.words('english'))

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_post(text):
    tokens = word_tokenize(text.lower())
    # tokens = [word for word in tokens if word not in stop_words]
    pos_tags = pos_tag(tokens)
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return ' '.join(lemmatized)

def predict_post(text, model, vectorizer):
    processed_text = preprocess_post(text)
    tfidf_features = vectorizer.transform([processed_text])
    prediction = model.predict(tfidf_features)[0]
    return prediction


In [70]:
text = "I love you"
label = predict_post(text, model, vectorizer)
print("Predicted label:", label)


Predicted label: 1
