In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import time

import warnings
# Install a blanket "ignore" filter so warnings raised after this point are
# not shown in cell output.
warnings.filterwarnings(action='ignore')

# Read the CSV into a DataFrame and report its (rows, columns) shape.
corpus_csv = 'data/cleaned_mhc.csv'
cleaned_corpus = pd.read_csv(filepath_or_buffer=corpus_csv)

print("Shape of Cleaned Corpus", cleaned_corpus.shape)

Shape of Cleaned Corpus (23240, 2)


In [13]:
# TF-IDF walkthrough: fit a vectorizer on the whole corpus, then print the
# non-zero TF-IDF weights for a handful of short documents from each class.

# Settings for this cell (previously inline magic numbers).
TFIDF_MAX_FEATURES = 3500   # vocabulary cap handed to TfidfVectorizer
SAMPLE_MIN_TOKENS = 15      # inclusive lower bound on whitespace tokens
SAMPLE_MAX_TOKENS = 25      # inclusive upper bound on whitespace tokens
SAMPLES_PER_CLASS = 5       # rows to showcase per label
SAMPLE_SEED = 42            # random_state used when drawing the showcase rows


def _token_length_mask(frame, min_tokens, max_tokens):
    """Return a boolean Series flagging rows of ``frame`` whose 'text' column
    splits (on whitespace) into between ``min_tokens`` and ``max_tokens``
    tokens, both bounds inclusive."""
    return frame['text'].str.split().str.len().between(min_tokens, max_tokens)


def _sample_rows(frame, mask, n, seed):
    """Draw up to ``n`` rows from ``frame[mask]`` using ``seed``.

    ``DataFrame.sample`` raises ``ValueError`` when asked for more rows than
    are available (without replacement), so the request is capped at the
    number of eligible rows. When at least ``n`` rows are eligible the result
    is the same as ``frame[mask].sample(n=n, random_state=seed)``.
    """
    eligible = frame[mask]
    return eligible.sample(n=min(n, len(eligible)), random_state=seed)


def _print_tfidf_report(label, text, vector_array, feature_names):
    """Print one document, its non-zero TF-IDF weights (highest first), and
    zero / non-zero counts for its dense TF-IDF vector.

    Parameters
    ----------
    label : value of the document's 'label' column
    text : the document string
    vector_array : 1-D dense array of TF-IDF weights for ``text``
    feature_names : array of vocabulary tokens aligned with ``vector_array``
    """
    print(f"Class {label} sample:")
    print(f"{text}\n")

    print("TF-IDF values:")
    nonzero_mask = vector_array != 0
    nonzero_values = pd.DataFrame({
        'Token': feature_names[nonzero_mask],
        'TF-IDF': vector_array[nonzero_mask]
    })
    nonzero_values = nonzero_values.sort_values('TF-IDF', ascending=False)
    print(nonzero_values.to_string(index=False))

    vector_length = len(feature_names)
    n_nonzero = np.count_nonzero(vector_array)
    n_zero = vector_length - n_nonzero
    print("\nVector statistics:")
    print(f"Non-zero values: {n_nonzero}")
    print(f"Zero values: {n_zero}")
    print(f"Total vector length: {vector_length}")
    print("\n" + "="*50 + "\n")


vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

# Fit TF-IDF vectorizer on every document; X is the resulting document-term matrix.
X = vectorizer.fit_transform(cleaned_corpus['text'])

# Get samples from each class
class_0 = cleaned_corpus[cleaned_corpus['label'] == 0]
class_1 = cleaned_corpus[cleaned_corpus['label'] == 1]

# Filter samples by token length
mask_0 = _token_length_mask(class_0, SAMPLE_MIN_TOKENS, SAMPLE_MAX_TOKENS)
mask_1 = _token_length_mask(class_1, SAMPLE_MIN_TOKENS, SAMPLE_MAX_TOKENS)

samples_0 = _sample_rows(class_0, mask_0, SAMPLES_PER_CLASS, SAMPLE_SEED)
samples_1 = _sample_rows(class_1, mask_1, SAMPLES_PER_CLASS, SAMPLE_SEED)

samples = pd.concat([samples_0, samples_1])
feature_names = vectorizer.get_feature_names_out()
vector_length = len(feature_names)

# Vectorize each showcased document with the already-fitted vocabulary and
# print its report; the row index is not needed, so iterate the columns directly.
for sample_label, sample_text in zip(samples['label'], samples['text']):
    sample_vector = vectorizer.transform([sample_text]).toarray()[0]
    _print_tfidf_report(sample_label, sample_text, sample_vector, feature_names)

Class 0 sample:
recently ive eating stomach problem im getting really bad stomach pain im hungry appetite throw little food eat think eating disorder something im sure

TF-IDF values:
    Token   TF-IDF
  stomach 0.514720
   eating 0.434127
   hungry 0.286663
    throw 0.235316
 disorder 0.226459
       im 0.225463
     food 0.214009
      eat 0.211527
 recently 0.184122
  problem 0.158726
     pain 0.154161
     sure 0.152481
   little 0.150323
  getting 0.141145
      bad 0.135555
something 0.126122
      ive 0.106503
    think 0.105709
   really 0.104681

Vector statistics:
Non-zero values: 19
Zero values: 3481
Total vector length: 3500


Class 0 sample:
come rteenagers let people farm karma make post making fun karma get deleted karma ridiculous let meme ugh

TF-IDF values:
     Token   TF-IDF
     karma 0.730628
       let 0.266933
       ugh 0.264021
rteenagers 0.249057
ridiculous 0.238737
   deleted 0.238392
      meme 0.215462
       fun 0.159209
    making 0.144351
      post 