In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import time

import warnings
warnings.filterwarnings('ignore')

# Read the cleaned corpus CSV into a DataFrame, then report its (rows, columns) shape
# so the reader can sanity-check the load before any vectorisation happens.
cleaned_corpus = pd.read_csv(filepath_or_buffer='data/cleaned_mhc.csv')
print("Shape of Cleaned Corpus", cleaned_corpus.shape)

Shape of Cleaned Corpus (23240, 2)


In [2]:
# Fit a TF-IDF model over the whole corpus; the vocabulary is capped at 3500 features.
vectorizer = TfidfVectorizer(max_features=3500)
X = vectorizer.fit_transform(cleaned_corpus['text'])

# Partition the corpus by label.
class_0 = cleaned_corpus.loc[cleaned_corpus['label'].eq(0)]
class_1 = cleaned_corpus.loc[cleaned_corpus['label'].eq(1)]

# Keep only documents with 15-25 whitespace-separated tokens (both bounds inclusive),
# so the printed examples stay short enough to read.
mask_0 = class_0['text'].str.split().str.len().between(15, 25)
mask_1 = class_1['text'].str.split().str.len().between(15, 25)

# Draw five documents per class with a fixed seed so the same examples appear on re-run.
samples_0 = class_0.loc[mask_0].sample(n=5, random_state=42)
samples_1 = class_1.loc[mask_1].sample(n=5, random_state=42)
samples = pd.concat([samples_0, samples_1])

feature_names = vectorizer.get_feature_names_out()
vector_length = len(feature_names)

# For every sampled document: show the text, its non-zero TF-IDF weights
# (heaviest first) and how sparse the resulting vector is.
for label, text in zip(samples['label'], samples['text']):
    print(f"Class {label} sample:")
    print(f"{text}\n")

    print("TF-IDF values:")
    dense_row = vectorizer.transform([text]).toarray()[0]
    is_weighted = dense_row != 0
    weights = pd.DataFrame({
        'Token': feature_names[is_weighted],
        'TF-IDF': dense_row[is_weighted]
    }).sort_values('TF-IDF', ascending=False)
    print(weights.to_string(index=False))

    weighted_count = np.count_nonzero(is_weighted)
    print(f"\nVector statistics:")
    print(f"Non-zero values: {weighted_count}")
    print(f"Zero values: {vector_length - weighted_count}")
    print(f"Total vector length: {vector_length}")
    print("\n" + "="*50 + "\n")

Class 0 sample:
recently ive eating stomach problem im getting really bad stomach pain im hungry appetite throw little food eat think eating disorder something im sure

TF-IDF values:
    Token   TF-IDF
  stomach 0.514720
   eating 0.434127
   hungry 0.286663
    throw 0.235316
 disorder 0.226459
       im 0.225463
     food 0.214009
      eat 0.211527
 recently 0.184122
  problem 0.158726
     pain 0.154161
     sure 0.152481
   little 0.150323
  getting 0.141145
      bad 0.135555
something 0.126122
      ive 0.106503
    think 0.105709
   really 0.104681

Vector statistics:
Non-zero values: 19
Zero values: 3481
Total vector length: 3500


Class 0 sample:
come rteenagers let people farm karma make post making fun karma get deleted karma ridiculous let meme ugh

TF-IDF values:
     Token   TF-IDF
     karma 0.730628
       let 0.266933
       ugh 0.264021
rteenagers 0.249057
ridiculous 0.238737
   deleted 0.238392
      meme 0.215462
       fun 0.159209
    making 0.144351
      post 

In [8]:
# Fit a fresh TF-IDF model (vocabulary capped at 3500 features) on the corpus text.
tfidf = TfidfVectorizer(max_features=3500)
tfidf_matrix = tfidf.fit_transform(cleaned_corpus['text'])

# Collapse the document axis: one aggregate TF-IDF weight per vocabulary term.
tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

# Pair every term with its aggregate weight; the row index is the term's column
# position in the TF-IDF matrix.
terms = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame({'term': terms, 'score': tfidf_scores})

# The 30 heaviest and the 30 lightest terms by summed weight.
top_30_highest = tfidf_df.nlargest(n=30, columns='score')
top_30_lowest = tfidf_df.nsmallest(n=30, columns='score')

for ranking in (top_30_highest, top_30_lowest):
    print(ranking)

         term        score
1564       im  1379.222785
1812     like   901.153227
3355     want   872.305628
1167     feel   769.756417
1740     know   724.344765
1806     life   677.561486
1318      get   634.966980
2135      one   574.847310
450      cant   566.188210
2219   people   565.507400
3152     time   558.566592
1676      ive   555.463606
1270   friend   542.760670
908      dont   539.305194
2470   really   517.343020
3456    would   516.749624
1056     even   503.161687
3478     year   490.136575
767       day   489.172177
3119    think   479.044374
3118    thing   455.077405
1332       go   451.763167
1336    going   437.458096
1891     make   432.948721
1462     help   430.563804
2025     much   394.223357
2071    never   393.593957
1341     good   386.576097
2059     need   366.305285
1281  fucking   363.305309
            term     score
1883     maddock  0.708841
1400         haa  0.909182
542        clack  0.999677
113        ameno  1.000000
390        brice  1.016318
1

In [23]:
# Terms whose nearest neighbours we want to inspect.
highlight_terms = ['suicidal', 'depression', 'suicide', 'kill', 'myself', 'die',
                   'died', 'pain', 'sad', 'help', 'sorry', 'anxiety', 'therapy',
                   'suffering', 'killing', 'pill',
                   'movie', 'film', 'character', 'story', 'actor', 'performance', 'show',
                   'plot', 'acting']

# Build the token -> position lookup once instead of converting `tokens` to a list
# and scanning it linearly for every highlighted term. `setdefault` keeps the first
# position of a token, matching what `list.index` would return.
# NOTE(review): assumes `tokens` is aligned with the rows/columns of
# `similarity_matrix` (both come from an earlier cell) - confirm.
token_positions = {}
for position, token in enumerate(tokens):
    token_positions.setdefault(token, position)

# Maps each highlighted term to its 5 most similar tokens (or a "not found" marker).
highlight_similar_tokens = {}

for term in highlight_terms:
    idx = token_positions.get(term)
    if idx is None:
        highlight_similar_tokens[term] = ["Token not found in vocabulary"]
        continue

    # Similarity of this term to every token, ranked from most to least similar.
    term_similarities = similarity_matrix[idx]
    similar_indices = term_similarities.argsort()[::-1]

    # Exclude the term itself by position rather than assuming it is ranked first:
    # another token can tie with it at similarity 1.0, in which case blindly
    # skipping rank 0 would drop the neighbour and keep the term itself.
    top_5_similar = [tokens[i] for i in similar_indices if i != idx][:5]

    highlight_similar_tokens[term] = top_5_similar

# Display the results
for term, similar_tokens in highlight_similar_tokens.items():
    print(f"Term: {term} -> Top 5 co-occurence tokens: {similar_tokens}")

Term: suicidal -> Top 5 co-occurence tokens: ['thought', 'know', 'im', 'ideation', 'feel']
Term: depression -> Top 5 co-occurence tokens: ['anxiety', 'feel', 'year', 'life', 'im']
Term: suicide -> Top 5 co-occurence tokens: ['cannot', 'hotline', 'commit', 'life', 'thought']
Term: kill -> Top 5 co-occurence tokens: ['want', 'im', 'going', 'life', 'know']
Term: myself -> Top 5 co-occurence tokens: ['kill', 'want', 'feel', 'pit', 'badly']
Term: die -> Top 5 co-occurence tokens: ['want', 'im', 'life', 'live', 'know']
Term: died -> Top 5 co-occurence tokens: ['dog', 'ago', 'year', 'life', 'dad']
Term: pain -> Top 5 co-occurence tokens: ['want', 'life', 'im', 'cant', 'much']
Term: sad -> Top 5 co-occurence tokens: ['feel', 'im', 'like', 'want', 'know']
Term: help -> Top 5 co-occurence tokens: ['need', 'please', 'know', 'im', 'get']
Term: sorry -> Top 5 co-occurence tokens: ['im', 'know', 'like', 'ive', 'time']
Term: anxiety -> Top 5 co-occurence tokens: ['depression', 'social', 'im', 'ive', 

In [6]:
import numpy as np
import random

from sklearn.metrics.pairwise import cosine_similarity

# Seed the stdlib RNG. Note that pandas' `sample` does not consult it; the draws
# below are made reproducible by `random_state=42`.
random.seed(42)

# Keep only documents with 15-25 whitespace-separated tokens (both bounds inclusive)
filtered_docs = cleaned_corpus[
    cleaned_corpus['text'].apply(lambda x: 15 <= len(x.split()) <= 25)
]

# Separate the filtered documents by class
class_0_docs = filtered_docs[filtered_docs['label'] == 0]
class_1_docs = filtered_docs[filtered_docs['label'] == 1]

# Randomly select 3 documents from each class
sample_class_0 = class_0_docs.sample(3, random_state=42)
sample_class_1 = class_1_docs.sample(3, random_state=42)

# Combine the samples
sampled_docs = pd.concat([sample_class_0, sample_class_1])

# Transform the text of all documents using the same TF-IDF vectorizer
X_all = vectorizer.transform(cleaned_corpus['text'])

# Row positions of the sampled documents inside `cleaned_corpus` / `X_all`.
# Translating index labels to positions explicitly keeps the lookup correct even
# if the frame's index is not the default 0..n-1 range.
sample_positions = cleaned_corpus.index.get_indexer(sampled_docs.index)

# Compare only the sampled documents against the corpus. The result has one row per
# sample and one column per document; the full all-vs-all matrix
# (n_docs x n_docs dense floats, several GB for this corpus) is never built.
sampled_similarities = cosine_similarity(X_all[sample_positions], X_all)

# Maps "Label <label>: <text>" of each sampled document to its 3 nearest documents.
similar_docs = {}

for similarities, doc_position, (_, row) in zip(
    sampled_similarities, sample_positions, sampled_docs.iterrows()
):
    # Rank every document from most to least similar to the current sample.
    ranked = similarities.argsort()[::-1]

    # Drop the sample itself by position (an exact duplicate elsewhere in the corpus
    # can tie at similarity 1.0, so it is not guaranteed to be ranked first).
    similar_indices = ranked[ranked != doc_position][:3]

    # Store the results in the dictionary, including the label
    similar_docs[f"Label {row['label']}: {row['text']}"] = [
        f"Label {cleaned_corpus.iloc[i]['label']}: {cleaned_corpus.iloc[i]['text']}"
        for i in similar_indices
    ]

# Display the results
for doc, similar in similar_docs.items():
    print(f"Sampled Document: {doc}")
    print("Top 3 Similar Documents:")
    for i, sim_doc in enumerate(similar, start=1):
        print(f"{i}. {sim_doc}")
    print("\n" + "-"*50 + "\n")

Sampled Document: Label 0: recently ive eating stomach problem im getting really bad stomach pain im hungry appetite throw little food eat think eating disorder something im sure
Top 3 Similar Documents:
1. Label 0: way im eating healthy im overweight want loose weight healthy able athletic thing stuff im overweight think whatever im eating still skinny people eat though might unknowingly eating little generally eat day breakfast small lunch maybe half cup leftover previous supper black tea honey let eat something unhealthy like piece chocolate supper eat little less regular serving whatever mom make ive week working time week havent expecting bad health effect dont think stomach lot thats probably normal someone calorie deficit maybe isnt idk need outsider opinion
2. Label 0: dont want eat im reason im never bothered eating feel like im gaining weight never bothered motivation go get proper food sometimes im hungry eat cookie thats dont eat breakfast sometimes eat lunch sometimes dinn

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the cleaned corpus
cleaned_corpus = pd.read_csv('data/cleaned_mhc.csv')

# Initialize the TF-IDF vectorizer with a maximum of 3500 features
vectorizer = TfidfVectorizer(max_features=3500)

# Fit and transform the text data to create the TF-IDF matrix (documents x terms)
X_tfidf = vectorizer.fit_transform(cleaned_corpus['text'])

# Apply Truncated SVD for LSA
n_components = 100  # You can adjust this based on your data
# TruncatedSVD's default solver is randomized; pin the seed so the decomposition
# (and anything derived from X_lsa) is identical on every Restart & Run All.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit the SVD model and transform the TF-IDF matrix
X_lsa = svd.fit_transform(X_tfidf)

# Compute cosine similarity between terms: each term is represented by its column
# of the TF-IDF matrix, hence the transpose.
# NOTE(review): this uses the raw TF-IDF columns, not the LSA space fitted above -
# `svd` / `X_lsa` are not used by anything in this cell. If LSA term similarity was
# intended, the term vectors would come from the SVD instead; confirm the intent.
cosine_similarities_terms = cosine_similarity(X_tfidf.T)

# Highlighted terms to find similar terms for
highlight_terms = ['suicidal', 'depression', 'suicide', 'kill', 'myself', 'die',
                   'died', 'pain', 'sad', 'help', 'sorry', 'anxiety', 'therapy',
                   'suffering', 'killing', 'pill',
                   'movie', 'film', 'character', 'story', 'actor', 'performance', 'show',
                   'plot', 'acting']

# Function to display similar terms for highlighted terms
def display_highlighted_terms(highlight_terms, top_n=5, feature_names=None, similarities=None):
    """Print the `top_n` most similar vocabulary terms for each highlighted term.

    Parameters
    ----------
    highlight_terms : iterable of str
        Terms to look up. A term missing from the vocabulary produces a
        "not found" line instead of a ranking.
    top_n : int, default 5
        Number of neighbours to print per term. Values <= 0 print no neighbours.
    feature_names : sequence of str, optional
        Vocabulary, aligned with the rows/columns of `similarities`. Defaults to
        `vectorizer.get_feature_names_out()` from the notebook namespace.
    similarities : 2-D array-like, optional
        Term-by-term similarity matrix. Defaults to the notebook-level
        `cosine_similarities_terms`.

    Returns
    -------
    None
        Results are written to stdout.
    """
    if feature_names is None:
        feature_names = vectorizer.get_feature_names_out()
    if similarities is None:
        similarities = cosine_similarities_terms

    # Build the term -> position lookup once; previously the vocabulary array was
    # regenerated and linearly scanned for every term and every printed neighbour.
    # `setdefault` keeps the first position, matching `list.index`.
    term_positions = {}
    for position, name in enumerate(feature_names):
        term_positions.setdefault(name, position)

    limit = max(top_n, 0)

    for term in highlight_terms:
        term_index = term_positions.get(term)
        if term_index is None:
            print(f"Term '{term}' not found in the TF-IDF features.")
            continue

        print(f"\nSimilar terms to '{term}':")
        scores = np.asarray(similarities[term_index]).ravel()
        ranked = np.argsort(scores)[::-1]
        # Remove the term itself by position: a term tied with another at
        # similarity 1.0 is not guaranteed to be ranked first, so skipping
        # "rank 0" could print the term as its own neighbour.
        neighbours = ranked[ranked != term_index][:limit]
        for i in neighbours:
            print(f"Term: {feature_names[i]}, Similarity Score: {scores[i]:.4f}")

# Print the 5 highest-similarity terms for every entry in `highlight_terms`;
# entries that are not in the TF-IDF vocabulary are reported as not found.
display_highlighted_terms(highlight_terms, top_n=5)



Similar terms to 'suicidal':
Term: thought, Similarity Score: 0.3086
Term: know, Similarity Score: 0.1934
Term: im, Similarity Score: 0.1856
Term: ideation, Similarity Score: 0.1846
Term: feel, Similarity Score: 0.1770

Similar terms to 'depression':
Term: anxiety, Similarity Score: 0.2359
Term: feel, Similarity Score: 0.2037
Term: year, Similarity Score: 0.1991
Term: life, Similarity Score: 0.1908
Term: im, Similarity Score: 0.1825

Similar terms to 'suicide':
Term: cannot, Similarity Score: 0.1240
Term: hotline, Similarity Score: 0.0874
Term: commit, Similarity Score: 0.0751
Term: life, Similarity Score: 0.0658
Term: thought, Similarity Score: 0.0654

Similar terms to 'kill':
Term: want, Similarity Score: 0.2724
Term: im, Similarity Score: 0.2333
Term: going, Similarity Score: 0.2116
Term: life, Similarity Score: 0.2023
Term: know, Similarity Score: 0.1876

Similar terms to 'myself':
Term: kill, Similarity Score: 0.1716
Term: want, Similarity Score: 0.0752
Term: feel, Similarity Sco