In [12]:
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import matplotlib.cm as cm
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

# Read the raw corpus and its cleaned counterpart from the local data/ folder.
original_corpus, cleaned_corpus = (
    pd.read_csv(csv_path)
    for csv_path in ('data/mental_health.csv', 'data/cleaned_mhc.csv')
)

# Report the (rows, columns) shape of each frame.
print("Shape of Original Corpus", original_corpus.shape)
print("Shape of Cleaned Corpus", cleaned_corpus.shape)

Shape of Original Corpus (27977, 2)
Shape of Cleaned Corpus (23240, 2)


In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary (capped at 3500 terms, English stop words removed)
# on the 'text' column of the cleaned corpus.
tfidf = TfidfVectorizer(max_features=3500, stop_words='english')
tfidf_matrix = tfidf.fit_transform(cleaned_corpus['text'])

# Vocabulary terms, in the same order as the columns of tfidf_matrix.
feature_names = tfidf.get_feature_names_out()

# Sum each term's TF-IDF weight over all documents directly on the sparse
# matrix. Densifying first would allocate an (n_documents x 3500) float array
# just to add up its columns.
tfidf_scores = pd.Series(
    np.asarray(tfidf_matrix.sum(axis=0)).ravel(),
    index=feature_names,
)

# One row per token. The default integer index is the token's column position
# in tfidf_matrix, which is what the printed tables show in their first column.
tokens_scores_df = pd.DataFrame({
    'Token': feature_names,
    'TF-IDF Score': tfidf_scores.to_numpy(),
})

# Highest aggregate score first.
tokens_scores_df = tokens_scores_df.sort_values(by='TF-IDF Score', ascending=False)

# Display the first 5 tokens
print("First 5 Tokens and Their TF-IDF Scores:")
print(tokens_scores_df.head(5))

# Display the last 30 tokens
print("\nLast 30 Tokens and Their TF-IDF Scores:")
print(tokens_scores_df.tail(30))

First 5 Tokens and Their TF-IDF Scores:
     Token  TF-IDF Score
1567    im   1464.219769
1820  like    956.284352
3363  want    931.238879
1170  feel    822.814840
1747  know    772.035204

Last 30 Tokens and Their TF-IDF Scores:
           Token  TF-IDF Score
1540    humorous      5.268003
100     allowing      5.261716
2206  passionate      5.259987
1252      foster      5.251934
195    associate      5.240051
211    attending      5.238587
222    authority      5.182334
1486  historical      5.134556
2583    revealed      5.133668
1238        ford      5.103191
763       darker      5.095850
1407     handled      5.094682
1689       jason      5.091025
2179   painfully      5.087388
123         anne      4.991235
2072     neglect      4.924122
94          alex      4.920948
1489   hitchcock      4.912864
3338   virtually      4.852788
2143       opera      4.764232
747      cynical      4.688350
3383     wealthy      4.652998
805     delivers      4.640286
1993  miniseries      4.5

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 1. TF-IDF vectorizer: vocabulary capped at 2500 terms, English stop words dropped
tfidf = TfidfVectorizer(stop_words='english', max_features=2500)

# 2. Document-term matrix built from the 'text' column of the cleaned corpus,
#    plus the vocabulary terms in matrix-column order
tfidf_matrix = tfidf.fit_transform(cleaned_corpus['text'])
feature_names = tfidf.get_feature_names_out()

# 3. LSA = truncated SVD of the TF-IDF matrix; n_components sets the number of topics
n_components = 20
lsa = TruncatedSVD(random_state=42, n_components=n_components)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

# 4. How much variance each component captures, and the total across all of them
print("Explained variance ratio:", lsa.explained_variance_ratio_)
print("Total explained variance:", sum(lsa.explained_variance_ratio_))

# 5. Extract and print top terms for each topic
def print_topics(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every component in ``model``.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            holds one weight per entry of ``feature_names``.
        feature_names: Indexable collection of terms, positionally aligned
            with the columns of ``model.components_``.
        n_top_words: How many terms to list per topic, largest signed weight
            first.

    Returns:
        None. Output goes to stdout: a blank line, a ``Topic N:`` header
        (1-based), then one ``term: weight`` line per term with the weight
        shown to three decimals.
    """
    for topic_number, component in enumerate(model.components_, start=1):
        # Indices ordered from largest weight to smallest, cut to the requested count.
        ranked = component.argsort()[::-1][:n_top_words]

        print(f"\nTopic {topic_number}:")
        for position in ranked:
            feature = feature_names[position]
            weight = component[position]
            print(f"{feature}: {weight:.3f}")

# Show the ten highest-weighted terms of each LSA component
print_topics(lsa, feature_names, n_top_words=10)

Explained variance ratio: [0.0121003  0.00976051 0.00711625 0.00629515 0.00584499 0.00539468
 0.00492761 0.0045552  0.00421667 0.00404742 0.00395734 0.00382405
 0.00343066 0.00337167 0.00324869 0.00318699 0.00314025 0.0029621
 0.00285895 0.00281437]
Total explained variance: 0.09705385773838596

Topic 1:
im: 0.364
want: 0.224
like: 0.216
feel: 0.212
know: 0.184
life: 0.175
ive: 0.153
dont: 0.139
friend: 0.136
time: 0.133

Topic 2:
movie: 0.505
film: 0.444
character: 0.136
great: 0.131
good: 0.130
story: 0.127
scene: 0.099
time: 0.082
seen: 0.078
watch: 0.076

Topic 3:
im: 0.734
movie: 0.146
film: 0.122
gonna: 0.102
ive: 0.083
going: 0.066
tired: 0.054
bored: 0.053
sorry: 0.050
ill: 0.048

Topic 4:
want: 0.503
dont: 0.306
movie: 0.246
die: 0.212
film: 0.211
fucking: 0.160
im: 0.146
anymore: 0.102
kill: 0.101
tired: 0.093

Topic 5:
like: 0.426
dont: 0.404
feel: 0.315
im: 0.130
know: 0.123
wanna: 0.119
girl: 0.114
talk: 0.108
guy: 0.107
movie: 0.104

Topic 6:
feel: 0.521
like: 0.283
im: 0

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Re-read both corpora from disk so this cell starts from freshly loaded frames
original_corpus, cleaned_corpus = map(
    pd.read_csv,
    ('data/mental_health.csv', 'data/cleaned_mhc.csv'),
)

# Define a function to run GMM and evaluate metrics
def run_gmm_and_evaluate(corpus, corpus_name):
    """Cluster a corpus with a two-component GMM and score the raw cluster ids.

    The cluster index of each row is used directly as its predicted label;
    no cluster-to-label alignment is performed here.

    Args:
        corpus: DataFrame with a 'text' column (documents) and a 'label'
            column (ground truth). A 'cluster' column is written to it.
        corpus_name: Name shown in the header of the printed report.

    Returns:
        None. Accuracy and weighted precision / recall / F1 are printed.
    """
    # TF-IDF features, vocabulary capped at 2500 terms; densified before fitting the mixture.
    vectorizer = TfidfVectorizer(max_features=2500, stop_words='english')
    features = vectorizer.fit_transform(corpus['text']).toarray()

    # Fit the mixture and store each row's component index on the frame.
    mixture = GaussianMixture(n_components=2, random_state=42)
    corpus['cluster'] = mixture.fit_predict(features)

    y_true = corpus['label']
    y_pred = corpus['cluster']

    # Options shared by the three class-averaged metrics.
    weighted = dict(average='weighted', zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, **weighted)
    recall = recall_score(y_true, y_pred, **weighted)
    f1 = f1_score(y_true, y_pred, **weighted)

    print(f"Metrics for {corpus_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}\n")

# Raw-cluster-id evaluation on the uncleaned corpus
run_gmm_and_evaluate(corpus=original_corpus, corpus_name="Original Corpus")

# Same evaluation on the cleaned corpus
run_gmm_and_evaluate(corpus=cleaned_corpus, corpus_name="Cleaned Corpus")

Metrics for Original Corpus:
Accuracy: 0.2117
Precision: 0.2046
Recall: 0.2117
F1 Score: 0.2072

Metrics for Cleaned Corpus:
Accuracy: 0.2123
Precision: 0.2095
Recall: 0.2123
F1 Score: 0.2069



In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fresh copies of both corpora for this run
original_corpus, cleaned_corpus = (
    pd.read_csv(source)
    for source in ('data/mental_health.csv', 'data/cleaned_mhc.csv')
)

# Define a function to run GMM and evaluate metrics
def _align_binary_clusters(true_labels, clusters):
    """Return ``clusters`` or its 0/1 complement, whichever matches ``true_labels`` more often."""
    truth = np.asarray(true_labels)
    as_is = np.asarray(clusters)
    flipped = 1 - as_is
    # Ties resolve to the flipped assignment.
    if np.mean(flipped == truth) >= np.mean(as_is == truth):
        return flipped
    return as_is


def run_gmm_and_evaluate(corpus, corpus_name):
    """Cluster a corpus with a two-component GMM and score it against its labels.

    Cluster indices carry no inherent meaning, so instead of hard-coding a
    0 <-> 1 swap the orientation (as-is or flipped) that agrees better with
    the ground truth is selected before the metrics are computed. Because the
    labels are used to pick the orientation, the reported scores are an
    upper bound over the two possible cluster-to-label assignments.

    Args:
        corpus: DataFrame with a 'text' column (documents) and a 'label'
            column (ground truth; assumed to be coded 0/1, as the swap this
            replaces also assumed). A 'cluster' column holding the aligned
            predictions is written to it.
        corpus_name: Name shown in the header of the printed report.

    Returns:
        None. Accuracy and weighted precision / recall / F1 are printed.
    """
    # 1. Prepare the data with TF-IDF
    tfidf = TfidfVectorizer(max_features=2500, stop_words='english')
    tfidf_matrix = tfidf.fit_transform(corpus['text'])

    # 2. Apply GMM clustering
    n_clusters = 2  # the alignment step below only handles exactly two clusters
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    raw_clusters = gmm.fit_predict(tfidf_matrix.toarray())

    # 3. Orient the cluster ids so they line up with the label coding
    true_labels = corpus['label']
    corpus['cluster'] = _align_binary_clusters(true_labels, raw_clusters)

    # 4. Calculate metrics on the aligned predictions
    accuracy = accuracy_score(true_labels, corpus['cluster'])
    precision = precision_score(true_labels, corpus['cluster'], average='weighted', zero_division=0)
    recall = recall_score(true_labels, corpus['cluster'], average='weighted', zero_division=0)
    f1 = f1_score(true_labels, corpus['cluster'], average='weighted', zero_division=0)

    # 5. Print metrics
    print(f"Metrics for {corpus_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}\n")

# Evaluate the uncleaned corpus
run_gmm_and_evaluate(corpus=original_corpus, corpus_name="Original Corpus")

# Evaluate the cleaned corpus
run_gmm_and_evaluate(corpus=cleaned_corpus, corpus_name="Cleaned Corpus")

Metrics for Original Corpus:
Accuracy: 0.7883
Precision: 0.7964
Recall: 0.7883
F1 Score: 0.7871

Metrics for Cleaned Corpus:
Accuracy: 0.7877
Precision: 0.7887
Recall: 0.7877
F1 Score: 0.7863

