In [4]:
# Nhập thư viện
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import matplotlib.cm as cm
from wordcloud import WordCloud

import warnings
warnings.filterwarnings('ignore')

In [10]:
# Read both datasets so they can be compared side by side.
org_corpus, cln_corpus = (
    pd.read_csv(csv_path)
    for csv_path in ('data/mental_health.csv', 'data/cleaned_mhc.csv')
)

# Preview the first ten rows of each corpus.
print("Original Corpus:\n", org_corpus.head(10), "\n")
print("Cleaned Corpus:\n", cln_corpus.head(10))

# Report both shapes and the difference in row counts between the two frames.
print(
    f"\nShape of original corpus: {org_corpus.shape} and cleaned corpus: {cln_corpus.shape}",
    f"Sample difference: {len(org_corpus) - len(cln_corpus)}",
    sep="\n",
)

Original Corpus:
                                                 text  label
0  dear american teens question dutch person hear...      0
1  nothing look forward lifei dont many reasons k...      1
2  music recommendations im looking expand playli...      0
3  im done trying feel betterthe reason im still ...      1
4  worried  year old girl subject domestic physic...      1
5  hey rredflag sure right place post this goes  ...      1
6  feel like someone needs hear tonight feeling r...      0
7  deserve liveif died right noone would carei re...      1
8  feels good ive set dateim killing friday nice ...      1
9  live guiltok made stupid random choice  its ge...      1 

Cleaned Corpus:
                                                 text  label
0  dear american teen question dutch person heard...      0
1  nothing look forward life dont many reason kee...      1
2  music recommendation im looking expand playlis...      0
3  im done trying feel reason im still alive know...      1
4  

In [7]:
# Compare unique-token and total-token counts
def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    return text.split()


def token_stats(corpus, label_column='label', text_column='text'):
    """Count total and unique whitespace tokens per class and for the whole corpus.

    Every document is tokenised exactly once with ``tokenize`` and the
    resulting token lists are reused for all six statistics. Rows whose text
    is missing (``NaN``/``None``) contribute zero tokens instead of raising
    ``AttributeError`` from ``tokenize``.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame holding one document per row.
    label_column : str, default 'label'
        Column whose values are compared against ``1`` and ``0`` to select
        the two classes.
    text_column : str, default 'text'
        Column containing the document strings.

    Returns
    -------
    dict
        Plain ``int`` values under the keys ``total_tokens_class_1``,
        ``total_tokens_class_0``, ``unique_tokens_class_1``,
        ``unique_tokens_class_0``, ``total_tokens_entire`` and
        ``unique_tokens_entire``. The ``*_entire`` figures cover every row,
        including rows whose label is neither 0 nor 1.
    """
    texts = corpus[text_column]
    # Skip missing texts up front: tokenize() calls .split(), which a NaN
    # float does not have.
    has_text = texts.notna()
    token_lists = texts[has_text].apply(tokenize)
    # Same boolean mask on the labels keeps them aligned with token_lists.
    labels = corpus[label_column][has_text]

    def _summarise(selected):
        """Return (total token count, vocabulary size) for the token lists."""
        vocabulary = set()
        total = 0
        for tokens in selected:
            total += len(tokens)
            vocabulary.update(tokens)
        return total, len(vocabulary)

    total_class_1, unique_class_1 = _summarise(token_lists[labels == 1])
    total_class_0, unique_class_0 = _summarise(token_lists[labels == 0])
    total_entire, unique_entire = _summarise(token_lists)

    return {
        'total_tokens_class_1': total_class_1,
        'total_tokens_class_0': total_class_0,
        'unique_tokens_class_1': unique_class_1,
        'unique_tokens_class_0': unique_class_0,
        'total_tokens_entire': total_entire,
        'unique_tokens_entire': unique_entire
    }

# Compute the token statistics for the original and the cleaned corpus.
original_stats, cleaned_stats = (
    token_stats(corpus) for corpus in (org_corpus, cln_corpus)
)

# One print per corpus; sep="\n" puts each report line on its own row.
print(
    "Original Corpus Stats:",
    f"Class 1: Total tokens = {original_stats['total_tokens_class_1']}, Unique tokens = {original_stats['unique_tokens_class_1']}",
    f"Class 0: Total tokens = {original_stats['total_tokens_class_0']}, Unique tokens = {original_stats['unique_tokens_class_0']}",
    f"Entire Corpus: Total tokens = {original_stats['total_tokens_entire']}, Unique tokens = {original_stats['unique_tokens_entire']}\n",
    sep="\n",
)

print(
    "Cleaned Corpus Stats:",
    f"Class 1: Total tokens = {cleaned_stats['total_tokens_class_1']}, Unique tokens = {cleaned_stats['unique_tokens_class_1']}",
    f"Class 0: Total tokens = {cleaned_stats['total_tokens_class_0']}, Unique tokens = {cleaned_stats['unique_tokens_class_0']}",
    f"Entire Corpus: Total tokens = {cleaned_stats['total_tokens_entire']}, Unique tokens = {cleaned_stats['unique_tokens_entire']}",
    sep="\n",
)

Original Corpus Stats:
Class 1: Total tokens = 1337600, Unique tokens = 42130
Class 0: Total tokens = 670013, Unique tokens = 49022
Entire Corpus: Total tokens = 2007613, Unique tokens = 72649

Cleaned Corpus Stats:
Class 1: Total tokens = 1212239, Unique tokens = 12516
Class 0: Total tokens = 584055, Unique tokens = 13883
Entire Corpus: Total tokens = 1796294, Unique tokens = 14599


In [15]:
# Compare the class distribution of the two corpora
def class_distribution_with_percentage(corpus, label_column='label'):
    """Return per-class row counts and their share of the total as percentages.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame containing the label column.
    label_column : str, default 'label'
        Column whose distinct values are counted.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The ``value_counts()`` of the label column, and the same series
        divided by its sum and multiplied by 100. Both share one index.
    """
    counts_per_class = corpus[label_column].value_counts()
    share_per_class = counts_per_class.div(counts_per_class.sum()).mul(100)
    return counts_per_class, share_per_class

# Class counts and percentage shares for each corpus.
original_counts, original_percentages = class_distribution_with_percentage(org_corpus)
cleaned_counts, cleaned_percentages = class_distribution_with_percentage(cln_corpus)

# One line per class, in the order the counts series lists them.
print("Original Corpus Class Distribution:")
for label in original_counts.index:
    count_here = original_counts.loc[label]
    share_here = original_percentages.loc[label]
    print(f"Class {label}: Count = {count_here}, Percentage = {share_here:.1f}%")

print("\nCleaned Corpus Class Distribution:")
for label in cleaned_counts.index:
    count_here = cleaned_counts.loc[label]
    share_here = cleaned_percentages.loc[label]
    print(f"Class {label}: Count = {count_here}, Percentage = {share_here:.1f}%")

Original Corpus Class Distribution:
Class 0: Count = 14139, Percentage = 50.5%
Class 1: Count = 13838, Percentage = 49.5%

Cleaned Corpus Class Distribution:
Class 0: Count = 14122, Percentage = 50.5%
Class 1: Count = 13818, Percentage = 49.5%
