In [None]:
import pandas as pd
import os
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Input CSV files: path1 is the comments export, path2 the posts export.
path1, path2 = "ADHD-comment.csv", "ADHD.csv"

In [None]:
# Load both CSV files with the same lenient options: latin1 decoding, malformed
# lines skipped (on_bad_lines='skip'), and at most 1,000,000 rows read per file.
data1 = pd.read_csv(path1, encoding='latin1', on_bad_lines='skip', low_memory=False, nrows=1000000)
data2 = pd.read_csv(path2, encoding='latin1', on_bad_lines='skip', low_memory=False, nrows=1000000)

In [None]:
# Preview the first rows of the table loaded from path1 (shown as the cell output).
data1.head()

In [None]:
# Preview the first rows of the table loaded from path2 (shown as the cell output).
data2.head()

In [None]:
# NLTK setup: print the resource search path, then download the tokenizer model,
# the stop-word list and the WordNet data used by the preprocessing functions.
import nltk
import nltk.tokenize.punkt
#import nltk.tokenize.stopwords
#import nltk.tokenize.wordnet
print(nltk.data.path)
# NOTE(review): this only sets a process environment variable named DEFAULT_URL;
# nothing in this notebook reads it, and it is unclear whether nltk.download()
# consults it -- confirm it has any effect before relying on it.
os.environ["DEFAULT_URL"] = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml'
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
def preprocess_comments(data, text_column='body', score_column='score', test_size=0.2, random_state=42):
    """Split a text/score table, clean the text and build TF-IDF matrices.

    Each document is tokenized, stripped of English stop words and of tokens
    that are punctuation characters, lemmatized and re-joined with spaces
    before being vectorized with ``TfidfVectorizer``.

    Args:
        data: DataFrame holding the text and score columns.
        text_column: Name of the column with the raw text.
        score_column: Name of the column with the score.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train_vectorized, X_test_vectorized, y_train, y_test,
        X_combined_vectorized, vectorizer)``.

        * The train/test matrices share a vocabulary learned from the
          training documents only.
        * ``X_combined_vectorized`` holds the training rows followed by the
          test rows and uses a vocabulary learned from all documents.
        * ``vectorizer`` is the one fitted on all documents, so its feature
          names describe the columns of ``X_combined_vectorized`` only, not
          those of the train/test matrices.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    X = data[text_column]
    Y = data[score_column]

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)

    print(f"num_samples_train = {X_train.shape[0]}")
    print(f"num_samples_test = {X_test.shape[0]}")

    def clean_document(raw):
        # Missing text must become an empty document.  Calling str() on NaN
        # yields the literal token "nan", which would otherwise end up as a
        # TF-IDF feature.
        if pd.isna(raw):
            return ''
        kept = [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(str(raw))
            if token.lower() not in stop_words and token not in string.punctuation
        ]
        return ' '.join(kept)

    X_train_preprocessed = X_train.apply(clean_document)
    X_test_preprocessed = X_test.apply(clean_document)

    # Vocabulary for the split matrices is learned from the training part only.
    split_vectorizer = TfidfVectorizer()
    X_train_vectorized = split_vectorizer.fit_transform(X_train_preprocessed)
    X_test_vectorized = split_vectorizer.transform(X_test_preprocessed)

    print(X_train_vectorized.shape)
    print(X_test_vectorized.shape)

    # A second vectorizer is fitted on every document (train rows first, then
    # test rows); this is the one handed back to the caller.
    X_combined = pd.concat([X_train_preprocessed, X_test_preprocessed])
    vectorizer = TfidfVectorizer()
    X_combined_vectorized = vectorizer.fit_transform(X_combined)

    print(f'Dimensions of the entire dataset: {X_combined_vectorized.shape}')

    return X_train_vectorized, X_test_vectorized, y_train, y_test, X_combined_vectorized, vectorizer


In [None]:
def preprocess_posts(data, text_column='selftext', score_column='score', test_size=0.2, random_state=42):
    """Run the comment preprocessing pipeline on a table of posts.

    The pipeline for posts is the same as for comments; only the default text
    column differs (``'selftext'``).  This function therefore forwards to
    ``preprocess_comments`` instead of repeating its body.

    Args:
        data: DataFrame holding the text and score columns.
        text_column: Name of the column with the raw post text.
        score_column: Name of the column with the score.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The same 6-tuple as ``preprocess_comments``:
        ``(X_train_vectorized, X_test_vectorized, y_train, y_test,
        X_combined_vectorized, vectorizer)``.
    """
    return preprocess_comments(
        data,
        text_column=text_column,
        score_column=score_column,
        test_size=test_size,
        random_state=random_state,
    )


In [None]:
def calculate_word_score_correlation(X_vectorized, scores, vectorizer):
    """Correlate every vocabulary term's weight with the score.

    Args:
        X_vectorized: Document-term matrix exposing ``toarray()``; its rows must
            be in the same order as ``scores``.
        scores: One score per row (Series, list or array).  Any existing index
            is discarded so the values pair with the rows by position.
        vectorizer: Fitted vectorizer whose ``get_feature_names_out()`` names
            the columns of ``X_vectorized``.

    Returns:
        Series indexed by term holding the Pearson correlation between that
        term's column and the scores, sorted from most positive to most
        negative (NaN entries, e.g. for constant columns, sort last).  The
        result contains one entry per vocabulary term and no self-correlation
        of the score.
    """
    words_df = pd.DataFrame(X_vectorized.toarray(), columns=vectorizer.get_feature_names_out())

    # Keep the scores in their own Series instead of adding a 'score' column:
    # a column would silently overwrite the vocabulary term "score" and would
    # put a meaningless self-correlation of 1.0 at the top of the ranking.
    score_series = pd.Series(scores).reset_index(drop=True)

    # corrwith computes one coefficient per column; DataFrame.corr() would
    # build the full term-by-term matrix only to keep a single column of it.
    correlations = words_df.corrwith(score_series)
    return correlations.sort_values(ascending=False)


In [None]:
def plot_top_correlations(correlations, n=20):
    """Plot the strongest positive and negative word/score correlations.

    Args:
        correlations: Series (or anything ``pd.Series`` accepts) mapping a word
            to its correlation with the score.
        n: Maximum number of words shown in each panel.

    The left panel shows the ``n`` most positive correlations, the right panel
    the ``n`` most negative ones.  NaN correlations are ignored.  A panel with
    no qualifying words is left empty but keeps its title.
    """
    ranked = pd.Series(correlations).dropna().sort_values(ascending=False)

    # Split by sign first and only then truncate; truncating first would keep
    # only the head of the ranking and never reach the negative values.
    top_positive_correlations = ranked[ranked > 0].head(n)
    # tail(n) holds the most negative values; reverse so the strongest comes first.
    top_negative_correlations = ranked[ranked < 0].tail(n).iloc[::-1]

    fig, axes = plt.subplots(1, 2, figsize=(12, 8))
    panels = (
        (axes[0], top_positive_correlations, 'Top Positive Word Correlations with Score'),
        (axes[1], top_negative_correlations, 'Top Negative Word Correlations with Score'),
    )
    for ax, subset, title in panels:
        if not subset.empty:
            sns.barplot(x=subset.values, y=subset.index, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Correlation')

    fig.tight_layout()
    plt.show()


In [None]:
# Vectorize the 'body' column of data1, correlate every term with the score
# and plot the strongest correlations.
X_train_vectorized, X_test_vectorized, y_train, y_test, X_combined_vectorized, vectorizer = preprocess_comments(data1, 'body', 'score')
# y_train followed by y_test mirrors the row order of X_combined_vectorized,
# which preprocess_comments builds from the training rows followed by the test rows.
correlations = calculate_word_score_correlation(X_combined_vectorized, pd.concat([y_train, y_test]), vectorizer)
plot_top_correlations(correlations)


In [None]:
# Diagnostic: print the directories NLTK searches for its data files.
import nltk
print(nltk.data.path)

In [None]:
# Same analysis for data2, using its 'selftext' column as the text.
# Note that this overwrites the result variables of the previous analysis cell.
X_train_vectorized, X_test_vectorized, y_train, y_test, X_combined_vectorized, vectorizer = preprocess_posts(data2, 'selftext', 'score')
correlations = calculate_word_score_correlation(X_combined_vectorized, pd.concat([y_train, y_test]), vectorizer)
plot_top_correlations(correlations)


In [None]:
# NOTE(review): `data3` is not assigned in any cell visible above (only data1 and
# data2 are loaded), so this cell raises NameError on a fresh Restart & Run All
# unless data3 is created elsewhere -- confirm the intended dataset.
X_train_vectorized, X_test_vectorized, y_train, y_test, X_combined_vectorized, vectorizer = preprocess_posts(data3, 'selftext', 'score')
correlations = calculate_word_score_correlation(X_combined_vectorized, pd.concat([y_train, y_test]), vectorizer)
plot_top_correlations(correlations)


In [None]:
# NOTE(review): `data4` is not assigned in any cell visible above (only data1 and
# data2 are loaded), so this cell raises NameError on a fresh Restart & Run All
# unless data4 is created elsewhere -- confirm the intended dataset.
X_train_vectorized, X_test_vectorized, y_train, y_test, X_combined_vectorized, vectorizer = preprocess_comments(data4, 'body', 'score')
correlations = calculate_word_score_correlation(X_combined_vectorized, pd.concat([y_train, y_test]), vectorizer)
plot_top_correlations(correlations)


In [None]:
def create_word_cloud(text):
    """Draw a word cloud built from `text` on a 10x5 inch figure without axes."""
    cloud_options = {'width': 800, 'height': 400, 'background_color': 'white'}
    rendered_cloud = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(rendered_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

def combine_text_data(data, text_column):
    """Join all non-missing values of a text column into one string.

    Args:
        data: DataFrame containing ``text_column``.
        text_column: Name of the column to concatenate.

    Returns:
        The column's non-missing values separated by single spaces; an empty
        string when every value is missing.
    """
    non_missing = data[text_column].dropna()
    # A column can hold non-string values (e.g. numbers); str.join raises
    # TypeError on those, so convert every remaining value to text first.
    return " ".join(non_missing.astype(str))


In [None]:
# Word cloud over every non-missing 'body' value of data1.
combined_text1 = combine_text_data(data1, 'body')
create_word_cloud(combined_text1)

In [None]:
# Word cloud over every non-missing 'selftext' value of data2.
combined_text2 = combine_text_data(data2, 'selftext')
create_word_cloud(combined_text2)

In [None]:
# NOTE(review): `data3` is not assigned in any cell visible above; this cell
# raises NameError on a fresh run unless data3 is loaded elsewhere -- confirm.
combined_text3 = combine_text_data(data3, 'selftext')
create_word_cloud(combined_text3)

In [None]:
# NOTE(review): `data4` is not assigned in any cell visible above; this cell
# raises NameError on a fresh run unless data4 is loaded elsewhere -- confirm.
combined_text4 = combine_text_data(data4, 'body')
create_word_cloud(combined_text4)

In [None]:
def preprocess_text(text: str) -> str:
    """Normalize one document for TF-IDF clustering.

    Removes URLs (anything starting with ``http``), replaces every run of
    non-letter characters with a space, drops English stop words, and returns
    the remaining tokens lower-cased and joined by single spaces.

    Args:
        text: Raw document.  Non-string input (for example NaN from a missing
            cell) is treated as an empty document.

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN; re.sub would raise TypeError on them.
        return ""

    # Load the stop-word list once and keep it on the function object.  The
    # lookup used to call stopwords.words() again for every single token.
    stop_words = getattr(preprocess_text, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words("english"))
        preprocess_text._stop_words = stop_words

    text = re.sub(r"http\S+", "", text)
    text = re.sub("[^A-Za-z]+", " ", text)
    tokens = [w for w in nltk.word_tokenize(text) if w.lower() not in stop_words]
    return " ".join(tokens).lower().strip()

In [None]:
# Clean every comment body.  Missing or non-string bodies are turned into text
# first so the cleaning function always receives a str.
data1['text_cleaned'] = data1['body'].fillna('').astype(str).apply(preprocess_text)
# Drop rows whose cleaned text is empty.  The explicit copy makes data1 an
# independent frame, so the columns that later cells add to it (cluster labels,
# PCA coordinates) are not written into a slice of the original table.
data1 = data1[data1['text_cleaned'] != ''].copy()
# Terms must appear in at least 5 documents and in at most 95% of them.
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# Dense array: one row per remaining document, one column per retained term.
X_tfidf = vectorizer.fit_transform(data1['text_cleaned']).toarray()

In [None]:
def eval_cluster(embedding):
    """Refit the module-level `kmeans` on `embedding` and predict cluster labels.

    NOTE(review): the predicted labels are stored in a local variable and never
    returned or used, because every metric below is commented out; the only
    lasting effect of a call is that the global `kmeans` is refitted.
    The commented-out metrics reference a data1["target"] column that no
    visible cell creates -- confirm whether ground-truth labels exist.
    """
    y_pred = kmeans.fit_predict(embedding)
    #ari = adjusted_rand_score(data1["target"], y_pred)
    #nmi = normalized_mutual_info_score(data1["target"], y_pred)
    #fmi = fowlkes_mallows_score(data1["target"], y_pred)

    #print("Adjusted Rand Index (ARI): {:.3f}".format(ari))
    #print("Normalized Mutual Information (NMI): {:.3f}".format(nmi))
    #print("Fowlkes-Mallows Index (FMI): {:.3f}".format(fmi))

# Cluster the TF-IDF matrix into 3 groups with a fixed seed.
kmeans = KMeans(n_clusters=3, random_state=42)

kmeans.fit(X_tfidf)

# One cluster label per row of X_tfidf.
clusters = kmeans.labels_

data1['cluster_tfidf'] = clusters

# This refits `kmeans` on the same matrix (eval_cluster calls fit_predict on it).
eval_cluster(X_tfidf)


In [None]:
def dimension_reduction(embedding, method):
    """Project `embedding` onto two principal components and store them in data1.

    The components are written to the module-level `data1` as the columns
    ``x0_<method>`` and ``x1_<method>``.
    """
    projected = PCA(n_components=2, random_state=42).fit_transform(embedding)
    for component in (0, 1):
        data1[f'x{component}_{method}'] = projected[:, component]
dimension_reduction(X_tfidf, 'tfidf')


In [None]:
def plot_pca(x0_name, x1_name, cluster_name, method):
    """Scatter-plot the 2-D projection of the documents, coloured by cluster.

    Args:
        x0_name: Column of `data1` holding the first projected coordinate.
        x1_name: Column of `data1` holding the second projected coordinate.
        cluster_name: Column of `data1` holding the cluster label (used as hue).
        method: Label inserted into the figure title.
    """
    plt.figure(figsize=(12, 7))
    plt.title(f"TF-IDF + KMeans Clustering with {method}", fontdict={"fontsize": 18})
    plt.xlabel("X0", fontdict={"fontsize": 16})
    plt.ylabel("X1", fontdict={"fontsize": 16})
    # The projection and cluster columns are written to data1, not data2, so
    # data1 is the frame that has to be plotted.
    sns.scatterplot(data=data1, x=x0_name, y=x1_name, hue=cluster_name, palette="viridis")
    plt.show()

plot_pca('x0_tfidf', 'x1_tfidf', 'cluster_tfidf', 'TF-IDF')


In [None]:
# Calculate WCSS (KMeans inertia) for k = 1..10
k_values = range(1, 11)
wcss = []
for k in k_values:
    # Fit each candidate under its own name.  Reusing the name `kmeans` would
    # replace the fitted 3-cluster model with the k=10 model, and any later
    # read of kmeans.labels_ would silently return 10-cluster labels.
    elbow_model = KMeans(n_clusters=k, random_state=0)
    elbow_model.fit(X_tfidf)
    wcss.append(elbow_model.inertia_)

# Plot the elbow method
plt.plot(k_values, wcss, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.title('Elbow Method')
plt.show()


In [None]:
from sklearn.metrics import silhouette_score
# Silhouette coefficient of the labels currently stored in `clusters`, computed
# on the TF-IDF matrix.
sil_coeff = silhouette_score(X_tfidf, clusters)
print("Silhouette Coefficient:", round(sil_coeff, 3))

In [None]:
from sklearn.metrics import calinski_harabasz_score

# Calculate the Calinski-Harabasz Index for the labels currently stored in `clusters`
ch_index = calinski_harabasz_score(X_tfidf, clusters)
print("Calinski-Harabasz Index:", round(ch_index, 3))


In [None]:
from sklearn.metrics import davies_bouldin_score
# Score the same labels as the silhouette and Calinski-Harabasz cells, i.e. the
# `clusters` array produced by the 3-cluster fit.  Re-reading kmeans.labels_
# here is unsafe: the elbow cell refits `kmeans` for k = 1..10, so the labels
# would come from the k=10 model and would also overwrite data1['cluster_tfidf'].
dbi = davies_bouldin_score(X_tfidf, clusters)
print("Davies-Bouldin Index:", round(dbi, 3))


### **LDA Topic Modelling**

In [None]:
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel


## Comments

In [None]:
def preprocess_for_lda(texts):
    """Turn raw documents into token lists suitable for building an LDA corpus.

    String documents are lower-cased, tokenized, reduced to alphanumeric
    tokens that are not English stop words, and lemmatized.  Any non-string
    entry yields an empty token list, so the output has one list per input.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def to_tokens(document):
        # Non-string values (e.g. missing cells) keep their position as an empty list.
        if not isinstance(document, str):
            return []
        return [
            lemmatizer.lemmatize(token)
            for token in word_tokenize(document.lower())
            if token.isalnum() and token not in stop_words
        ]

    return [to_tokens(document) for document in texts]


In [None]:
# Smoke test: check that the NLTK tokenizer works on a short sentence.
from nltk.tokenize import word_tokenize

sample_text = "This is a simple test."
tokens = word_tokenize(sample_text)
print("Tokenized sample:", tokens)

In [None]:
# Raw comment bodies -> token lists -> gensim dictionary -> bag-of-words corpus.
texts = data1['body'].values.tolist()
processed_texts = preprocess_for_lda(texts)
id2word = corpora.Dictionary(processed_texts)

# One bag-of-words vector per document, in the same order as processed_texts.
corpus = list(map(id2word.doc2bow, processed_texts))


In [None]:
# Train a 4-topic LDA model on the bag-of-words corpus with a fixed seed.
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=4,
                     random_state=42,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha='auto',
                     per_word_topics=True)

# Print every topic (-1 requests all of them) with its index.
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}")


In [None]:
# Interactive pyLDAvis view of the trained LDA model; the bare `vis` on the
# last line makes it the cell output.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

In [None]:
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel


In [None]:
def compute_coherence_values(corpus, dictionary, texts, limit, start=2, step=3, alpha_values=None, beta_values=None):
    """Train one LDA model per hyper-parameter combination and score its coherence.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaModel``.
        dictionary: Dictionary passed as ``id2word`` and to ``CoherenceModel``.
        texts: Tokenized documents used for the ``c_v`` coherence measure.
        limit: Exclusive upper bound of the topic-count range.
        start: First topic count tried.
        step: Increment between topic counts.
        alpha_values: Values tried for the ``alpha`` argument.  When omitted,
            only ``'symmetric'`` is tried.
        beta_values: Values tried for the ``eta`` argument.  When omitted,
            only ``None`` is tried.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists with one entry
        per combination.  Entries are ordered with ``alpha`` varying slowest,
        then ``beta``, then ``num_topics`` over ``range(start, limit, step)``;
        callers that rebuild the grid must iterate in the same order.
    """
    # Iterating over None raises TypeError, so the omitted-argument case needs
    # concrete one-element grids.  list() also lets callers pass one-shot
    # iterables, which the nested loops would otherwise exhaust.
    alpha_grid = ['symmetric'] if alpha_values is None else list(alpha_values)
    beta_grid = [None] if beta_values is None else list(beta_values)

    coherence_values = []
    model_list = []

    for alpha in alpha_grid:
        for beta in beta_grid:
            for num_topics in range(start, limit, step):
                model = LdaModel(corpus=corpus,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 random_state=100,
                                 chunksize=100,
                                 passes=10,
                                 alpha=alpha,
                                 eta=beta)

                model_list.append(model)
                coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
                coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


In [None]:
# Topic counts tried: range(2, 10, 2) -> 2, 4, 6, 8.
start = 2
limit = 10
step = 2

alpha_values = ['auto', 'asymmetric', 0.01, 0.31]
beta_values = ['auto', 'symmetric', 0.01, 0.31]

# compute_coherence_values trains one model per (alpha, beta, num_topics)
# combination: 4 * 4 * 4 = 64 models with 10 passes each.
model_list, coherence_values = compute_coherence_values(corpus=corpus,
                                                        dictionary=id2word,
                                                        texts=processed_texts,
                                                        start=start,
                                                        limit=limit,
                                                        step=step,
                                                        alpha_values=alpha_values,
                                                        beta_values=beta_values)


In [None]:
# Additional Analysis
# Results by coherence score
# Wordclouds for clusters
# Coherence score for num_topic values from 1-10

In [None]:
# Rebuild the hyper-parameter grid in the order in which
# compute_coherence_values() produced its scores: alpha varies slowest, then
# beta, then the topic count.  Pairing alpha and beta with zip() would cover
# only len(alpha_values) combinations and attach scores to the wrong settings.
topic_counts = range(start, limit, step)
expected_runs = len(alpha_values) * len(beta_values) * len(topic_counts)
assert len(coherence_values) == expected_runs, (
    f"expected {expected_runs} coherence scores, got {len(coherence_values)}"
)

num_topics_list = []
alpha_list = []
beta_list = []
coherence_score_list = []

score_iter = iter(coherence_values)
for alpha in alpha_values:
    for beta in beta_values:
        for num_topics in topic_counts:
            num_topics_list.append(num_topics)
            alpha_list.append(alpha)
            beta_list.append(beta)
            coherence_score_list.append(next(score_iter))

results_df = pd.DataFrame({
    'num_topics': num_topics_list,
    'alpha': alpha_list,
    'beta': beta_list,
    'coherence_score': coherence_score_list
})

# Best-scoring combination first, with a fresh 0..n-1 index.
sorted_df = results_df.sort_values(by='coherence_score', ascending=False).reset_index(drop=True)

print(sorted_df)


In [None]:
# One word cloud per topic of the current `lda_model`, sized by the weights of
# the topic's 20 highest-weighted words.
for topic_idx in range(lda_model.num_topics):
    # show_topic yields (word, weight) pairs; dict() turns them into frequencies.
    words = dict(lda_model.show_topic(topic_idx, topn=20))

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(words)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    # Titles are 1-based (topic_idx + 1), while the printed topic list uses the raw index.
    plt.title(f"WordCloud for Topic #{topic_idx + 1}")
    plt.show()


In [None]:
# Coherence (c_v) of LDA models with 1 to 10 topics, plotted against the topic count.
from gensim.models import CoherenceModel

topic_range = range(1, 11)

coherence_scores = []

for num_topics in topic_range:
    # NOTE(review): this rebinds the global `lda_model`; after the loop it is
    # the 10-topic model, not the 4-topic model trained earlier. Re-running the
    # word-cloud or pyLDAvis cells afterwards will show this model instead --
    # confirm that is intended.
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         random_state=100,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=id2word, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()

    coherence_scores.append(coherence_score)
    print(f'Coherence Score for num_topics={num_topics}: {coherence_score}')

plt.figure(figsize=(10, 6))
plt.plot(topic_range, coherence_scores, marker='o')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Coherence Score for Different Number of Topics')
plt.show()
