## GOOGLE DRIVE IMPORT *DATASETS*

In [None]:
import os
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Change the current working directory to 'transfer_learning' so that the
# relative 'data/...' paths used by the later cells resolve inside that folder.
os.chdir('/content/drive/My Drive/transfer_learning')


## Distribution of sentiment labels in training data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Load the DataFrames (paths are relative to the current working directory).
# Each one is passed to prepare_data() below, which reads its 'label' column.
df1 = pd.read_csv('data/covid_data_labeled.csv')
df2 = pd.read_csv('data/mental_health_modified_data.csv')
df3 = pd.read_csv('data/drug_cleaned_data.csv')

# Function to prepare the data for visualization
def prepare_data(df):
    """
    Count how many rows carry each sentiment label.

    Numeric codes in the 'label' column are translated to names
    (0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'); any other value is
    counted under its own value.

    Args:
        df (pandas.DataFrame): Data with a 'label' column. It is not modified.

    Returns:
        pandas.DataFrame: Two columns, 'Labels' and 'Count', with one row per
        distinct label, ordered from most to least frequent.

    Raises:
        KeyError: If `df` has no 'label' column.
    """
    # Relabel a derived Series instead of assigning back into `df`, so the
    # caller's DataFrame keeps its original numeric codes.
    labels = df['label'].replace({0: 'negative', 2: 'positive', 1: 'neutral'})
    counts = labels.value_counts().reset_index()
    counts.columns = ['Labels', 'Count']
    return counts

# Prepare data: one label-count table ('Labels', 'Count') per dataset
df1_counts = prepare_data(df1)
df2_counts = prepare_data(df2)
df3_counts = prepare_data(df3)

# Set up the figure and axes: one row with three side-by-side panels,
# one panel per dataset
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5))

# Custom function to draw bar plots on the given axis
def plot_on_axis(ax, data, title):
    """
    Draw a horizontal bar chart of label counts on an existing axis.

    Args:
        ax: Matplotlib axis to draw on.
        data (pandas.DataFrame): Table with 'Labels' and 'Count' columns.
        title (str): Title shown above the chart.
    """
    def format_thousands(value, _pos):
        # Tick values are raw counts; render them as whole thousands ("12K").
        return '{:,.0f}'.format(value / 1000) + 'K'

    sns.barplot(data=data, x='Count', y='Labels', palette='pastel', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Number of Samples (in thousands)')
    ax.set_ylabel('')
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_thousands))

# Plot the data on respective axes (left to right: COVID, mental health, drug)
plot_on_axis(ax1, df1_counts, 'COVID Dataset')
plot_on_axis(ax2, df2_counts, 'Mental Dataset')
plot_on_axis(ax3, df3_counts, 'Drug Dataset')

# Improve layout, write the figure to a PDF in the current working directory,
# then display it inline
plt.tight_layout()
plt.savefig("data_visualization.pdf", format='pdf', dpi=300)
plt.show()




In [None]:
# Print out the distributions: the same 'Labels'/'Count' tables that were
# plotted in the previous cell, one per dataset
print("Distribution of COVID Data:")
print(df1_counts)
print("\nDistribution of Mental Health Data:")
print(df2_counts)
print("\nDistribution of Drug Data:")
print(df3_counts)

## ETL Pipeline

In this ETL (Extract, Transform, Load) section, the dataset is processed using a suite of text cleaning methods provided by the `DataPreprocessor` class. The following transformations are performed:

### Extracting
- The data is read from a CSV file.

### Transforming
Several transformations are applied to the text data:

   - Removing website links, usernames, special characters, and certain other characters from the text.
   - Simplifying the cleaned text by removing placeholders, punctuation marks, short words, and extra whitespace.
   - Tokenizing the text, removing stopwords, and lemmatizing the tokens.
   - Splitting hashtags and usernames.
   - Creating new columns for the various stages of cleaned text, reordering and renaming columns for clarity.

### Loading
- The preprocessed data, now ready for further analysis or machine learning tasks, is returned as a Pandas DataFrame.

This comprehensive ETL process aims to simplify and standardize the text data, preparing it for subsequent stages in our data pipeline.


In [None]:
!pip install unidecode
import pandas as pd
import re
import unidecode
import nltk
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer



class DataPreprocessor:
    """
    The DataPreprocessor class provides a suite of methods to preprocess text data,
    including cleaning the text, removing stopwords, splitting hashtags, and usernames.

    All helpers are static; `preprocess_dataset` chains them over the 'text'
    column of a CSV file.

    Example:
        cleaned_data = DataPreprocessor.preprocess_dataset(
            data_file='path_to_your_data.csv', data_type='label')
    """

    # Regular expressions to match special characters, usernames, and website links
    SPECIAL_CHARS_REGEX = r"[\*\+'\/\(\)\]\[\_\|]"
    USERNAME_REGEX = r'@\w*'
    WEBSITE_REGEX = r'http\S*'

    # Words removed in addition to NLTK's English stopword list
    EXTRA_STOPWORDS = ('from', 'subject', 're', 'edu', 'use', 'via', 'like', 'ha')

    # Punctuation stripped by simplify_text ('#' is deliberately not in the list)
    PUNCTUATION_TABLE = str.maketrans(
        '', '', '!"$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

    # Create an instance of TweetTokenizer shared by every call
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

    # Built on first use by eliminate_stopwords() and reused afterwards, so the
    # stopword corpus is not re-read for every row of the dataset.
    _stopword_set = None
    _lemmatizer = None

    @staticmethod
    def remove_usernames_weblinks_and_special_chars(text):
        """
        Replaces usernames and website links in the given text with placeholders
        and normalizes special characters and punctuation.

        Args:
            text (str): The original text string.

        Returns:
            str: The cleaned text string, stripped of surrounding whitespace.
        """
        # Replace usernames and website links with placeholders
        text = re.sub(DataPreprocessor.USERNAME_REGEX, 'USERNAME', text)
        text = re.sub(DataPreprocessor.WEBSITE_REGEX, 'WEBSITE', text)

        # Remove the HTML entity for "&" together with its optional ";".
        # Removing only "&amp" left the ";" behind, which the punctuation step
        # below then turned into a spurious period.
        text = re.sub(r'&amp;?', '', text)

        # Replace special characters with spaces
        text = re.sub(DataPreprocessor.SPECIAL_CHARS_REGEX, ' ', text)

        # Remove apostrophes.
        # NOTE(review): SPECIAL_CHARS_REGEX already contains "'", so apostrophes
        # have become spaces by now and this line currently changes nothing.
        text = text.replace("'", "")

        # Replace hyphens, commas, and ampersands with spaces
        text = re.sub(r"[-&,]", ' ', text)

        # Replace punctuation marks with periods
        text = re.sub(r"[:;?!]", '.', text)

        # Replace multiple periods with a single period
        text = re.sub(r'\.+', '.', text)

        # Replace multiple periods separated by spaces with a single period
        text = re.sub(r'\. \.+', '.', text)

        return text.strip()

    @staticmethod
    def simplify_text(text):
        """
        Simplifies the cleaned text by removing placeholders for usernames and website links,
        removing punctuation marks, and eliminating short words and extra whitespace.

        Args:
            text (str): The cleaned text string.

        Returns:
            str: The further simplified text string.
        """
        # Replace usernames and website links with empty strings
        text = re.sub("USERNAME", '', text)
        text = re.sub("WEBSITE", '', text)

        # Remove words of one or two characters, then punctuation marks
        text = re.sub(r"\b\w{1,2}\b", '', text)
        text = text.translate(DataPreprocessor.PUNCTUATION_TABLE)

        # Collapse whitespace last: deleting punctuation such as " . " leaves
        # double spaces behind, which an earlier collapse would not catch.
        text = re.sub(r"\s\s+", ' ', text)
        return text.strip()

    @staticmethod
    def eliminate_stopwords(tokens):
        """
        Removes the stopwords from the given list of tokens and lemmatizes the words for normalization.

        Args:
            tokens (list): A list of word tokens.

        Returns:
            list: The lemmatized tokens that are not stopwords, in input order.
        """
        owner = DataPreprocessor
        if owner._stopword_set is None:
            # A set gives O(1) membership tests; the result of each test is the
            # same as with the original list.
            owner._stopword_set = (
                frozenset(stopwords.words('english')) | frozenset(owner.EXTRA_STOPWORDS)
            )
        if owner._lemmatizer is None:
            owner._lemmatizer = WordNetLemmatizer()
        stop = owner._stopword_set
        lemmatizer = owner._lemmatizer

        new_tokens = []
        for token in tokens:
            # Handle contractions.
            # NOTE(review): splitting on the apostrophe turns "don't" into
            # "don not"; inside preprocess_dataset apostrophes are removed
            # before tokenization, so this branch is presumably not reached
            # there - confirm before relying on it elsewhere.
            if "'" in token:
                parts = token.split("'")
                if parts[1].lower() == "t":
                    token = parts[0] + " not"
                elif parts[1].lower() == "ve":
                    token = parts[0] + " have"

            # Lemmatize token
            lemma = lemmatizer.lemmatize(token)

            # Keep the lemma only if it is not a stopword
            if lemma not in stop:
                new_tokens.append(lemma)

        return new_tokens

    @staticmethod
    def separate_hashtags_usernames(text):
        """
        Splits hashtags and usernames in the given text.

        Tokens starting with '#' or '@' lose every '#' and '@' character and are
        split at each lower-case/upper-case boundary ("#StaySafe" -> "Stay Safe").
        Other tokens are kept as they are.

        Args:
            text (str): The text string containing hashtags and usernames.

        Returns:
            str: The text string with hashtags and usernames split, tokens
            joined by single spaces.
        """
        pieces = []
        for token in text.split():
            if token.startswith(('#', '@')):
                token = token.replace('#', '').replace('@', '')
                token = ' '.join(re.split(r'(?<=[a-z])(?=[A-Z])', token))
            pieces.append(token)
        return ' '.join(pieces)

    @classmethod
    def preprocess_dataset(cls, data_file, data_type):
        """
        Processes the whole dataset performing all of the cleaning and preprocessing operations on the text data.

        Args:
            data_file (str): The path to the CSV data file. It must have a
                'text' column, and a 'label' column when data_type is "label".
            data_type (str): "label" to carry the 'label' column into the
                result; any other value returns the text columns only.

        Returns:
            pandas.DataFrame: Columns 'original_text' and
            'clean_text_without_usernames_hashtags_or_stopwords', followed by
            'label' when requested.
        """
        # Read in data
        df = pd.read_csv(data_file, quotechar='"', encoding='utf-8')
        # Specifically for Sentiment140 (no header row, different encoding):
        # df = pd.read_csv(data_file, encoding='latin', names=['polarity', 'id', 'date', 'query', 'user', 'text'])

        # Clean text with usernames/links replaced by placeholders
        clean_text = df['text'].astype(str).apply(cls.remove_usernames_weblinks_and_special_chars)

        # Transliterate non-ASCII characters, then split hashtags
        clean_text = clean_text.apply(unidecode.unidecode)
        clean_text = clean_text.apply(cls.separate_hashtags_usernames)

        # Simplified text without placeholders, punctuation, and short words
        clean_text_simple = clean_text.apply(cls.simplify_text)

        # Tokenize text, remove stopwords and lemmatize, and untokenize
        tokens = clean_text_simple.apply(cls.tokenizer.tokenize)
        tokens = tokens.apply(cls.eliminate_stopwords)

        # Only the raw text and the fully cleaned text are returned; the
        # intermediate stages are not part of the result.
        result = pd.DataFrame({
            'original_text': df['text'],
            'clean_text_without_usernames_hashtags_or_stopwords': tokens.apply(' '.join),
        })
        if data_type == "label":
            result['label'] = df['label']

        return result


In [None]:
    # Run the ETL pipeline on the drug CSV and keep its 'label' column.
    # preprocess_dataset is a classmethod, so the instance only serves as a
    # handle for the call.
    preprocessor = DataPreprocessor()
    cleaned_data = preprocessor.preprocess_dataset(data_file='data/drug_proportional_renamed.csv', data_type='label')

In [None]:
# Persist the cleaned data without the index column; the train/test
# preparation cell reads this file back as df_aug_drug.
cleaned_data.to_csv('data/aug_drug_cleaned_data.csv', index=False)

## Data Sampling, Train-Test Split, and Preparation

In this section, we perform several steps to prepare the dataset for model training and evaluation:

1. **Data Import**: We begin by reading the cleaned CSV file which was generated from the previous ETL process.
2. **Data Cleaning**: After importing the data, we remove rows that contain missing values in the 'label' column and other columns to ensure the quality of our dataset.
3. **Train-Test Split**: We split the dataset into a training set and a testing set while maintaining the proportion of classes in each set (stratified sampling). This split is set at 90% for the training set and 10% for the testing set.
4. **Data Transformation**: Finally, we convert the 'label' column into a categorical format by encoding it into a one-hot format, which is suitable for multi-class classification problems.

By the end of these steps, our dataset is ready for model training and testing.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import numpy as np



def generate_train_test(df, text_column, label_column, num_classes, test_size=0.1):
    """
    Split a labelled text DataFrame into stratified train and test sets.

    Rows with a missing value in any column are dropped first. The split keeps
    the class proportions of `label_column` in both parts and uses a fixed
    random seed (42), so repeated calls give the same split.

    Args:
        df (pandas.DataFrame): Source data. It is not modified.
        text_column (str): Name of the column holding the texts.
        label_column (str): Name of the column holding integer-like class ids.
        num_classes (int): Number of classes; when greater than 1 the labels
            are converted with `to_categorical(labels, num_classes)`.
        test_size (float): Fraction of rows placed in the test set.

    Returns:
        tuple: (X_train, X_test, y_train, y_test). The X values are Series of
        str; the y values are the `to_categorical` output when
        num_classes > 1, otherwise Series of int.

    Raises:
        ValueError: Raised by `train_test_split` when a class has too few rows
            to be represented in both parts of a stratified split.
    """
    # dropna() returns a new DataFrame, so the caller's frame is untouched.
    # Dropping rows with any missing value also covers the label column.
    df = df.dropna()

    texts = df[text_column].astype(str)
    labels = df[label_column].astype(int)

    # Train-test split. `stratify` keeps the class proportions identical in
    # both parts, as described in the notebook text above this cell.
    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels,
    )

    # Convert labels to one-hot vectors if there's more than one class
    if num_classes > 1:
        y_train = to_categorical(y_train, num_classes)
        y_test = to_categorical(y_test, num_classes)

    return X_train, X_test, y_train, y_test

# Load the DataFrames
df_covid = pd.read_csv('data/covid_cleaned_data.csv')
# Load mental health dataset and remap the labels
df_mental_health = pd.read_csv('data/mental_health_modified_data.csv')
# If the labels are 0 and 2, remap 2 to 1 so the two classes are 0 and 1
df_mental_health["label"] = df_mental_health["label"].replace(2, 1)
df_drug = pd.read_csv('data/drug_cleaned_data.csv')
# Loaded here for later use; no train/test split is generated for it in this cell
df_aug_drug = pd.read_csv('data/aug_drug_cleaned_data.csv')

# Generate training and testing data (default test_size of 0.1) from the fully
# cleaned text column of each dataset
X_train_covid, X_test_covid, y_train_covid, y_test_covid = generate_train_test(df_covid, "clean_text_without_usernames_hashtags_or_stopwords", "label", num_classes=3)
print(f"Training Size (COVID Data): {len(X_train_covid)}")

X_train_mental, X_test_mental, y_train_mental, y_test_mental = generate_train_test(df_mental_health, "clean_text_without_usernames_hashtags_or_stopwords", "label", num_classes=2)
print(f"Training Size (Mental Health Data): {len(X_train_mental)}")

X_train_drug, X_test_drug, y_train_drug, y_test_drug = generate_train_test(df_drug, "clean_text_without_usernames_hashtags_or_stopwords", "label", num_classes=3)
print(f"Training Size (Drug Data): {len(X_train_drug)}")






In [None]:
# Show how many rows of the drug dataset fall into each label value
df_drug['label'].value_counts()

### Data Augmentation for drug dataset

In this segment, we execute a series of actions to enhance our textual data for more robust model performance:

- **Synonym Extraction**: Leveraging the WordNet database, the `get_synonyms()` function identifies synonyms for a given word, enhancing data diversity.

#### Text Augmentation Techniques:

  - `synonym_replacement()`: Alters random words in a text with their corresponding synonyms.
  - `random_insertion()`: Embeds random synonyms at unspecified positions within a text.
  - `random_swap()`: Interchanges random word pairs in a text.
  - `random_deletion()`: Randomly omits words from a text based on a predetermined probability.

- **Augmentation Application**: The `augment_text()` function selects a random augmentation technique and implements it, introducing controlled variability in our dataset.

- **Augmentation Volume Calculation**: The `calculate_augmented_counts()` function ascertains the required volume of augmented samples for each class, ensuring retention of the original class distribution.

- **Proportional Augmentation**: Using `proportional_augmentation()`, we build a dataset enriched with augmented text while preserving the original class distribution. Here, the drug dataset (`df_drug`) is augmented to four times its original size.

By the conclusion of these procedures, our dataset is enriched and diversified, standing ready for advanced modeling tasks.


In [None]:
import numpy as np
import pandas as pd
import random
import nltk
from nltk.corpus import wordnet

# Download WordNet data
nltk.download('wordnet')

def get_synonyms(word):
    """
    Look up WordNet synonyms for a word.

    Every lemma name of every synset of `word` is lower-cased, has '_' and '-'
    turned into spaces, and loses any character other than a-z and space.

    Args:
        word (str): The word to look up.

    Returns:
        list: Distinct synonyms in alphabetical order. The word itself
        (compared case-insensitively) and empty strings are excluded.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed_chars)
            synonyms.add(synonym)

    # Candidates are lower-cased, so compare against the lower-cased word;
    # otherwise "Drug" would come back with "drug" as its own synonym.
    synonyms.discard(word.lower())
    # A lemma made only of filtered-out characters collapses to ''.
    synonyms.discard("")

    # Sorted so that a seeded random.choice() over the result is reproducible
    # (set iteration order for strings varies between interpreter runs).
    return sorted(synonyms)

def synonym_replacement(words, n=5):
    """
    Replace up to `n` distinct words with one of their synonyms.

    Distinct words are visited in random order; every occurrence of a visited
    word that has at least one synonym is replaced by the same randomly chosen
    synonym. The input list is not modified.

    Args:
        words (list): Tokens of the sentence.
        n (int): Number of distinct words to replace before stopping.

    Returns:
        str: The augmented sentence, tokens joined by single spaces.
    """
    replaced = words.copy()
    candidates = list(set(words))
    random.shuffle(candidates)

    swaps_done = 0
    for target in candidates:
        options = get_synonyms(target)
        if options:
            replacement = random.choice(list(options))
            replaced = [replacement if token == target else token for token in replaced]
            swaps_done += 1
        # Checked after every candidate, whether or not it was replaced
        if swaps_done >= n:
            break

    return ' '.join(replaced)

def random_insertion(words, n=3):
    """
    Return the sentence after `n` synonym-insertion attempts.

    Works on a copy of `words`; each attempt is delegated to add_word(), which
    edits that copy in place.

    Args:
        words (list): Tokens of the sentence.
        n (int): Number of insertion attempts.

    Returns:
        str: The augmented sentence, tokens joined by single spaces.
    """
    augmented = words.copy()
    for _attempt in range(n):
        add_word(augmented)
    sentence = ' '.join(augmented)
    return sentence

def add_word(new_words):
    """
    Insert one synonym of a randomly chosen word at a random position, in place.

    Up to 10 random words are tried until one with at least one synonym is
    found. If none is found, or the list is empty, the list is left unchanged.

    Args:
        new_words (list): Tokens of the sentence; modified in place.

    Returns:
        None
    """
    if not new_words:
        # Nothing to draw a word from; random.randint(0, -1) would raise
        # ValueError.
        return

    synonyms = []
    counter = 0
    while len(synonyms) < 1 and counter < 10:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1

    if len(synonyms) >= 1:
        random_synonym = random.choice(list(synonyms))
        # The index is drawn from the existing positions, so the synonym is
        # never appended after the last word.
        random_idx = random.randint(0, len(new_words)-1)
        new_words.insert(random_idx, random_synonym)

def random_swap(words, n=3):
    """
    Return the sentence after `n` random pairwise swaps.

    Works on a copy of `words`; each swap is delegated to swap_word().

    Args:
        words (list): Tokens of the sentence.
        n (int): Number of swaps to perform.

    Returns:
        str: The augmented sentence, tokens joined by single spaces.
    """
    shuffled = words.copy()
    for _round in range(n):
        shuffled = swap_word(shuffled)
    sentence = ' '.join(shuffled)
    return sentence

def swap_word(new_words):
    """
    Swap two randomly chosen positions of a token list, in place.

    The second position is re-drawn up to three times while it equals the
    first; if it still coincides the swap is a no-op.

    Args:
        new_words (list): Tokens of the sentence; modified in place.

    Returns:
        list: The same list object, after the swap.
    """
    if not new_words:
        # No positions to draw from; random.randint(0, -1) would raise
        # ValueError.
        return new_words

    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1 and counter < 3:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

def random_deletion(words, p=0.5):
    """
    Randomly drop words from a sentence.

    Each word is kept when a fresh uniform draw from [0, 1] exceeds `p`. If
    every word is dropped, one randomly chosen original word is returned
    instead. Sentences with zero or one word are returned unchanged.

    Args:
        words (list): Tokens of the sentence.
        p (float): Threshold the uniform draw must exceed for a word to be kept.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # `<= 1` also covers the empty list, which would otherwise reach
    # random.choice([]) below and raise IndexError.
    if len(words) <= 1:
        return ' '.join(words)

    remaining = [word for word in words if random.uniform(0, 1) > p]
    if not remaining:
        return random.choice(words)
    return ' '.join(remaining)

def augment_text(text):
    """
    Apply one randomly chosen augmentation technique to a text.

    The technique is picked uniformly from synonym replacement, random
    insertion, random swap and random deletion, each with its default
    settings. The text is tokenized by splitting on single spaces.

    Args:
        text (str): The sentence to augment.

    Returns:
        str: The augmented sentence.
    """
    technique = random.choice(
        [synonym_replacement, random_insertion, random_swap, random_deletion]
    )
    tokens = text.split(' ')
    return technique(tokens)

def calculate_augmented_counts(original_counts, total_target):
    """
    Calculate the number of samples needed for each class to maintain the original proportion.

    Args:
        original_counts (pandas.Series): Current sample count per class.
        total_target (int): Desired total number of samples over all classes.

    Returns:
        pandas.Series: Target count per class (same index), rounded to int.
    """
    class_share = original_counts / original_counts.values.sum()
    scaled = total_target * class_share
    return scaled.round().astype(int)

def proportional_augmentation(df, original_text_col, label_col, total_target_size):
    """
    Grow a labelled text dataset with augmented samples while keeping the class mix.

    For every class, the number of extra samples is the class's share of
    `total_target_size` minus its current size. Each extra sample is produced
    by applying augment_text() to a randomly drawn text of that class. Classes
    that already meet their target get no extra samples.

    Args:
        df (pandas.DataFrame): Source data. It is not modified.
        original_text_col (str): Name of the column holding the texts.
        label_col (str): Name of the column holding the class labels.
        total_target_size (int): Desired total number of rows.

    Returns:
        pandas.DataFrame: Two columns (`original_text_col`, `label_col`); the
        augmented rows come first, followed by all original rows.
    """
    augmented_texts = []
    augmented_labels = []

    original_counts = df[label_col].value_counts()
    target_counts = calculate_augmented_counts(original_counts, total_target_size)

    # Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
    for label, target in target_counts.items():
        samples_to_add = target - original_counts[label]

        if samples_to_add <= 0:
            continue

        class_texts = df.loc[df[label_col] == label, original_text_col].values

        for _ in range(samples_to_add):
            sample_text = np.random.choice(class_texts)
            augmented_texts.append(augment_text(sample_text))
            augmented_labels.append(label)

    df_augmented = pd.DataFrame({
        original_text_col: augmented_texts + df[original_text_col].tolist(),
        label_col: augmented_labels + df[label_col].tolist()
    })

    return df_augmented

# Assuming df_drug is already loaded (by the train/test preparation cell).
# Target: four times the current number of rows.
total_target_size = len(df_drug) * 4

# Augment the raw 'original_text' column, keeping the per-label proportions
df_drug_proportional_augmented = proportional_augmentation(df_drug, "original_text", "label", total_target_size)

# Checking the shape of the augmented dataframe
print(f"Shape of the proportionally augmented dataframe: {df_drug_proportional_augmented.shape}")

# Checking the distribution of the classes
class_distribution_proportional = df_drug_proportional_augmented['label'].value_counts()
print("\nClass distribution in the proportionally augmented dataframe:")
print(class_distribution_proportional)


In [None]:
# Preview the first rows; augmented samples are placed before the originals
df_drug_proportional_augmented.head()

In [None]:
# Saving the proportionally augmented dataframe to CSV
# NOTE(review): the ETL cell reads 'data/drug_proportional_renamed.csv' and
# expects a 'text' column, while this file has 'original_text' - presumably a
# renamed copy of this output is created outside the notebook; confirm.
df_drug_proportional_augmented.to_csv("data/drug_proportional_augmented.csv", index=False)


In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import itertools

def plot_combined_distributions_academic(df1, df2, text_column, label1="Original", label2="Augmented"):
    """Plots a comparative analysis of text distributions for two datasets.

    Three panels are drawn side by side: sentence lengths (overlaid
    histograms), top 30 word frequencies and top 30 bigram frequencies (stacked
    bars, first dataset at the bottom). The figure is saved as
    "Comparative_Analysis_Original_vs_Augmented_Text_Distributions.pdf" in the
    current working directory and then shown.

    Parameters:
    - df1, df2: DataFrames containing the text data.
    - text_column: The name of the column in the DataFrames that contains the text data.
      Rows with a missing text are ignored.
    - label1, label2: Labels for the datasets.

    Returns:
    - None.
    """
    lengths1, word_freq1, bigram_freq1 = _text_distribution_stats(df1[text_column])
    lengths2, word_freq2, bigram_freq2 = _text_distribution_stats(df2[text_column])

    # Initialize figure and axes
    fig, axs = plt.subplots(1, 3, figsize=(20, 6))

    # Plot sentence length distribution
    axs[0].hist(lengths1, bins=50, color='blue', alpha=0.7, label=label1)
    axs[0].hist(lengths2, bins=50, color='red', alpha=0.5, label=label2)
    axs[0].set_title('Sentence Length Distribution')
    axs[0].set_xlabel('Number of Words')
    axs[0].set_ylabel('Number of Sentences')
    axs[0].legend()

    # Plot top 30 word frequencies.
    # dict.fromkeys() builds the union in first-seen order, so the bar order is
    # the same on every run (a set would order the words arbitrarily).
    top_words = list(dict.fromkeys([*word_freq1, *word_freq2]))
    word_counts1 = [word_freq1.get(word, 0) for word in top_words]
    word_counts2 = [word_freq2.get(word, 0) for word in top_words]
    axs[1].bar(top_words, word_counts1, color='blue', alpha=0.7, label=label1)
    axs[1].bar(top_words, word_counts2, color='red', alpha=0.5, label=label2, bottom=word_counts1)
    axs[1].set_title('Top 30 Word Frequencies')
    axs[1].tick_params(axis='x', rotation=90)
    axs[1].set_ylabel('Count')
    axs[1].legend()

    # Plot top 30 bigram frequencies
    top_bigrams = list(dict.fromkeys([*bigram_freq1, *bigram_freq2]))
    bigram_names = [' '.join(bigram) for bigram in top_bigrams]
    bigram_counts1 = [bigram_freq1.get(bigram, 0) for bigram in top_bigrams]
    bigram_counts2 = [bigram_freq2.get(bigram, 0) for bigram in top_bigrams]
    axs[2].bar(bigram_names, bigram_counts1, color='blue', alpha=0.7, label=label1)
    axs[2].bar(bigram_names, bigram_counts2, color='red', alpha=0.5, label=label2, bottom=bigram_counts1)
    axs[2].set_title('Top 30 Bigram Frequencies')
    axs[2].tick_params(axis='x', rotation=90)
    axs[2].set_ylabel('Count')
    axs[2].legend()

    plt.tight_layout()
    plt.savefig("Comparative_Analysis_Original_vs_Augmented_Text_Distributions.pdf", format='pdf', dpi=300)
    plt.show()


def _text_distribution_stats(texts):
    """Return (sentence lengths, top-30 word counts, top-30 bigram counts) for a Series of texts."""
    # Missing texts are skipped; everything else is tokenized on whitespace.
    token_lists = texts.dropna().astype(str).str.split().tolist()

    lengths = [len(tokens) for tokens in token_lists]
    word_counts = Counter(itertools.chain.from_iterable(token_lists))
    # Bigrams are formed inside each text only. Pairing over the flattened
    # corpus would count the last word of one text with the first word of the
    # next as a bigram.
    bigram_counts = Counter(
        pair for tokens in token_lists for pair in zip(tokens, tokens[1:])
    )
    return lengths, dict(word_counts.most_common(30)), dict(bigram_counts.most_common(30))

# Invocation: compare the raw 'original_text' column of the drug dataset
# before and after proportional augmentation (default legend labels
# "Original" and "Augmented")
plot_combined_distributions_academic(df_drug, df_drug_proportional_augmented, "original_text")




## Visualizing Text Data: Word Clouds

In this section, a Word Cloud analysis is performed on two versions of our text data:

1. **Clean Text:** This is the preprocessed text where usernames, hashtags, and stopwords have been removed.
2. **Original Text:** This is the unprocessed, raw text from the dataset.

The code concatenates all the text entries into large strings, creates `WordCloud` objects, and generates word clouds for each set of text data. Subsequently, it uses matplotlib to plot these word clouds, providing a comparative visual representation of the most frequent words in both versions of the text.

This visualization provides insights into the effect of text preprocessing on our data, highlighting the most frequently occurring words in both cleaned and original text.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Load the DataFrames (this rebinds df1, df2 and df3 from the earlier
# label-distribution cell)
df1 = pd.read_csv('data/covid_cleaned_data.csv')
df2 = pd.read_csv('data/mental_health_modified_data.csv')
df3 = pd.read_csv('data/drug_cleaned_data.csv')

# `dataframes` and `titles` are parallel lists: titles[i] labels dataframes[i]
dataframes = [df1, df2, df3]
titles = ['COVID Dataset', 'Mental Dataset', 'Drug Dataset']

def generate_wordcloud(data, column_name):
    """
    Build a word cloud from the distinct space-separated tokens of one column.

    Args:
        data (pandas.DataFrame): Data holding the text column.
        column_name (str): Name of the column whose values are joined.

    Returns:
        WordCloud: The generated word cloud object.
    """
    joined_text = ' '.join(data[column_name].astype(str))

    # Remove duplicates: each distinct token is kept once.
    # NOTE(review): after de-duplication the cloud no longer sees how often a
    # token occurred in the column - confirm this is intended.
    unique_tokens = set(joined_text.split(" "))
    deduplicated_text = ' '.join(unique_tokens)

    cloud = WordCloud(
        colormap="Reds_r",
        background_color="white",
        max_words=5000,
        contour_width=3,
        contour_color='steelblue',
    )
    cloud.generate(deduplicated_text)
    return cloud

# For the academic style.
# Matplotlib renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names, so use whichever name this installation provides.
for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.rcParams["font.family"] = "serif"

# Left column: fully cleaned text; right column: raw text
text_columns = ["clean_text_without_usernames_hashtags_or_stopwords", "original_text"]

fig, axs = plt.subplots(len(dataframes), 2, figsize=(20, 5 * len(dataframes)))

# Increase space between subplots for better readability
fig.subplots_adjust(hspace=0.6, wspace=0.3)

# Keep every generated cloud so the printed word lists below describe exactly
# the clouds that were plotted, without generating each one a second time.
wordclouds = {}

for i, df in enumerate(dataframes):
    for column, column_name in enumerate(text_columns):
        wordcloud_obj = generate_wordcloud(df, column_name)
        wordclouds[(i, column_name)] = wordcloud_obj
        axs[i][column].imshow(wordcloud_obj)
        axs[i][column].axis("off")

        # Set titles
        axs[i][column].set_title(f"{titles[i]} - {column_name.replace('_', ' ').title()} Column", fontsize=14)

# Set an overall title
fig.suptitle('Word Cloud Analysis of Different Datasets', fontsize=18, fontweight='bold', y=1.02)

plt.tight_layout()
# bbox_inches='tight' extends the saved area to include the overall title,
# which sits above the figure's top edge (y=1.02) and would be cut off otherwise.
plt.savefig("wordcloud_analysis_academic.png", dpi=300, bbox_inches='tight')  # Save as high-dpi PNG

# Print top N words from each word cloud
N = 10

for (i, column_name), wordcloud_obj in wordclouds.items():
    word_freqs = wordcloud_obj.words_
    sorted_words = sorted(word_freqs.items(), key=lambda x: x[1], reverse=True)

    print(f"\nTop {N} words for {titles[i]} - {column_name.replace('_', ' ').title()} Column:")
    for word, freq in sorted_words[:N]:
        print(f"{word}: {freq:.2f}")

plt.show()


## TFIDF - SUPPORT VECTOR MACHINE TRAINING

### One hot encoding to int for TFIDF-SVM

In [None]:
def convert_onehot_to_int(y_data):
    """
    Turn one-hot encoded labels into integer class ids.

    Parameters:
    - y_data: Numpy array with either integer or one-hot encoded labels

    Returns:
    - For input with more than one dimension, the index of the largest value
      along axis 1; otherwise the input itself, unchanged.
    """
    # Anything with a single dimension (or none) is taken to be integer labels
    if y_data.ndim <= 1:
        return y_data
    return np.argmax(y_data, axis=1)

# Using the function for conversions:
# the SVM cells below work with integer class ids, so every label array
# produced by generate_train_test is converted here.
y_train_covid_svm = convert_onehot_to_int(y_train_covid)
y_test_covid_svm = convert_onehot_to_int(y_test_covid)

y_train_mental_svm = convert_onehot_to_int(y_train_mental)
y_test_mental_svm = convert_onehot_to_int(y_test_mental)

y_train_drug_svm = convert_onehot_to_int(y_train_drug)
y_test_drug_svm = convert_onehot_to_int(y_test_drug)




In [None]:
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import io
import os
from PIL import Image

def train_and_evaluate_svm(X, y, dataset_name):
    """
    Cross-validate a TF-IDF + linear SVM text classifier with 5 stratified folds.

    For every fold the function fits the vectorizer and the classifier on the
    training part, predicts the held-out part, and then:
      - saves the confusion matrix as
        'figures/confusion_matrix_{dataset_name}_fold_{k}.png',
      - logs the confusion matrix image, accuracy, precision, recall, F1 and
        the hyperparameters to TensorBoard under 'logs/{dataset_name}_v1',
      - prints the classification report.

    With exactly two classes, precision/recall/F1 are computed for class 1;
    otherwise all three use the support-weighted average.

    Parameters:
    - X: 1-D numpy array of documents (strings).
    - y: 1-D numpy array of non-negative integer class ids aligned with X.
    - dataset_name: Name used in the log directory, file names and printed output.

    Returns:
    - None.
    """
    log_dir = f'logs/{dataset_name}_v1'
    os.makedirs(log_dir, exist_ok=True)
    figures_dir = 'figures'
    os.makedirs(figures_dir, exist_ok=True)
    writer = tf.summary.create_file_writer(log_dir)

    # Hyperparameters (identical for every fold)
    tfidf_max_features = 5000
    svm_kernel = 'linear'
    svm_C = 1
    svm_gamma = 'scale'
    hyperparams_str = "\n".join([f"{key}: {value}" for key, value in {
        'tfidf_max_features': tfidf_max_features,
        'svm_kernel': svm_kernel,
        'svm_C': svm_C,
        'svm_gamma': svm_gamma
    }.items()])

    # Fix the class list once from the full label array and pass it to the
    # metric functions, so every fold's confusion matrix and report have one
    # row/column per class even if a class is missing from a fold.
    class_ids = np.arange(np.max(y) + 1)
    labels = [f'Class {i}' for i in class_ids]

    # One averaging mode for precision, recall and F1, so the three logged
    # values are comparable with each other.
    is_binary = len(np.unique(y)) == 2
    score_kwargs = {'pos_label': 1} if is_binary else {'average': 'weighted'}

    sns.set_style("whitegrid", {'axes.grid' : False})

    # Splitting the data using StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # TF-IDF (fitted on the training part only)
        tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_max_features)
        X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
        X_test_tfidf = tfidf_vectorizer.transform(X_test)

        # SVM
        clf = SVC(kernel=svm_kernel, C=svm_C, gamma=svm_gamma)
        clf.fit(X_train_tfidf, y_train)
        y_pred = clf.predict(X_test_tfidf)

        # Confusion Matrix Visualization
        conf_mat = confusion_matrix(y_test, y_pred, labels=class_ids)
        plt.figure(figsize=(10, 7))
        heatmap = sns.heatmap(conf_mat, annot=True, fmt='g', cmap='Blues', xticklabels=labels, yticklabels=labels, cbar=False)
        heatmap.tick_params(labelsize=12)
        plt.title(f'Confusion Matrix - Fold {fold + 1}', fontsize=15)
        plt.xlabel('Predicted labels', fontsize=14)
        plt.ylabel('True labels', fontsize=14)

        # Save Confusion Matrix Image
        fig_filename = os.path.join(figures_dir, f'confusion_matrix_{dataset_name}_fold_{fold + 1}.png')
        plt.savefig(fig_filename, format="png", bbox_inches='tight')
        print(f"Confusion matrix for fold {fold + 1} saved at {fig_filename}")

        # Render the same figure into memory for the TensorBoard image summary
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches='tight')
        buf.seek(0)
        image = Image.open(buf).convert("RGB")  # Convert to RGB
        image_np = np.array(image)
        plt.close()

        # Metrics Calculation
        precision = precision_score(y_test, y_pred, **score_kwargs)
        recall = recall_score(y_test, y_pred, **score_kwargs)
        f1 = f1_score(y_test, y_pred, **score_kwargs)
        accuracy = accuracy_score(y_test, y_pred)

        # Logging to TensorBoard
        with writer.as_default():
            tf.summary.image(f"confusion_matrix_fold_{fold + 1}", tf.convert_to_tensor(image_np)[tf.newaxis,...], step=fold + 1)
            tf.summary.scalar(f'Accuracy_fold_{fold + 1}', accuracy, step=fold + 1)
            tf.summary.scalar(f'Precision_fold_{fold + 1}', precision, step=fold + 1)
            tf.summary.scalar(f'Recall_fold_{fold + 1}', recall, step=fold + 1)
            tf.summary.scalar(f'F1_Score_fold_{fold + 1}', f1, step=fold + 1)
            tf.summary.text(f'Hyperparameters_fold_{fold + 1}', hyperparams_str, step=fold + 1)

        # Print Classification Report for Each Fold
        report = classification_report(y_test, y_pred, labels=class_ids, target_names=labels)
        print(f"\nClassification Report for {dataset_name} - Fold {fold + 1}:\n", report)

    # Make sure everything logged above is written to disk
    writer.flush()

# # Combine training and test data, then call the function
# train_and_evaluate_svm(np.concatenate([X_train_covid, X_test_covid]), np.concatenate([y_train_covid_svm, y_test_covid_svm]), "covid")
# train_and_evaluate_svm(np.concatenate([X_train_mental, X_test_mental]), np.concatenate([y_train_mental_svm, y_test_mental_svm]), "mental_health")
# train_and_evaluate_svm(np.concatenate([X_train_drug, X_test_drug]), np.concatenate([y_train_drug_svm, y_test_drug_svm]), "drug")

# No cell above creates the split or the integer labels for the augmented drug
# data, so build them here from df_aug_drug (same text column and class count
# as the original drug dataset) before training.
X_train_aug_drug, X_test_aug_drug, y_train_aug_drug, y_test_aug_drug = generate_train_test(df_aug_drug, "clean_text_without_usernames_hashtags_or_stopwords", "label", num_classes=3)
y_train_aug_drug_svm = convert_onehot_to_int(y_train_aug_drug)
y_test_aug_drug_svm = convert_onehot_to_int(y_test_aug_drug)

train_and_evaluate_svm(np.concatenate([X_train_aug_drug, X_test_aug_drug]), np.concatenate([y_train_aug_drug_svm, y_test_aug_drug_svm]), "drug_aug")


In [None]:
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import io
import os
from PIL import Image

def train_and_evaluate_svm(X, y, dataset_name):
    """Evaluate a bag-of-words LinearSVC with 5-fold stratified cross-validation.

    For every fold a ``CountVectorizer`` (at most 5000 features) is fitted on
    the training split only, a ``LinearSVC(C=0.1)`` is trained on the counts,
    and the fold's confusion matrix and scores are written to ``figures/`` and
    to TensorBoard under ``logs/<dataset_name>_countsvm``. A classification
    report is printed per fold.

    Args:
        X: Array of raw text documents; must support indexing with integer
            index arrays (``X[train_index]``).
        y: Array of integer class ids. Tick labels are generated from
            ``range(max(y) + 1)``, so ids are assumed to be 0..k-1.
        dataset_name: Used in the log directory, figure file names and the
            printed report headers.
    """
    log_dir = f'logs/{dataset_name}_countsvm'
    figures_dir = 'figures'
    # exist_ok avoids the check-then-create race and is done once, not per fold.
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(figures_dir, exist_ok=True)
    writer = tf.summary.create_file_writer(log_dir)

    # Derive the class set from the full label array so every fold uses the
    # same axes, even if a rare class is missing from one split.
    class_ids = list(range(int(np.max(y)) + 1))
    labels = [f'Class {i}' for i in class_ids]
    is_binary = len(np.unique(y)) == 2

    # Splitting the data using StratifiedKFold
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Vocabulary is learned from the training split only.
        count_vectorizer = CountVectorizer(max_features=5000)
        X_train_counts = count_vectorizer.fit_transform(X_train)
        X_test_counts = count_vectorizer.transform(X_test)

        # LinearSVC
        clf = LinearSVC(C=0.1)
        clf.fit(X_train_counts, y_train)
        y_pred = clf.predict(X_test_counts)

        # Confusion Matrix Visualization (explicit labels keep the matrix
        # shape in step with the tick labels).
        conf_mat = confusion_matrix(y_test, y_pred, labels=class_ids)
        sns.set_style("whitegrid", {'axes.grid' : False})
        fig = plt.figure(figsize=(10, 7))
        heatmap = sns.heatmap(conf_mat, annot=True, fmt='g', cmap='Blues', xticklabels=labels, yticklabels=labels, cbar=False)
        heatmap.tick_params(labelsize=12)
        plt.title(f'Confusion Matrix - Fold {fold + 1}', fontsize=15)
        plt.xlabel('Predicted labels', fontsize=14)
        plt.ylabel('True labels', fontsize=14)

        # Render the PNG once; the same bytes go to disk and to TensorBoard.
        buf = io.BytesIO()
        plt.savefig(buf, format="png", bbox_inches='tight')
        plt.close(fig)

        fig_filename = os.path.join(figures_dir, f'confusion_matrix_{dataset_name}_fold_{fold + 1}.png')
        with open(fig_filename, 'wb') as fig_file:
            fig_file.write(buf.getvalue())
        print(f"Confusion matrix for fold {fold + 1} saved at {fig_filename}")

        buf.seek(0)
        image = Image.open(buf).convert("RGB")  # Convert to RGB
        image_np = np.array(image)

        # Metrics Calculation
        # NOTE(review): in the multi-class branch precision is weighted while
        # recall and F1 are micro-averaged -- confirm this mix is intended.
        if is_binary:
            precision = precision_score(y_test, y_pred, pos_label=1)
            recall = recall_score(y_test, y_pred, pos_label=1)
            f1 = f1_score(y_test, y_pred, pos_label=1)
        else:
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='micro')
            f1 = f1_score(y_test, y_pred, average='micro')
        accuracy = accuracy_score(y_test, y_pred)

        # Logging to TensorBoard (one tag per fold, step == fold number)
        with writer.as_default():
            tf.summary.image(f"confusion_matrix_fold_{fold + 1}", tf.convert_to_tensor(image_np)[tf.newaxis,...], step=fold + 1)
            tf.summary.scalar(f'Accuracy_fold_{fold + 1}', accuracy, step=fold + 1)
            tf.summary.scalar(f'Precision_fold_{fold + 1}', precision, step=fold + 1)
            tf.summary.scalar(f'Recall_fold_{fold + 1}', recall, step=fold + 1)
            tf.summary.scalar(f'F1_Score_fold_{fold + 1}', f1, step=fold + 1)

        # Print Classification Report for Each Fold
        report = classification_report(y_test, y_pred, labels=class_ids, target_names=labels)
        print(f"\nClassification Report for {dataset_name} - Fold {fold + 1}:\n", report)



# Run the cross-validated CountVectorizer + LinearSVC evaluation on each
# dataset. The existing train/test splits are pooled first because the
# function creates its own stratified folds.
train_and_evaluate_svm(np.concatenate([X_train_covid, X_test_covid]), np.concatenate([y_train_covid_svm, y_test_covid_svm]), "covid")
train_and_evaluate_svm(np.concatenate([X_train_mental, X_test_mental]), np.concatenate([y_train_mental_svm, y_test_mental_svm]), "mental_health")
train_and_evaluate_svm(np.concatenate([X_train_drug, X_test_drug]), np.concatenate([y_train_drug_svm, y_test_drug_svm]), "drug")


# TrainEval_Word2Vec_BiGRU_LSTM_Attention_EarlyStop

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs/


In [None]:
!pip install keras-self-attention


### AttentiveEmoGRU-LSTM Word2Vec

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Bidirectional, GRU, Dense, Dropout, LSTM
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, CSVLogger
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from gensim.models import FastText
import seaborn as sns
import matplotlib.pyplot as plt
import io
from datetime import datetime
from gensim.models import KeyedVectors
from keras_self_attention import SeqWeightedAttention
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Conv1D, Bidirectional, GRU, Dense
from tensorflow.keras.optimizers import Adam
from keras_self_attention import SeqWeightedAttention
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from tensorflow.keras.layers import LSTM
import sys
import time
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import TensorBoard
import os
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import io


class Word2VecVectorizer:
    """Word -> embedding lookup backed by binary word2vec-format vectors.

    Words missing from the loaded vocabulary map to an all-zero vector of
    length ``embedding_dim``.
    """

    def __init__(self, word2vec_path):
        """Load the vectors stored at ``word2vec_path`` (binary word2vec format)."""
        self.model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
        # Read the dimensionality from the loaded vectors instead of assuming
        # 300, so the zero fallback always matches the real embedding width
        # (GoogleNews-vectors-negative300.bin yields 300).
        self.embedding_dim = self.model.vector_size

    def get_vector(self, word):
        """Return the embedding for ``word``, or zeros if it is out of vocabulary."""
        try:
            return self.model[word]
        except KeyError:
            return np.zeros(self.embedding_dim)



class TextDataPreprocessor:
    """Turn raw texts into fixed-length integer sequences with a Keras Tokenizer."""

    def __init__(self, max_sequence_length, max_num_words):
        """Create an unfitted tokenizer capped at ``max_num_words`` words.

        ``max_sequence_length`` is the length every sequence is padded or
        truncated to in :meth:`transform`.
        """
        self.max_sequence_length = max_sequence_length
        self.tokenizer = Tokenizer(num_words=max_num_words)

    def fit(self, texts):
        """Build the tokenizer vocabulary from ``texts``."""
        self.tokenizer.fit_on_texts(texts)

    def transform(self, texts):
        """Encode ``texts`` as word-index sequences padded to the configured length."""
        return pad_sequences(
            self.tokenizer.texts_to_sequences(texts),
            maxlen=self.max_sequence_length,
        )


class TextBiGRUModelBuilder:
    """Assemble the BiGRU -> BiGRU -> BiLSTM -> attention classifier on frozen word2vec embeddings."""

    def __init__(self, sequence_length, num_words, embedding_dim, num_classes, word2vec_vectorizer,tokenizer, dropout_rate=0.1, gru_units=64):
        """Store the hyper-parameters plus the vectorizer and fitted tokenizer.

        ``gru_units`` is also used as the unit count of the LSTM layer.
        """
        self.config = dict(
            sequence_length=sequence_length,
            num_words=num_words,
            embedding_dim=embedding_dim,
            num_classes=num_classes,
            dropout_rate=dropout_rate,
            gru_units=gru_units,
        )
        self.word2vec_vectorizer = word2vec_vectorizer
        self.tokenizer = tokenizer

    def build_embedding_matrix(self):
        """Return a ``(num_words, embedding_dim)`` matrix of pre-trained vectors.

        Row ``i`` holds the vector of the tokenizer word with index ``i``;
        words whose index is ``>= num_words`` are skipped and unused rows
        stay zero.
        """
        vocab_size = self.config["num_words"]
        matrix = np.zeros((vocab_size, self.config["embedding_dim"]))
        for token, row in self.tokenizer.word_index.items():
            if row >= vocab_size:
                continue
            # get_vector always returns an array (embedding or zeros), so no None check.
            matrix[row] = self.word2vec_vectorizer.get_vector(token)
        return matrix

    def build_model(self):
        """Build and compile the Keras model (categorical cross-entropy, Adam, accuracy)."""
        cfg = self.config
        pretrained = self.build_embedding_matrix()

        sequence_input = Input(shape=(cfg["sequence_length"],), dtype='int32')
        # Embeddings are initialised from the pre-trained matrix and kept frozen.
        x = Embedding(input_dim=cfg["num_words"],
                      output_dim=cfg["embedding_dim"],
                      weights=[pretrained],
                      input_length=cfg["sequence_length"],
                      trainable=False)(sequence_input)

        # Two bidirectional GRU layers then one bidirectional LSTM, each followed by dropout.
        for recurrent_cell in (GRU, GRU, LSTM):
            x = Bidirectional(recurrent_cell(cfg["gru_units"], return_sequences=True))(x)
            x = Dropout(cfg["dropout_rate"])(x)

        # The attention layer also returns its weights; they are discarded here.
        x, _ = SeqWeightedAttention(return_attention=True)(x)
        preds = Dense(cfg["num_classes"], activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        return model



def get_callbacks(fold_num, log_dir):
    """Return the Keras callbacks used while training one fold.

    The list holds, in order: early stopping on ``val_loss`` (patience 5),
    a TensorBoard logger writing to ``log_dir``, and a checkpoint that keeps
    only the model with the lowest ``val_loss``.

    NOTE: the checkpoint file name depends on ``fold_num`` only, so separate
    runs write to the same ``best_model_fold_<n>.h5`` paths.
    """
    checkpoint_path = f'best_model_fold_{fold_num}.h5'
    stop_early = EarlyStopping(monitor='val_loss', patience=5)
    board = TensorBoard(log_dir=log_dir, histogram_freq=1)
    keep_best = ModelCheckpoint(checkpoint_path, monitor='val_loss', mode='min', save_best_only=True, verbose=1)
    return [stop_early, board, keep_best]


def log_to_tensorboard(log_dir, epoch, confusion_mtx, class_labels, micro_f1, weighted_precision):
    """Write a confusion-matrix heatmap and two scalar scores to TensorBoard.

    Args:
        log_dir: Directory the summary writer is created for.
        epoch: Step value attached to every summary.
        confusion_mtx: Integer confusion matrix to draw (annotated with ``fmt="d"``).
        class_labels: Tick labels for both heatmap axes.
        micro_f1: Value logged as "Micro F1 Score".
        weighted_precision: Value logged as "Weighted Precision".
    """
    # Draw the heatmap on a fresh figure and capture it as PNG bytes in memory.
    figure = plt.figure(figsize=(10, 10))
    sns.heatmap(
        confusion_mtx,
        annot=True,
        fmt="d",
        cmap=plt.cm.Blues,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    plt.close(figure)
    png_buffer.seek(0)

    # Decode to an RGBA tensor and add the leading batch axis the image summary expects.
    decoded = tf.image.decode_png(png_buffer.getvalue(), channels=4)
    batched = tf.expand_dims(decoded, 0)

    summary_writer = tf.summary.create_file_writer(log_dir)
    with summary_writer.as_default():
        tf.summary.image("Confusion Matrix", batched, step=epoch)
        tf.summary.scalar("Micro F1 Score", micro_f1, step=epoch)
        tf.summary.scalar("Weighted Precision", weighted_precision, step=epoch)


def train_and_evaluate(X, y, num_classes, dataset_name="default_dataset"):
    """Cross-validate the word2vec BiGRU/LSTM attention model with 5 stratified folds.

    For each fold a tokenizer is fitted on the training texts, a fresh model
    is built and trained for up to 20 epochs, and the fold's confusion
    matrix, micro F1 and weighted precision are logged to TensorBoard with
    the fold number as the step.

    Args:
        X: Array of raw texts; must support indexing with integer index arrays.
        y: 2-D label array with one column per class; the class id is taken
            with ``argmax`` along axis 1.
        num_classes: Width of the softmax output layer.
        dataset_name: Used in the log directory name and progress messages.

    NOTE(review): the held-out fold is also passed as ``validation_data``, so
    early stopping and checkpointing see the same data the scores are
    computed on.
    """
    print(f"Training for {dataset_name} started...")
    log_dir = f"logs/{dataset_name}_word2vec_" + datetime.now().strftime("%Y%m%d-%H%M%S")
    max_num_words = 5000
    max_sequence_length = 100
    y_labels = np.argmax(y, axis=1)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    word2vec_path = "GoogleNews-vectors-negative300.bin"

    # The pre-trained vectors do not depend on the fold: load them once
    # instead of re-reading the embedding file for every split.
    word2vec_vectorizer = Word2VecVectorizer(word2vec_path)

    for fold_num, (train_index, test_index) in enumerate(skf.split(X, y_labels), start=1):
        print(f"Processing Fold {fold_num} for {dataset_name}...")
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

        # Vocabulary comes from the training split only.
        data_preprocessor = TextDataPreprocessor(max_sequence_length=max_sequence_length, max_num_words=max_num_words)
        data_preprocessor.fit(X_train)
        X_train_prep, X_test_prep = data_preprocessor.transform(X_train), data_preprocessor.transform(X_test)

        model_builder = TextBiGRUModelBuilder(sequence_length=max_sequence_length, num_words=max_num_words, embedding_dim=300, num_classes=num_classes,
                                    word2vec_vectorizer=word2vec_vectorizer, tokenizer=data_preprocessor.tokenizer, dropout_rate=0.1, gru_units=64)

        model = model_builder.build_model()

        callbacks = get_callbacks(fold_num, log_dir)

        model.fit(X_train_prep, y_train, validation_data=(X_test_prep, y_test), epochs=20, batch_size=128, callbacks=callbacks)

        y_pred = model.predict(X_test_prep)
        y_pred_labels = np.argmax(y_pred, axis=1)

        y_true_labels = y_labels[test_index]
        confusion_mtx = confusion_matrix(y_true_labels, y_pred_labels)
        micro_f1 = f1_score(y_true_labels, y_pred_labels, average='micro')
        weighted_precision = precision_score(y_true_labels, y_pred_labels, average='weighted')

        log_to_tensorboard(log_dir, fold_num, confusion_mtx, list(range(num_classes)), micro_f1, weighted_precision)

    print(f"Training for {dataset_name} completed!")


In [None]:
# Cross-validate the word2vec model on each dataset. Train and test splits
# are pooled because train_and_evaluate builds its own stratified folds.
# COVID and drug are run with three classes, mental health with two.
train_and_evaluate(np.concatenate([X_train_covid, X_test_covid]), np.concatenate([y_train_covid, y_test_covid]), num_classes=3, dataset_name="covid_dataset")
train_and_evaluate(np.concatenate([X_train_mental, X_test_mental]), np.concatenate([y_train_mental, y_test_mental]), num_classes=2, dataset_name="mental_dataset")
train_and_evaluate(np.concatenate([X_train_drug, X_test_drug]), np.concatenate([y_train_drug, y_test_drug]), num_classes=3, dataset_name="drug_dataset")


### AttentiveEmoGRU-LSTM FastText

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Bidirectional, GRU, Dense, Dropout, LSTM
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, CSVLogger
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from gensim.models import FastText
import seaborn as sns
import matplotlib.pyplot as plt
import io
from datetime import datetime
from gensim.models import KeyedVectors
from keras_self_attention import SeqWeightedAttention
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Conv1D, Bidirectional, GRU, Dense
from tensorflow.keras.optimizers import Adam
from keras_self_attention import SeqWeightedAttention
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, log_loss
from tensorflow.keras.layers import LSTM
import sys
import time
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import TensorBoard
import os
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import io


class FastTextVectorizer:
    """Word -> embedding lookup backed by fastText vectors in text (.vec) format.

    Words missing from the loaded vocabulary map to an all-zero vector of
    length ``embedding_dim``.
    """

    def __init__(self, fasttext_path):
        """Load the vectors stored at ``fasttext_path`` (text word2vec/.vec format)."""
        self.model = KeyedVectors.load_word2vec_format(fasttext_path, binary=False)  # Set binary to False for .vec format
        # Read the dimensionality from the loaded vectors rather than assuming
        # 300, so the zero fallback always matches the real embedding width.
        self.embedding_dim = self.model.vector_size

    def get_vector(self, word):
        """Return the embedding for ``word``, or zeros if it is out of vocabulary."""
        try:
            return self.model[word]
        except KeyError:
            return np.zeros(self.embedding_dim)




class TextDataPreprocessor:
    """Tokenize texts and pad them to a common length."""

    def __init__(self, max_sequence_length, max_num_words):
        """Set up an unfitted Keras ``Tokenizer`` limited to ``max_num_words`` words.

        ``max_sequence_length`` is the ``maxlen`` used when padding in
        :meth:`transform`.
        """
        self.max_sequence_length = max_sequence_length
        self.tokenizer = Tokenizer(num_words=max_num_words)

    def fit(self, texts):
        """Learn the word index from ``texts``."""
        self.tokenizer.fit_on_texts(texts)

    def transform(self, texts):
        """Map ``texts`` to integer sequences and pad them to ``max_sequence_length``."""
        token_ids = self.tokenizer.texts_to_sequences(texts)
        padded = pad_sequences(token_ids, maxlen=self.max_sequence_length)
        return padded


class TextBiGRUModelBuilder:
    """Assemble the BiGRU -> BiGRU -> BiLSTM -> attention classifier on frozen fastText embeddings."""

    def __init__(self, sequence_length, num_words, embedding_dim, num_classes, fasttext_vectorizer,tokenizer, dropout_rate=0.1, gru_units=64):
        """Store the hyper-parameters plus the vectorizer and fitted tokenizer.

        ``gru_units`` is also used as the unit count of the LSTM layer.
        """
        self.config = dict(
            sequence_length=sequence_length,
            num_words=num_words,
            embedding_dim=embedding_dim,
            num_classes=num_classes,
            dropout_rate=dropout_rate,
            gru_units=gru_units,
        )
        self.fasttext_vectorizer = fasttext_vectorizer
        self.tokenizer = tokenizer

    def build_embedding_matrix(self):
        """Return a ``(num_words, embedding_dim)`` matrix of pre-trained vectors.

        Row ``i`` holds the vector of the tokenizer word with index ``i``;
        words whose index is ``>= num_words`` are skipped and unused rows
        stay zero.
        """
        row_limit = self.config["num_words"]
        weights = np.zeros((row_limit, self.config["embedding_dim"]))
        kept = [(word, idx) for word, idx in self.tokenizer.word_index.items() if idx < row_limit]
        for word, idx in kept:
            # The vectorizer's get_vector returns an array for every word
            # (embedding or zeros), so there is nothing to filter out here.
            weights[idx] = self.fasttext_vectorizer.get_vector(word)
        return weights

    def build_model(self):
        """Build and compile the Keras model (categorical cross-entropy, Adam, accuracy)."""
        settings = self.config
        pretrained = self.build_embedding_matrix()

        sequence_input = Input(shape=(settings["sequence_length"],), dtype='int32')
        # Embeddings are initialised from the pre-trained matrix and kept frozen.
        hidden = Embedding(input_dim=settings["num_words"],
                           output_dim=settings["embedding_dim"],
                           weights=[pretrained],
                           input_length=settings["sequence_length"],
                           trainable=False)(sequence_input)

        # Recurrent stack: GRU, GRU, LSTM (all bidirectional), dropout after each.
        for recurrent_cell in (GRU, GRU, LSTM):
            hidden = Bidirectional(recurrent_cell(settings["gru_units"], return_sequences=True))(hidden)
            hidden = Dropout(settings["dropout_rate"])(hidden)

        # The attention layer also returns its weights; they are discarded here.
        hidden, _ = SeqWeightedAttention(return_attention=True)(hidden)
        preds = Dense(settings["num_classes"], activation='softmax')(hidden)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        return model



def get_callbacks(fold_num, log_dir):
    """Build the per-fold training callbacks.

    Returns a list with early stopping on ``val_loss`` (patience 5), a
    TensorBoard logger for ``log_dir`` and a best-only checkpoint on
    ``val_loss``.

    NOTE: the checkpoint path contains only ``fold_num``, so different runs
    share the same ``best_model_fold_<n>.h5`` files.
    """
    callbacks = []
    callbacks.append(EarlyStopping(monitor='val_loss', patience=5))
    callbacks.append(TensorBoard(log_dir=log_dir, histogram_freq=1))
    callbacks.append(
        ModelCheckpoint(f'best_model_fold_{fold_num}.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)
    )
    return callbacks


def log_to_tensorboard(log_dir, epoch, confusion_mtx, class_labels, micro_f1, weighted_precision):
    """Log a confusion-matrix image and two scores as TensorBoard summaries.

    Args:
        log_dir: Directory the summary writer is created for.
        epoch: Step value attached to every summary.
        confusion_mtx: Integer confusion matrix to draw (annotated with ``fmt="d"``).
        class_labels: Tick labels for both heatmap axes.
        micro_f1: Value logged as "Micro F1 Score".
        weighted_precision: Value logged as "Weighted Precision".
    """
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap=plt.cm.Blues,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )

    # Render the heatmap to an in-memory PNG, then release the figure.
    figure = plt.figure(figsize=(10, 10))
    sns.heatmap(confusion_mtx, **heatmap_options)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    plt.close(figure)
    png_buffer.seek(0)

    # RGBA tensor with a leading batch axis for the image summary.
    image = tf.expand_dims(tf.image.decode_png(png_buffer.getvalue(), channels=4), 0)

    summary_writer = tf.summary.create_file_writer(log_dir)
    with summary_writer.as_default():
        tf.summary.image("Confusion Matrix", image, step=epoch)
        tf.summary.scalar("Micro F1 Score", micro_f1, step=epoch)
        tf.summary.scalar("Weighted Precision", weighted_precision, step=epoch)


def train_and_evaluate(X, y, num_classes, dataset_name="default_dataset"):
    """Cross-validate the fastText BiGRU/LSTM attention model with 5 stratified folds.

    For each fold a tokenizer is fitted on the training texts, a fresh model
    is built and trained for up to 20 epochs, and the fold's confusion
    matrix, micro F1 and weighted precision are logged to TensorBoard with
    the fold number as the step.

    Args:
        X: Array of raw texts; must support indexing with integer index arrays.
        y: 2-D label array with one column per class; the class id is taken
            with ``argmax`` along axis 1.
        num_classes: Width of the softmax output layer.
        dataset_name: Used in the log directory name and progress messages.

    NOTE(review): the held-out fold is also passed as ``validation_data``, so
    early stopping and checkpointing see the same data the scores are
    computed on.
    """
    print(f"Training for {dataset_name} started...")
    log_dir = f"logs/{dataset_name}_fasttext_" + datetime.now().strftime("%Y%m%d-%H%M%S")
    max_num_words = 5000
    max_sequence_length = 100
    y_labels = np.argmax(y, axis=1)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fasttext_path = "cc.en.300.vec"

    # The pre-trained vectors do not depend on the fold: parse the .vec file
    # once instead of once per split.
    fasttext_vectorizer = FastTextVectorizer(fasttext_path)

    for fold_num, (train_index, test_index) in enumerate(skf.split(X, y_labels), start=1):
        print(f"Processing Fold {fold_num} for {dataset_name}...fasttext")
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

        # Vocabulary comes from the training split only.
        data_preprocessor = TextDataPreprocessor(max_sequence_length=max_sequence_length, max_num_words=max_num_words)
        data_preprocessor.fit(X_train)
        X_train_prep, X_test_prep = data_preprocessor.transform(X_train), data_preprocessor.transform(X_test)

        model_builder = TextBiGRUModelBuilder(sequence_length=max_sequence_length, num_words=max_num_words, embedding_dim=300, num_classes=num_classes,
                                    fasttext_vectorizer=fasttext_vectorizer, tokenizer=data_preprocessor.tokenizer, dropout_rate=0.1, gru_units=64)

        model = model_builder.build_model()

        callbacks = get_callbacks(fold_num, log_dir)

        model.fit(X_train_prep, y_train, validation_data=(X_test_prep, y_test), epochs=20, batch_size=128, callbacks=callbacks)

        y_pred = model.predict(X_test_prep)
        y_pred_labels = np.argmax(y_pred, axis=1)

        y_true_labels = y_labels[test_index]
        confusion_mtx = confusion_matrix(y_true_labels, y_pred_labels)
        micro_f1 = f1_score(y_true_labels, y_pred_labels, average='micro')
        weighted_precision = precision_score(y_true_labels, y_pred_labels, average='weighted')

        log_to_tensorboard(log_dir, fold_num, confusion_mtx, list(range(num_classes)), micro_f1, weighted_precision)

    print(f"Training for {dataset_name} completed!")


In [None]:
# Cross-validate the fastText model on each dataset. Train and test splits
# are pooled because train_and_evaluate builds its own stratified folds.
# COVID and drug are run with three classes, mental health with two.
train_and_evaluate(np.concatenate([X_train_covid, X_test_covid]), np.concatenate([y_train_covid, y_test_covid]), num_classes=3, dataset_name="covid_dataset")
train_and_evaluate(np.concatenate([X_train_mental, X_test_mental]), np.concatenate([y_train_mental, y_test_mental]), num_classes=2, dataset_name="mental_dataset")
train_and_evaluate(np.concatenate([X_train_drug, X_test_drug]), np.concatenate([y_train_drug, y_test_drug]), num_classes=3, dataset_name="drug_dataset")


### AttentiveEmoGRU-LSTM GloVe

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Bidirectional, GRU, Dense, Dropout, LSTM
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, CSVLogger
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score, precision_score
from gensim.models import FastText
import seaborn as sns
import matplotlib.pyplot as plt
import io
from datetime import datetime

class GloveVectorizer:
    """Word -> embedding lookup loaded from a GloVe text file.

    Each line of the file is ``<word> <v1> <v2> ...`` separated by
    whitespace. Words that are not in the file map to an all-zero vector of
    length ``embedding_dim``.
    """

    def __init__(self, glove_path, embedding_dim=100):
        """Parse ``glove_path`` into memory.

        ``embedding_dim`` is only used for the zero fallback vector, so it
        must match the dimensionality of the file being loaded.
        """
        self.embedding_dim = embedding_dim
        self.embeddings_index = self._load_glove(glove_path)

    def _load_glove(self, glove_path):
        """Return a dict mapping each word in the file to a float32 vector."""
        embeddings_index = {}
        with open(glove_path, 'r', encoding='utf8') as f:
            for line in f:
                values = line.split()
                # Skip blank lines (e.g. a trailing newline at the end of the
                # file) instead of failing on values[0].
                if not values:
                    continue
                word, *coefs = values
                embeddings_index[word] = np.asarray(coefs, dtype='float32')
        return embeddings_index

    def get_vector(self, word):
        """Return the embedding for ``word``, or zeros if it is not in the file."""
        return self.embeddings_index.get(word, np.zeros(self.embedding_dim))


class TextDataPreprocessor:
    """Fit a Keras tokenizer and convert texts to padded index sequences."""

    def __init__(self, max_sequence_length, max_num_words):
        """Keep the target sequence length and create a tokenizer with ``num_words=max_num_words``."""
        self.max_sequence_length = max_sequence_length
        self.tokenizer = Tokenizer(num_words=max_num_words)

    def fit(self, texts):
        """Update the tokenizer vocabulary with ``texts``."""
        self.tokenizer.fit_on_texts(texts)

    def transform(self, texts):
        """Return ``texts`` as integer sequences padded to ``max_sequence_length``."""
        target_length = self.max_sequence_length
        return pad_sequences(self.tokenizer.texts_to_sequences(texts), maxlen=target_length)


class TextBiGRUModelBuilder:
    """Assemble the BiGRU -> BiGRU -> BiLSTM -> attention classifier on frozen GloVe embeddings."""

    def __init__(self, sequence_length, num_words, embedding_dim, num_classes, glove_vectorizer,tokenizer, dropout_rate=0.1, gru_units=64):
        """Store the hyper-parameters plus the vectorizer and fitted tokenizer.

        ``gru_units`` is also used as the unit count of the LSTM layer.
        """
        self.config = dict(
            sequence_length=sequence_length,
            num_words=num_words,
            embedding_dim=embedding_dim,
            num_classes=num_classes,
            dropout_rate=dropout_rate,
            gru_units=gru_units,
        )
        self.glove_vectorizer = glove_vectorizer
        self.tokenizer = tokenizer

    def build_embedding_matrix(self):
        """Return a ``(num_words, embedding_dim)`` matrix of pre-trained vectors.

        Row ``i`` holds the vector of the tokenizer word with index ``i``.
        Words whose index is ``>= num_words`` and words for which the
        vectorizer returns ``None`` are skipped; their rows stay zero.
        """
        vocab_size = self.config["num_words"]
        matrix = np.zeros((vocab_size, self.config["embedding_dim"]))
        for token, row in self.tokenizer.word_index.items():
            if row >= vocab_size:
                continue
            vector = self.glove_vectorizer.get_vector(token)
            if vector is None:
                continue
            matrix[row] = vector
        return matrix

    def build_model(self):
        """Build and compile the Keras model (categorical cross-entropy, Adam, accuracy)."""
        cfg = self.config
        pretrained = self.build_embedding_matrix()

        sequence_input = Input(shape=(cfg["sequence_length"],), dtype='int32')
        # Embeddings are initialised from the pre-trained matrix and kept frozen.
        features = Embedding(input_dim=cfg["num_words"],
                             output_dim=cfg["embedding_dim"],
                             weights=[pretrained],
                             input_length=cfg["sequence_length"],
                             trainable=False)(sequence_input)

        # Recurrent stack: GRU, GRU, LSTM (all bidirectional), dropout after each.
        for recurrent_cell in (GRU, GRU, LSTM):
            features = Bidirectional(recurrent_cell(cfg["gru_units"], return_sequences=True))(features)
            features = Dropout(cfg["dropout_rate"])(features)

        # The attention layer also returns its weights; they are discarded here.
        features, _ = SeqWeightedAttention(return_attention=True)(features)
        preds = Dense(cfg["num_classes"], activation='softmax')(features)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        return model



def get_callbacks(fold_num, log_dir):
    """Create the callbacks passed to ``model.fit`` for one fold.

    In order: early stopping on ``val_loss`` with patience 5, TensorBoard
    logging into ``log_dir``, and a checkpoint that saves only the model with
    the lowest ``val_loss``.

    NOTE: the checkpoint path contains only ``fold_num``, so different runs
    share the same ``best_model_fold_<n>.h5`` files.
    """
    monitored = 'val_loss'
    early_stopping = EarlyStopping(monitor=monitored, patience=5)
    tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=1)
    checkpoint = ModelCheckpoint(
        f'best_model_fold_{fold_num}.h5',
        monitor=monitored,
        mode='min',
        save_best_only=True,
        verbose=1,
    )
    return [early_stopping, tensorboard, checkpoint]


def log_to_tensorboard(log_dir, epoch, confusion_mtx, class_labels, micro_f1, weighted_precision):
    """Send a confusion-matrix heatmap and two scores to TensorBoard.

    Args:
        log_dir: Directory the summary writer is created for.
        epoch: Step value attached to every summary.
        confusion_mtx: Integer confusion matrix to draw (annotated with ``fmt="d"``).
        class_labels: Tick labels for both heatmap axes.
        micro_f1: Value logged as "Micro F1 Score".
        weighted_precision: Value logged as "Weighted Precision".
    """
    # Plot onto a new figure and capture it as PNG bytes.
    figure = plt.figure(figsize=(10, 10))
    sns.heatmap(confusion_mtx, annot=True, fmt="d", cmap=plt.cm.Blues,
                xticklabels=class_labels, yticklabels=class_labels)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    png_buffer = io.BytesIO()
    plt.savefig(png_buffer, format='png')
    plt.close(figure)
    png_buffer.seek(0)
    png_bytes = png_buffer.getvalue()

    # RGBA tensor with a leading batch axis for the image summary.
    rgba = tf.image.decode_png(png_bytes, channels=4)
    image_batch = tf.expand_dims(rgba, 0)

    scalars = (
        ("Micro F1 Score", micro_f1),
        ("Weighted Precision", weighted_precision),
    )
    with tf.summary.create_file_writer(log_dir).as_default():
        tf.summary.image("Confusion Matrix", image_batch, step=epoch)
        for tag, value in scalars:
            tf.summary.scalar(tag, value, step=epoch)


def train_and_evaluate(X, y, num_classes, dataset_name="default_dataset"):
    """Cross-validate the GloVe BiGRU/LSTM attention model with 5 stratified folds.

    For each fold a tokenizer is fitted on the training texts, a fresh model
    is built and trained for up to 20 epochs, and the fold's confusion
    matrix, micro F1 and weighted precision are logged to TensorBoard with
    the fold number as the step.

    Args:
        X: Array of raw texts; must support indexing with integer index arrays.
        y: 2-D label array with one column per class; the class id is taken
            with ``argmax`` along axis 1.
        num_classes: Width of the softmax output layer.
        dataset_name: Used in the log directory name and progress messages.

    NOTE(review): the held-out fold is also passed as ``validation_data``, so
    early stopping and checkpointing see the same data the scores are
    computed on.
    """
    print(f"Training for {dataset_name} started...")
    log_dir = f"logs/{dataset_name}_glove_" + datetime.now().strftime("%Y%m%d-%H%M%S")
    max_num_words = 5000
    max_sequence_length = 100
    embedding_dim = 50  # must match the dimensionality of the GloVe file below
    y_labels = np.argmax(y, axis=1)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    glove_path = "glove.6B.50d.txt"

    # The pre-trained vectors do not depend on the fold: parse the file once
    # instead of once per split.
    glove_vectorizer = GloveVectorizer(glove_path, embedding_dim=embedding_dim)

    for fold_num, (train_index, test_index) in enumerate(skf.split(X, y_labels), start=1):
        print(f"Processing Fold {fold_num} for {dataset_name}...")
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]

        # Vocabulary comes from the training split only.
        data_preprocessor = TextDataPreprocessor(max_sequence_length=max_sequence_length, max_num_words=max_num_words)
        data_preprocessor.fit(X_train)
        X_train_prep, X_test_prep = data_preprocessor.transform(X_train), data_preprocessor.transform(X_test)

        model_builder = TextBiGRUModelBuilder(sequence_length=max_sequence_length, num_words=max_num_words, embedding_dim=embedding_dim, num_classes=num_classes,
                                    glove_vectorizer=glove_vectorizer, tokenizer=data_preprocessor.tokenizer, dropout_rate=0.1, gru_units=64)

        model = model_builder.build_model()

        callbacks = get_callbacks(fold_num, log_dir)

        model.fit(X_train_prep, y_train, validation_data=(X_test_prep, y_test), epochs=20, batch_size=128, callbacks=callbacks)

        y_pred = model.predict(X_test_prep)
        y_pred_labels = np.argmax(y_pred, axis=1)

        y_true_labels = y_labels[test_index]
        confusion_mtx = confusion_matrix(y_true_labels, y_pred_labels)
        micro_f1 = f1_score(y_true_labels, y_pred_labels, average='micro')
        weighted_precision = precision_score(y_true_labels, y_pred_labels, average='weighted')

        log_to_tensorboard(log_dir, fold_num, confusion_mtx, list(range(num_classes)), micro_f1, weighted_precision)

    print(f"Training for {dataset_name} completed!")




In [None]:
# Cross-validate the GloVe model on each dataset. Train and test splits are
# pooled because train_and_evaluate builds its own stratified folds.
# COVID and drug are run with three classes, mental health with two.
train_and_evaluate(np.concatenate([X_train_covid, X_test_covid]), np.concatenate([y_train_covid, y_test_covid]), num_classes=3, dataset_name="covid_dataset")
train_and_evaluate(np.concatenate([X_train_mental, X_test_mental]), np.concatenate([y_train_mental, y_test_mental]), num_classes=2, dataset_name="mental_dataset")
train_and_evaluate(np.concatenate([X_train_drug, X_test_drug]), np.concatenate([y_train_drug, y_test_drug]), num_classes=3, dataset_name="drug_dataset")


## FINE_TUNING_TEST

# Shared Layer RoBERTa

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Per-fold scores for every (dataset, embedding) pair, hard-coded as literals.
# Rows are grouped as 3 datasets x 3 embeddings x 5 folds.
data = {
    'Fold': ['F1', 'F2', 'F3', 'F4', 'F5'] * 9,
    'Embedding': ['Word2Vec'] * 5 + ['FastText'] * 5 + ['Glove'] * 5 +
                 ['Word2Vec'] * 5 + ['FastText'] * 5 + ['Glove'] * 5 +
                 ['Word2Vec'] * 5 + ['FastText'] * 5 + ['Glove'] * 5,
    'Dataset': ['COVID'] * 15 + ['Mental'] * 15 + ['Drug Review'] * 15,
    'Micro F1': [
        0.796, 0.806, 0.806, 0.787, 0.807,
        0.773, 0.790, 0.793, 0.778, 0.778,
        0.731, 0.750, 0.757, 0.733, 0.748,
        0.909, 0.907, 0.900, 0.911, 0.909,
        0.912, 0.909, 0.909, 0.909, 0.909,
        0.896, 0.898, 0.898, 0.876, 0.902,
        0.693, 0.716, 0.701, 0.710, 0.713,
        0.729, 0.715, 0.695, 0.706, 0.706,
        0.715, 0.711, 0.721, 0.688, 0.678
    ],
    'Precision': [
        0.797, 0.805, 0.805, 0.786, 0.809,
        0.773, 0.789, 0.794, 0.779, 0.781,
        0.741, 0.751, 0.758, 0.734, 0.748,
        0.909, 0.907, 0.900, 0.911, 0.914,
        0.913, 0.909, 0.910, 0.910, 0.909,
        0.896, 0.898, 0.899, 0.905, 0.902,
        0.643, 0.662, 0.673, 0.671, 0.677,
        0.682, 0.683, 0.631, 0.633, 0.632,
        0.719, 0.660, 0.651, 0.618, 0.660
    ]
}

df = pd.DataFrame(data)
# 'F1'..'F5' -> 1..5 so the fold can be used as a numeric x axis.
df['Fold'] = df['Fold'].str[1:].astype(int)

plt.rcParams['axes.titlepad'] = 20
plt.rcParams['axes.labelpad'] = 10

# Contrasting color palette
palette = ["#E63946", "#1D3557", "#2A9D8F"]  # Red, Dark Blue, and Teal

# Define line styles
line_styles = {
    'Word2Vec': '-',
    'FastText': '--',
    'Glove': '-.'
}

# One colour per embedding, paired in the order the embeddings are listed
# above, so the colour is a direct lookup instead of a list search per use.
embedding_colors = dict(zip(line_styles, palette))

# Lists for datasets and metrics
datasets = ['COVID', 'Mental', 'Drug Review']
metrics = ['Micro F1', 'Precision']

# 2x3 grid: one row per metric, one column per dataset
fig, axes = plt.subplots(2, 3, figsize=(24, 11))

for j, dataset in enumerate(datasets):
    subset_df = df[df['Dataset'] == dataset]
    for i, metric in enumerate(metrics):
        ax = axes[i, j]

        # One line per embedding type
        for embedding, color in embedding_colors.items():
            subset_embedding_df = subset_df[subset_df['Embedding'] == embedding]
            sns.lineplot(data=subset_embedding_df, x='Fold', y=metric, ax=ax,
                         label=embedding, color=color,
                         linestyle=line_styles[embedding], linewidth=2.5)

            # Annotate data points: put the label below the point when the
            # series went down from the previous fold, above it otherwise.
            offset = 0.002
            prev_y = None
            for x, y in zip(subset_embedding_df['Fold'], subset_embedding_df[metric]):
                # Compare against None explicitly; a score of 0.0 is a valid
                # previous value and must not be treated as "no previous point".
                if prev_y is not None and y < prev_y:
                    position_y = y - offset
                else:
                    position_y = y + offset

                ax.text(x, position_y, f'{y:.3f}', color=color, ha='center')
                prev_y = y

        # Common configurations
        ax.grid(True, which='both', linestyle='--', linewidth=0.5)
        ax.set_title(f'{dataset} - {metric}', fontsize=14)

        # Hide individual legends
        ax.legend().set_visible(False)

        # Adjust labels
        ax.set_xlabel(ax.get_xlabel(), fontsize=12)
        ax.set_ylabel(ax.get_ylabel(), fontsize=12)

        # Adjust ticks
        ax.tick_params(axis='both', which='major', labelsize=12)

# Every panel holds the same three labelled lines, so take the handles for the
# shared legend from the last panel by index, not from the leaked loop variable.
lines, labels = axes[-1, -1].get_legend_handles_labels()
fig.legend(lines[:3], labels[:3], loc='lower center', bbox_to_anchor=(0.5, 1.05), fontsize=12, title='Embedding Type', ncol=3)

plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# New BERT data: per-fold scores for each dataset
data = {
    'Fold': ['F1', 'F2', 'F3', 'F4', 'F5'] * 3,
    'Dataset': ['COVID Dataset'] * 5 + ['Mental Dataset'] * 5 + ['Drug Dataset'] * 5,
    'Micro F1': [0.8385, 0.9066, 0.9430, 0.9539, 0.9578,
                 0.9410, 0.9778, 0.9845, 0.9852, 0.9859,
                 0.7150, 0.7367, 0.8939, 0.9110, 0.9318],
    'Weighted Precision': [0.8383, 0.9071, 0.9430, 0.9541, 0.9579,
                           0.9415, 0.9780, 0.9845, 0.9852, 0.9859,
                           0.5966, 0.7013, 0.8896, 0.9102, 0.9309]
}

df = pd.DataFrame(data)
df['Fold'] = df['Fold'].str[1:].astype(int)  # 'F1'..'F5' -> 1..5

# Define chart styles
plt.rcParams['axes.titlepad'] = 20
plt.rcParams['axes.labelpad'] = 10
palette = ["#E63946", "#1D3557", "#2A9D8F"]  # Red, Dark Blue, and Teal

# Lists for metrics
metrics = ['Micro F1', 'Weighted Precision']

# Dataset names taken from the data itself, in order of first appearance.
# A hand-written list previously contained 'Drug Review Dataset', which matches
# no row of `df`, so the third curve was never annotated.
dataset_names = list(df['Dataset'].unique())

# Create a 1x2 grid of subplots
fig, axes = plt.subplots(1, 2, figsize=(24, 6))

for i, metric in enumerate(metrics):
    ax = axes[i]

    # hue_order pins curve i to palette[i], the same color used for its annotations below
    sns.lineplot(data=df, x='Fold', y=metric, hue='Dataset', hue_order=dataset_names,
                 ax=ax, palette=palette, linewidth=2.5)

    # Annotate data points
    for idx, dataset in enumerate(dataset_names):
        subset_df = df[df['Dataset'] == dataset]
        prev_y = None
        for x, y in zip(subset_df['Fold'], subset_df[metric]):
            offset = 0.002
            # Label below the point when the score dropped from the previous fold, above otherwise
            if prev_y is not None and y < prev_y:
                position_y = y - offset
            else:
                position_y = y + offset
            ax.text(x, position_y, f'{y:.3f}', color=palette[idx], ha='center', fontsize=9)
            prev_y = y

    # Common configurations
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.set_title(f'{metric}', fontsize=14)

    # Adjust labels
    ax.set_xlabel(ax.get_xlabel(), fontsize=12)
    ax.set_ylabel(ax.get_ylabel(), fontsize=12)

    # Adjust ticks
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.legend(title='Dataset', fontsize=10, title_fontsize=12)

plt.tight_layout()
plt.show()


In [None]:
# One list per summary statistic; values are appended in (metric, dataset, embedding) loop order
statistics = {
    stat_name: []
    for stat_name in (
        "Mean", "Median", "Standard Deviation", "Variance", "Range", "IQR",
        "Skewness", "Kurtosis", "25th Percentile", "75th Percentile",
        "Coefficient of Variation", "Mode", "Count",
    )
}

embeddings = ['Word2Vec', 'FastText', 'Glove']
datasets = ['COVID', 'Mental', 'Drug Review']
metrics = ['Micro F1', 'Precision']

# LaTeX fragments are collected here and joined once at the end
latex_chunks = []

for metric in metrics:
    for dataset in datasets:
        for embedding in embeddings:
            row_mask = (df['Embedding'] == embedding) & (df['Dataset'] == dataset)
            subset = df[row_mask][metric]

            lower_q = subset.quantile(0.25)
            upper_q = subset.quantile(0.75)

            # Summary of this (metric, dataset, embedding) slice, keyed like `statistics`
            current = {
                "Mean": subset.mean(),
                "Median": subset.median(),
                "Standard Deviation": subset.std(),
                "Variance": subset.var(),
                "Range": subset.max() - subset.min(),
                "IQR": upper_q - lower_q,
                "Skewness": subset.skew(),
                "Kurtosis": subset.kurt(),
                "25th Percentile": lower_q,
                "75th Percentile": upper_q,
                "Coefficient of Variation": (subset.std() / subset.mean()) * 100,
                "Mode": subset.mode().iloc[0],  # first mode when several values tie
                "Count": subset.count(),
            }

            # Header row, one row per statistic, then a closing rule
            latex_chunks.append(f"\\textbf{{{dataset} - {embedding} - {metric}}} \\\\ \\hline \n")
            for stat, value in current.items():
                statistics[stat].append(value)
                latex_chunks.append(f"{stat} & {value:.3f} \\\\ \n")
            latex_chunks.append("\\hline \n")

latex_output = "".join(latex_chunks)

# Print the LaTeX table entries
print(latex_output)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Relies on `df`, `datasets` and `embeddings` already being defined by an earlier cell


def _mean_metric(embedding, dataset, metric):
    """Return the mean of `metric` over the rows of `df` for one embedding/dataset pair."""
    selected = df[(df['Embedding'] == embedding) & (df['Dataset'] == dataset)]
    return selected[metric].mean()


# Three stacked panels: heatmap, bar chart, box plot
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14, 20))
heatmap_ax, bar_ax, box_ax = axes[0], axes[1], axes[2]

# 1. Heatmap of the mean Micro F1 (rows: datasets, columns: embeddings)
heatmap_data = {
    "Word2Vec": [],
    "FastText": [],
    "Glove": []
}
for embedding in embeddings:
    heatmap_data[embedding].extend(
        _mean_metric(embedding, dataset, 'Micro F1') for dataset in datasets
    )

heatmap_df = pd.DataFrame(heatmap_data, index=datasets)
sns.heatmap(heatmap_df, cmap="YlGnBu", annot=True, ax=heatmap_ax)
heatmap_ax.set_title('Mean Micro F1 Score')

# 2. Bar chart of one metric on one dataset
dataset_of_interest = "COVID"  # or 'Mental' or 'Drug Review'
metric_of_interest = "Micro F1"

values = [_mean_metric(embedding, dataset_of_interest, metric_of_interest) for embedding in embeddings]
sns.barplot(x=embeddings, y=values, ax=bar_ax)
bar_ax.set_title(f'Mean {metric_of_interest} for {dataset_of_interest}')
bar_ax.set_ylabel(metric_of_interest)

# 3. Box plot of Micro F1 per embedding, split by dataset
sns.boxplot(data=df, x="Embedding", y="Micro F1", hue="Dataset", ax=box_ax)
box_ax.set_title('Distribution of Micro F1 across Embeddings and Datasets')

plt.tight_layout()
plt.show()



# pip

In [None]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, confusion_matrix
from tensorflow.keras.utils import to_categorical
from transformers import TFBertModel, BertTokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, TensorBoard
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score, precision_score
import tensorflow as tf
import io
from transformers import TFRobertaModel, RobertaTokenizer



# Encoder and tokenizer are loaded from the same pretrained checkpoint
ROBERTA_CHECKPOINT = 'roberta-base'
roberta_model = TFRobertaModel.from_pretrained(ROBERTA_CHECKPOINT)
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)




def create_model(num_classes):
    """Build and compile a RoBERTa-based classifier with a softmax head.

    The model takes two int32 inputs of length 100 (`input_ids` and
    `attention_mask`), averages the encoder's first output over the sequence
    axis, and maps the result to `num_classes` softmax probabilities. It is
    compiled with Adam (learning rate 1e-4), categorical cross-entropy and
    accuracy.

    A fresh pretrained encoder is loaded on every call. The module-level
    `roberta_model` is a single object whose weights are updated by training,
    so reusing it would make each cross-validation fold (and each dataset)
    start from weights already fine-tuned on earlier folds, including samples
    that are in the current fold's test split.

    Args:
        num_classes: Number of output classes (units of the softmax layer).

    Returns:
        A compiled `tf.keras` `Model`.
    """
    # Load pristine pretrained weights so that no training state leaks between calls
    encoder = TFRobertaModel.from_pretrained('roberta-base')

    input_ids = Input(shape=(100,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(100,), dtype=tf.int32, name='attention_mask')

    # Element 0 of the encoder output is pooled over the sequence axis
    roberta_output = encoder([input_ids, attention_mask])[0]
    pooled_output = tf.keras.layers.GlobalAveragePooling1D()(roberta_output)
    predictions = Dense(num_classes, activation='softmax', name='output')(pooled_output)
    model = Model(inputs=[input_ids, attention_mask], outputs=predictions)

    optimizer = Adam(learning_rate=0.0001)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

    return model




def get_callbacks(fold_num, log_dir):
    """Return the Keras callbacks used while training one fold.

    Args:
        fold_num: Fold number, embedded in the checkpoint file name.
        log_dir: Directory handed to the TensorBoard callback.

    Returns:
        A list `[EarlyStopping, TensorBoard, ModelCheckpoint]`; the first and
        last both monitor `val_loss`.
    """
    checkpoint_path = f'best_model_fold_{fold_num}.h5'

    stop_early = EarlyStopping(monitor='val_loss', patience=5)
    board = TensorBoard(log_dir=log_dir, histogram_freq=1)
    # Only the epoch with the lowest validation loss is kept on disk
    keep_best = ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=1,
    )
    return [stop_early, board, keep_best]

def log_to_tensorboard(log_dir, epoch, confusion_mtx, class_labels, micro_f1, weighted_precision):
    """Write a confusion-matrix image and two scalar metrics to TensorBoard.

    Args:
        log_dir: Directory for the summary file writer.
        epoch: Step value attached to every summary written here.
        confusion_mtx: Confusion matrix, rendered as an annotated integer heatmap.
        class_labels: Tick labels used on both heatmap axes.
        micro_f1: Value logged as "Micro F1 Score".
        weighted_precision: Value logged as "Weighted Precision".
    """
    # Render the heatmap on its own figure so it can be closed right after saving
    fig, heat_ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(confusion_mtx, annot=True, fmt="d", cmap=plt.cm.Blues,
                xticklabels=class_labels, yticklabels=class_labels, ax=heat_ax)
    heat_ax.set_ylabel('Actual')
    heat_ax.set_xlabel('Predicted')

    # Serialize the figure to an in-memory PNG
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format='png')
    plt.close(fig)

    # Decode to an RGBA tensor and add the leading batch axis expected by tf.summary.image
    decoded = tf.image.decode_png(png_buffer.getvalue(), channels=4)
    image = tf.expand_dims(decoded, 0)

    writer = tf.summary.create_file_writer(log_dir)
    with writer.as_default():
        tf.summary.image("Confusion Matrix", image, step=epoch)
        tf.summary.scalar("Micro F1 Score", micro_f1, step=epoch)
        tf.summary.scalar("Weighted Precision", weighted_precision, step=epoch)

def train_and_evaluate(X, y, num_classes, dataset_name="default_dataset"):
    """Train and score a new model on each of 5 stratified folds of (X, y).

    For every fold the texts are tokenized to length 100, a model from
    `create_model(num_classes)` is fitted for 3 epochs (20% of the training
    part held out for validation), and the confusion matrix, micro F1 and
    weighted precision on the fold's test part are logged to TensorBoard with
    the fold number as the step.

    Args:
        X: Array of input texts; indexed with integer index arrays and
            converted with `.tolist()`.
        y: Label array indexed the same way; class ids are obtained with
            `argmax` over axis 1 (presumably one-hot rows - confirm with caller).
        num_classes: Number of classes passed to `create_model`.
        dataset_name: Name used in progress messages and in the log directory.
    """

    def _encode(texts):
        # Tokenize a batch of texts and return [input_ids, attention_mask] as NumPy arrays
        encodings = tokenizer(texts.tolist(), truncation=True, padding='max_length',
                              max_length=100, return_tensors='tf')
        return [np.array(encodings['input_ids']), np.array(encodings['attention_mask'])]

    print(f"Training for {dataset_name} started...")
    log_dir = f"logs/{dataset_name}_roberta"

    # Stratification needs class ids rather than the per-class columns of `y`
    y_labels = np.argmax(y, axis=1)
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_num = 0
    for train_index, test_index in splitter.split(X, y_labels):
        fold_num += 1
        print(f"Processing Fold {fold_num} for {dataset_name}...")

        y_train, y_test = y[train_index], y[test_index]
        X_train_bert = _encode(X[train_index])
        X_test_bert = _encode(X[test_index])

        callbacks = get_callbacks(fold_num, log_dir)
        model = create_model(num_classes)
        model.fit(X_train_bert, y_train, batch_size=64, epochs=3,
                  validation_split=0.2, callbacks=callbacks)

        # Convert probabilities / label rows to class ids before scoring
        y_pred_classes = np.argmax(model.predict(X_test_bert), axis=1)
        y_test_classes = np.argmax(y_test, axis=1)

        confusion_mtx = confusion_matrix(y_test_classes, y_pred_classes)
        micro_f1 = f1_score(y_test_classes, y_pred_classes, average='micro')
        weighted_precision = precision_score(y_test_classes, y_pred_classes, average='weighted')

        log_to_tensorboard(log_dir, fold_num, confusion_mtx, list(range(num_classes)),
                           micro_f1, weighted_precision)

    print(f"Training for {dataset_name} completed!")

# Cross-validate on each corpus; its existing train/test arrays are concatenated first
all_texts = np.concatenate([X_train_covid, X_test_covid])
all_labels = np.concatenate([y_train_covid, y_test_covid])
train_and_evaluate(all_texts, all_labels, num_classes=3, dataset_name="covid_dataset")

all_texts = np.concatenate([X_train_mental, X_test_mental])
all_labels = np.concatenate([y_train_mental, y_test_mental])
train_and_evaluate(all_texts, all_labels, num_classes=2, dataset_name="mental_dataset")

all_texts = np.concatenate([X_train_drug, X_test_drug])
all_labels = np.concatenate([y_train_drug, y_test_drug])
train_and_evaluate(all_texts, all_labels, num_classes=3, dataset_name="drug_dataset")



In [None]:
# Prepare the data for each domain
# NOTE(review): `X_train_domain1/2`, `y_train_domain1/2`, `new_model`, `callbacks_list`
# and `num_epochs` are not defined in this cell; `new_model`, `callbacks_list` and
# `num_epochs` are assigned in a later cell of this file, so this cell depends on
# that cell having been run first.
# NOTE(review): `tokenizer` is whichever tokenizer was bound last (this file binds
# both a RoBERTa and a DistilBERT tokenizer) - confirm it matches `new_model`.
X_train_text_domain1 = X_train_domain1.tolist()
X_train_text_domain2 = X_train_domain2.tolist()

# Truncate/pad every text to exactly 100 tokens
train_encodings_domain1 = tokenizer(X_train_text_domain1, truncation=True, padding='max_length', max_length=100)
train_encodings_domain2 = tokenizer(X_train_text_domain2, truncation=True, padding='max_length', max_length=100)

# Model inputs as [input_ids, attention_mask] NumPy arrays
X_train_bert_domain1 = [np.array(train_encodings_domain1['input_ids']), np.array(train_encodings_domain1['attention_mask'])]
X_train_bert_domain2 = [np.array(train_encodings_domain2['input_ids']), np.array(train_encodings_domain2['attention_mask'])]

# Train the model on each domain
# Each outer iteration runs one epoch on domain 1 followed by one epoch on domain 2.
# NOTE(review): the `SharedLayer` class in this file chooses its output head from the
# `domain_index` argument of `call`; no visible code reads a `domain_index` attribute,
# so the attribute assignments below may have no effect on which head is trained - verify.
for epoch in range(num_epochs):
    # Train on domain1
    history1 = new_model.fit(X_train_bert_domain1, y_train_domain1, batch_size=64, epochs=1, validation_split=0.2, callbacks=callbacks_list, shuffle=True)
    # Update the shared layer to use domain2
    new_model.get_layer('shared_layer').domain_index = 1
    # Train on domain2
    history2 = new_model.fit(X_train_bert_domain2, y_train_domain2, batch_size=64, epochs=1, validation_split=0.2, callbacks=callbacks_list, shuffle=True)
    # Update the shared layer to use domain1 for the next epoch
    new_model.get_layer('shared_layer').domain_index = 0


In [None]:
import pandas as pd
from transformers import TFDistilBertModel, DistilBertTokenizer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, TensorBoard
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
from sklearn.metrics import f1_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from math import sqrt
import datetime

# DistilBERT encoder and tokenizer come from the same pretrained checkpoint
DISTILBERT_CHECKPOINT = 'distilbert-base-uncased'
distilbert_model = TFDistilBertModel.from_pretrained(DISTILBERT_CHECKPOINT)
tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)

# `X_train` / `X_test` must already exist (loaded and preprocessed by an earlier cell)
X_train_text = X_train.tolist()
X_test_text = X_test.tolist()

# Tokenize both splits with identical settings: truncate/pad to exactly 100 tokens
tokenize_options = dict(truncation=True, padding='max_length', max_length=100)
train_encodings = tokenizer(X_train_text, **tokenize_options)
test_encodings = tokenizer(X_test_text, **tokenize_options)

# Model inputs as [input_ids, attention_mask] NumPy arrays
encoding_fields = ('input_ids', 'attention_mask')
X_train_distilbert = [np.array(train_encodings[field]) for field in encoding_fields]
X_test_distilbert = [np.array(test_encodings[field]) for field in encoding_fields]

# Common layers function
def common_layers(inputs):
    """Apply a new 256-unit ReLU Dense layer and then a new Dropout(0.5) layer to `inputs`.

    Both layers are constructed on every call, so each call yields separate layer objects.
    """
    hidden = Dense(256, activation='relu')(inputs)
    regularizer = Dropout(0.5)
    return regularizer(hidden)

# Define the shared layer
class SharedLayer(tf.keras.layers.Layer):
    """Shared Dense(256, relu) + Dropout(0.5) trunk followed by one 3-way softmax head per domain.

    All sub-layers are created once in `__init__`. Constructing them inside
    `call` would create brand-new Dense/Dropout layers on every invocation
    instead of reusing one set of trunk weights across calls and domains.

    Args:
        number_of_domains: Number of domain-specific output heads to create.
        **kwargs: Forwarded to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, number_of_domains, **kwargs):
        super().__init__(**kwargs)
        # Trunk shared by every domain
        self.shared_dense = Dense(256, activation='relu')
        self.shared_dropout = Dropout(0.5)
        # One softmax head per domain, selected by index in `call`
        self.domain_specific_layers = [Dense(3, activation='softmax') for _ in range(number_of_domains)]

    def call(self, inputs, domain_index=None, training=None):
        """Run the shared trunk, then the head of the requested domain.

        Args:
            inputs: Input tensor for the shared Dense layer.
            domain_index: Index into the per-domain heads. `None` selects
                head 0, the same default `DynamicSharedModel.call` uses;
                previously `None` failed with a TypeError on list indexing.
            training: Keras training flag, forwarded to the dropout layer.

        Returns:
            The softmax output of the selected domain head.
        """
        if domain_index is None:
            domain_index = 0
        x = self.shared_dense(inputs)
        x = self.shared_dropout(x, training=training)
        return self.domain_specific_layers[domain_index](x)

# Symbolic inputs: token ids and attention mask, 100 int32 values each
SEQUENCE_LENGTH = 100
input_ids = Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32, name='attention_mask')

# Element 0 of the DistilBERT output, averaged over the sequence axis
distilbert_output = distilbert_model([input_ids, attention_mask])[0]
sequence_pooling = tf.keras.layers.GlobalAveragePooling1D()
pooled_output = sequence_pooling(distilbert_output)

# Shared head with one output branch per domain; the graph is wired to branch 0
number_of_domains = 2
shared_layer = SharedLayer(number_of_domains)
predictions = shared_layer(pooled_output, 0)

# Model definition using dynamic shared layer
class DynamicSharedModel(Model):
    """Keras model that feeds a transformer encoder's pooled output into a shared multi-domain head.

    Args:
        base_model: Encoder called on the model inputs; element 0 of its
            output is used as the sequence of hidden states.
        shared_layer: Layer called as `shared_layer(pooled, domain_index)`.
        **kwargs: Forwarded to `tf.keras.Model`.
    """

    def __init__(self, base_model, shared_layer, **kwargs):
        super().__init__(**kwargs)
        self.base_model = base_model
        self.shared_layer = shared_layer
        # Same pooling strategy as the functional graph built earlier in this cell
        self.pooling = tf.keras.layers.GlobalAveragePooling1D()

    def call(self, inputs, domain_index=0):
        """Encode `inputs`, average over the sequence axis and apply the selected domain head.

        The encoder's output object was previously handed to the head as-is.
        The head expects a single pooled tensor, so element 0 is extracted and
        pooled first, mirroring the functional path.

        Args:
            inputs: Encoder inputs, e.g. `[input_ids, attention_mask]`.
            domain_index: Index of the domain head to use.

        Returns:
            The output of `shared_layer` for the requested domain.
        """
        hidden_states = self.base_model(inputs)[0]
        pooled = self.pooling(hidden_states)
        return self.shared_layer(pooled, domain_index)

# Combine the DistilBERT encoder and the shared head into one trainable model
new_model = DynamicSharedModel(base_model=distilbert_model, shared_layer=shared_layer)
new_model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Callbacks: the first three watch the validation loss, the last one logs to ./logs
lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=1, factor=0.5, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', mode='min', save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=1)
callbacks_list = [lr_reduction, early_stopping, model_checkpoint, tensorboard]

# Fit for up to 10 epochs, holding out 20% of the training data for validation
history = new_model.fit(
    X_train_distilbert,
    y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
    callbacks=callbacks_list,
    shuffle=True,
)

# Predict on the test set and reduce both predictions and targets to class ids
y_pred = new_model.predict(X_test_distilbert)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Compute every metric first, then report them in a fixed order
class_names = ['class1', 'class2', 'class3']
classification_rep = classification_report(y_test_classes, y_pred_classes, target_names=class_names)
macro_f1 = f1_score(y_test_classes, y_pred_classes, average='macro')
cm = confusion_matrix(y_test_classes, y_pred_classes)
rmse = sqrt(mean_squared_error(y_test_classes, y_pred_classes))

print("Classification Report:")
print(classification_rep)
print('Macro F1 Score:', macro_f1)

# Confusion matrix heatmap
cm_fig, cm_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('True')
cm_ax.set_title('Confusion Matrix')
plt.show()

print('RMSE:', rmse)

# Alternate one epoch on domain 1 with one epoch on domain 2, `num_epochs` times.
# NOTE(review): `domain_index` is passed to `fit` here; it is not one of the documented
# keyword arguments of Keras `Model.fit`, so these calls presumably raise TypeError - confirm.
# NOTE(review): `X_train_distilbert_domain1/2` and `y_train_domain1/2` are not defined in
# the visible part of this file (an earlier cell builds `X_train_bert_domain1/2`) - verify
# where they come from.
num_epochs = 10
for epoch in range(num_epochs):
    history1 = new_model.fit(X_train_distilbert_domain1, y_train_domain1, batch_size=64, epochs=1, validation_split=0.2, callbacks=callbacks_list, shuffle=True, domain_index=0)
    history2 = new_model.fit(X_train_distilbert_domain2, y_train_domain2, batch_size=64, epochs=1, validation_split=0.2, callbacks=callbacks_list, shuffle=True, domain_index=1)



### Statistic

In [None]:
!pip install scikit_posthocs

In [None]:
import numpy as np
from scipy.stats import rankdata, friedmanchisquare
import scikit_posthocs as sp
import numpy as np
from scipy.stats import rankdata, friedmanchisquare
import scikit_posthocs as sp
import matplotlib.pyplot as plt

# Data: per-fold scores (5 folds) of each method on each dataset
tfidf_folds_covid = [0.757, 0.773, 0.775, 0.762, 0.765]
word2vec_folds_covid = [0.796, 0.806, 0.806, 0.787, 0.807]
fasttext_folds_covid = [0.773, 0.790, 0.793, 0.778, 0.778]
glove_folds_covid = [0.731, 0.750, 0.757, 0.733, 0.748]
roberta_folds_covid = [0.8385, 0.9066, 0.9430, 0.9539, 0.9578]

tfidf_folds_mental = [0.916,0.917,0.917,0.920,0.924]
word2vec_folds_mental = [0.909, 0.907, 0.900, 0.911, 0.909]
fasttext_folds_mental = [0.912, 0.909, 0.909, 0.909, 0.909]
glove_folds_mental = [0.896, 0.898,0.898,0.876,0.902]
roberta_folds_mental = [0.9410,0.9778,0.9845,0.9852,0.9859]

tfidf_folds_drug = [0.727,0.725,0.726,0.726,0.727]
word2vec_folds_drug = [0.693,0.716,0.701,0.710,0.713]
fasttext_folds_drug = [0.729,0.715,0.695,0.706,0.706]
glove_folds_drug = [0.715,0.711,0.721,0.688,0.678]
roberta_folds_drug = [0.715,0.7367,0.8939,0.9110,0.9318]

# Each experiment lists its score series in the same order as `methods` below
experiments = {
    'covid': [tfidf_folds_covid, word2vec_folds_covid, fasttext_folds_covid, glove_folds_covid, roberta_folds_covid],
    'mental': [tfidf_folds_mental, word2vec_folds_mental, fasttext_folds_mental, glove_folds_mental, roberta_folds_mental],
    'drug': [tfidf_folds_drug, word2vec_folds_drug, fasttext_folds_drug, glove_folds_drug, roberta_folds_drug]
}

alpha = 0.05
methods = ['tfidf', 'word2vec', 'fasttext', 'glove', 'roberta']

for experiment, data in experiments.items():
    print(f"Experiment: {experiment.upper()}")
    # Friedman Test across all methods, with folds as the repeated measurements
    stat, p = friedmanchisquare(*data)
    print(f"Friedman Test Statistic: {stat}")
    print(f"P-value: {p}")

    if p < alpha:
        # The message used to say "No significant differences" in this branch,
        # contradicting the rejection of the null hypothesis it announces.
        print("Reject the null hypothesis - Significant differences exist in the performances of the methods.")

        # Calculate average ranks: rank the methods within each fold (rank 1 = highest score)
        scores = np.array(data)  # shape: (methods, folds)
        ranks = np.array([rankdata([-score for score in fold]) for fold in zip(*scores)])  # -score for descending order
        avg_ranks = np.mean(ranks, axis=0)
        method_ranks = dict(zip(methods, avg_ranks))
        print("Average Ranks:", method_ranks)

        # Nemenyi post-hoc test; the matrix is transposed to (folds, methods) before the call
        pc = sp.posthoc_nemenyi_friedman(scores.T)
        print("P-values matrix from Nemenyi post-hoc test:")
        print(pc)
        print("-" * 50)
    else:
        print("Fail to reject the null hypothesis - No significant differences exist in the performances of the methods.")
        print("-" * 50)


In [None]:
import numpy as np
from scipy.stats import rankdata, friedmanchisquare
import scikit_posthocs as sp
import seaborn as sns
import matplotlib.pyplot as plt

# Data: per-fold scores (5 folds) of each method on each dataset
tfidf_folds_covid = [0.757, 0.773, 0.775, 0.762, 0.765]
word2vec_folds_covid = [0.796, 0.806, 0.806, 0.787, 0.807]
fasttext_folds_covid = [0.773, 0.790, 0.793, 0.778, 0.778]
glove_folds_covid = [0.731, 0.750, 0.757, 0.733, 0.748]
distillbert_folds_covid = [0.8385, 0.9066, 0.9430, 0.9539, 0.9578]

tfidf_folds_mental = [0.916,0.917,0.917,0.920,0.924]
word2vec_folds_mental = [0.909, 0.907, 0.900, 0.911, 0.909]
fasttext_folds_mental = [0.912, 0.909, 0.909, 0.909, 0.909]
glove_folds_mental = [0.896, 0.898,0.898,0.876,0.902]
distillbert_folds_mental = [0.9410,0.9778,0.9845,0.9852,0.9859]

tfidf_folds_drug = [0.727,0.725,0.726,0.726,0.727]
word2vec_folds_drug = [0.693,0.716,0.701,0.710,0.713]
fasttext_folds_drug = [0.729,0.715,0.695,0.706,0.706]
glove_folds_drug = [0.715,0.711,0.721,0.688,0.678]
distillbert_folds_drug = [0.715,0.7367,0.8939,0.9110,0.9318]

# Each experiment lists its score series in the same order as `methods` below
experiments = {
    'covid dataset': [tfidf_folds_covid, word2vec_folds_covid, fasttext_folds_covid, glove_folds_covid, distillbert_folds_covid],
    'mental dataset': [tfidf_folds_mental, word2vec_folds_mental, fasttext_folds_mental, glove_folds_mental, distillbert_folds_mental],
    'drug dataset': [tfidf_folds_drug, word2vec_folds_drug, fasttext_folds_drug, glove_folds_drug, distillbert_folds_drug]
}

# Initialize parameters
alpha = 0.05
methods = ['tfidf', 'word2vec', 'fasttext', 'glove', 'distillbert']

# P-value matrices of the experiments that passed the Friedman test, plus their
# names at the same positions. Zipping the matrices against all keys of
# `experiments` would attach the wrong title to a heatmap as soon as one
# experiment is not significant and is therefore missing from `pc_list`.
pc_list = []
significant_experiments = []

for experiment, data in experiments.items():
    print(f"\n\n--- Experiment: {experiment.upper()} ---")

    # Friedman Test across all methods, with folds as the repeated measurements
    stat, p = friedmanchisquare(*data)
    print(f"Friedman Test Statistic: {stat:.4f}")
    print(f"P-value: {p:.4f}")

    if p < alpha:
        print("\n[RESULT] Reject the null hypothesis: Significant differences exist in the performances of the methods.\n")

        # Calculate average ranks: rank the methods within each fold (rank 1 = highest score)
        scores = np.array(data)  # shape: (methods, folds)
        ranks = np.array([rankdata([-score for score in fold]) for fold in zip(*scores)])  # -score for descending order
        avg_ranks = np.mean(ranks, axis=0)
        method_ranks = dict(zip(methods, avg_ranks))
        print("Average Ranks:", method_ranks)

        # Nemenyi post-hoc test; the matrix is transposed to (folds, methods) before the call
        pc = sp.posthoc_nemenyi_friedman(scores.T)
        pc_list.append(pc)  # Store the p-value matrix
        significant_experiments.append(experiment)  # ... and the experiment it belongs to
        print("\nP-values matrix from Nemenyi post-hoc test:\n")
        print(pc)

    else:
        print("\n[RESULT] Fail to reject the null hypothesis: No significant differences exist in the performances of the methods.")

# Visualization outside of the loop: one heatmap per significant experiment, side by side.
# Axes without a matching matrix are left empty.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))
for ax, pc, experiment in zip(axes, pc_list, significant_experiments):
    sns.heatmap(pc, annot=True, cmap='coolwarm', cbar=True, square=True,
                fmt='.3f', xticklabels=methods, yticklabels=methods, ax=ax)
    ax.set_title(f"P-values for {experiment.upper()}")
plt.tight_layout()
plt.show()



In [None]:
import seaborn as sns

# Heatmap of whichever `pc` matrix / `experiment` name is currently bound by the preceding cell
nemenyi_ax = sns.heatmap(
    pc,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    cbar=True,
    square=True,
    xticklabels=methods,
    yticklabels=methods,
)
nemenyi_ax.set_title(f"P-values from Nemenyi post-hoc test for {experiment.upper()}")
plt.show()


In [None]:
# Setting the style for academic publications.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'; use whichever
# name this installation provides and keep the current style if neither exists.
_whitegrid_candidates = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
_whitegrid_style = next((name for name in _whitegrid_candidates if name in plt.style.available), None)
if _whitegrid_style is not None:
    plt.style.use(_whitegrid_style)

# Create a figure and axis objects
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))  # 3 columns for 3 experiments

# Use a grayscale color palette (one color per method, in `methods` order)
colors = ['lightgray', '0.7', '0.5', '0.3', 'black']  # Example grayscale colors

# Iterate through experiments and plot the box plots side by side
for ax, (experiment, data) in zip(axes, experiments.items()):
    box_artists = ax.boxplot(data, vert=True, patch_artist=True, labels=methods)
    ax.set_title(f"{experiment.upper()}")
    ax.set_ylabel('Score')  # You can remove individual y labels if you want a common one
    ax.grid(axis='y', linestyle='--')

    # Fill boxes with grayscale colors. The box patches are taken from the dict returned
    # by `boxplot`; `ax.artists` does not contain them on current Matplotlib versions,
    # which left every box in its default color.
    for patch, color in zip(box_artists['boxes'], colors):
        patch.set_facecolor(color)

# Add common title and y-label if necessary
fig.suptitle('Box plot Results Across Experiments based on Micro F1 Scores', fontsize=16)
fig.text(0.08, 0.5, 'Score', va='center', rotation='vertical', fontsize=12)  # Common y-label

# Display the plots
plt.tight_layout()
plt.subplots_adjust(top=0.85)  # Adjust top spacing for the suptitle
plt.show()



In [None]:
# Five-number summary printed for every method: (label, function applied to the fold scores)
five_number_summary = (
    ("Min", np.min),
    ("Q1", lambda scores: np.percentile(scores, 25)),
    ("Median", np.median),
    ("Q3", lambda scores: np.percentile(scores, 75)),
    ("Max", np.max),
)

for experiment, data in experiments.items():
    print(f"Box plot statistics for {experiment.upper()}:")

    # `data` holds one score list per method, in the same order as `methods`
    for method, values in zip(methods, data):
        print(f"{method}:")
        for label, summarize in five_number_summary:
            print(f"  {label}: {summarize(values):.3f}")

    print("-" * 50)


In [None]:
from scipy.stats import wilcoxon

# Paired per-fold scores of the two methods being compared
word2vec_folds = [0.796, 0.806, 0.806, 0.787, 0.807]
roberta_folds = [0.8385, 0.9066, 0.9430, 0.9539, 0.9578]

# Wilcoxon signed-rank test on the paired scores
wilcoxon_result = wilcoxon(word2vec_folds, roberta_folds)
stat, p = wilcoxon_result

report_lines = (
    "Wilcoxon Signed-Rank Test between word2vec and roberta:",
    f"Statistic: {stat}",
    f"P-value: {p}",
)
print("\n".join(report_lines))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Labels for the methods, in the column order of the score matrix passed to the Nemenyi test.
# `pc` is computed from five methods in this notebook; the list previously omitted 'tfidf',
# so only four tick labels were supplied and each one sat on the wrong row/column.
labels = ["tfidf", "word2vec", "fasttext", "glove", "roberta"]

# Create the heatmap using seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(pc, annot=True, cmap="coolwarm", xticklabels=labels, yticklabels=labels, linewidths=.5, cbar_kws={'label': 'P-value'})

# Decorations
plt.title("Nemenyi Post-hoc Test Results")
plt.show()
