In [None]:
# Import
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk import ngrams, pos_tag, ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# Task 1. Data understanding
1 Download a csv file.  
2 Load this dataset.  

In [None]:
# Read the CSV file into a DataFrame
# If using google colab, the '22204768.csv' file needs to be uploaded first
complete_data = pd.read_csv('22204768.csv')

###  Column selection:

Columns that contain textual content are critical for this classification task. In this case, the 'headline' and 'short_description' columns are particularly valuable for this analysis.

### 'headline' column
### 'headline' + 'short_description' columns

To conduct a comprehensive analysis, I will create two types of variables:

1. data_headline Variable: keeping only the 'headline' column textual information
2. data_combined Variable: It merges the 'headline' and 'short_description' columns to create a new column, 'combined_text'.

I will try both variables in (i) data understanding, (ii) data preparation & modelling part.

### As can be seen later from the results, the data_combined works better.

Some of the analysed features, such as sentence length, make more sense when computed on data_headline. This provides insights into the effectiveness of headlines and their potential relationship with article categories.

By considering both types of variables, I can experiment and compare the results to see which approach yields better performance or provides more meaningful insights for my classification task.

In [None]:
# Select the 'headline' and 'category' columns
data_headline = complete_data[['headline','category']]
data_headline

In [None]:
# Select the 'headline', 'short_description', and 'category' columns.
# The full copy of complete_data that used to precede this selection was
# overwritten immediately by it, so only the selection is kept.
data_combined = complete_data[['headline', 'short_description', 'category']]
data_combined

In [None]:
# Build data_combined: the three selected columns, a fresh 0..n-1 index, and a
# 'combined_text' column holding "<headline> <short_description>".
data_combined = (
    complete_data[['headline', 'short_description', 'category']]
    .reset_index(drop=True)
    .assign(combined_text=lambda frame: frame['headline'] + ' ' + frame['short_description'])
)

In [None]:
# Check the number of missing values
data_combined.isnull().sum()

In [None]:
# Count the occurrences of each category
# It is imbalanced
data_headline['category'].value_counts()

In [None]:
# Remove empty rows
data_headline = data_headline.dropna()
data_combined = data_combined.dropna()

In [None]:
# Filter the WELLNESS and DIVORCE classes
head_WELLNESS_class = data_headline[data_headline['category']== 'WELLNESS']['headline']
head_DIVORCE_class = data_headline[data_headline['category']== 'DIVORCE']['headline']

comb_WELLNESS_class = data_combined[data_combined['category']== 'WELLNESS']['combined_text']
comb_DIVORCE_class = data_combined[data_combined['category']== 'DIVORCE']['combined_text']

In [None]:
head_DIVORCE_class

In [None]:
comb_DIVORCE_class

# Task 1. Data understanding

3 Perform an exploration of the data.  
i) Perform an analysis of the most common terms. Preprocess the text.  
ii) Analyzing other features. Analyse the length of the sentences.

The steps I have chosen for data exploration are based on the provided links and the knowledge I gained from my previous AI and human language processing courses.

### Preprocessing the text :
lowercasing,   
stemming,   
lemmatization,   
stopword removal,   
punctuation removal, etc.  


### Features analysed :
the most common words,    
the less common words,  
sentence length,   
N-grams,   
POS tags,  
sentiment analysis, etc.   

iii) Check the blank values, incorrect data, and outliers.  
I removed the blank values already.   
I will check incorrect data and outliers in the following part.

iv) Comment on your observations.

In [None]:
# Fetch every NLTK resource the notebook relies on.
# Recent NLTK releases (3.9+) load 'punkt_tab' and 'averaged_perceptron_tagger_eng'
# for word_tokenize / pos_tag instead of the older 'punkt' and
# 'averaged_perceptron_tagger' packages, so both generations are requested;
# downloading a resource that the installed version does not use is harmless.
NLTK_RESOURCES = [
    'wordnet',
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'vader_lexicon',
]
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

In [None]:
# Text preprocessing
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stop-word set, stemmer and lemmatizer shared by all preprocess_text calls.
# Filled on the first call so the NLTK corpora are only touched when needed.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared stop-word set, stemmer and lemmatizer, building them once."""
    if not _PREPROCESS_RESOURCES:
        # All three values are created before update() runs, so a failure
        # (e.g. a missing corpus) leaves the cache empty and the next call retries.
        _PREPROCESS_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
            lemmatizer=WordNetLemmatizer(),
        )
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """Normalise one document into a space-separated string of processed tokens.

    Steps, in order: lowercase, delete ASCII punctuation, split on whitespace,
    drop English stop words, Porter-stem each token, then WordNet-lemmatize
    each stem.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' if nothing is left).
    """
    resources = _get_preprocess_resources()
    stop_words = resources['stop_words']
    stemmer = resources['stemmer']
    lemmatizer = resources['lemmatizer']

    # Lowercasing and punctuation removal
    text = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenization and stopword removal
    tokens = [token for token in text.split() if token not in stop_words]

    # Stemming
    tokens = [stemmer.stem(token) for token in tokens]

    # Lemmatization (applied to the stems; order kept so results stay unchanged)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Joining tokens back into text
    return ' '.join(tokens)

In [None]:
# Apply text preprocessing
head_WELLNESS_prep = head_WELLNESS_class.apply(preprocess_text)
head_DIVORCE_prep = head_DIVORCE_class.apply(preprocess_text)
comb_WELLNESS_prep = comb_WELLNESS_class.apply(preprocess_text)
comb_DIVORCE_prep = comb_DIVORCE_class.apply(preprocess_text)

In [None]:
head_WELLNESS_class

In [None]:
head_WELLNESS_prep

### data_headline exploration

In [None]:
# Initialize the vectorizer
vectorizer = CountVectorizer(stop_words='english')

In [None]:
# Vectorize the head_WELLNESS_class
words_in_head_WELLNESS = vectorizer.fit_transform(head_WELLNESS_prep)

In [None]:
# Materialise the (token, total count) pairs as a list: a bare zip is a one-shot
# iterator, so the cell that builds the DataFrame from it would yield an empty
# table if it were re-run on its own.
tokens_and_counts = list(zip(vectorizer.get_feature_names_out(), np.asarray(words_in_head_WELLNESS.sum(axis=0)).ravel()))

In [None]:
tokens_head_WELLNESS = pd.DataFrame(tokens_and_counts, columns=['Token', 'Count'])

In [None]:
tokens_head_WELLNESS.sort_values("Count", ascending=False, inplace=True)
tokens_head_WELLNESS.reset_index(inplace=True, drop=True)
tokens_head_WELLNESS

In [None]:
most_common_head_WELLNESS = tokens_head_WELLNESS.nlargest(columns="Count", n=10)
most_common_head_WELLNESS

In [None]:
least_common_head_WELLNESS = tokens_head_WELLNESS.nsmallest(columns="Count", n=10)
least_common_head_WELLNESS

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(10,8))
sns.barplot(ax=axes[0], data=least_common_head_WELLNESS, x="Token", y ="Count")
sns.barplot(ax=axes[1], data=most_common_head_WELLNESS, x="Token", y ="Count")
axes[0].set(ylabel='Counts', xlabel="Tokens", title="%d Least Frequent Tokens After Text Preprocess" % 10 )
axes[1].set(ylabel='Counts', xlabel="Tokens", title="%d Most Frequent Tokens After Text Preprocess" % 10 )
plt.tight_layout()

In [None]:
# Token frequency table for the preprocessed DIVORCE headlines, most frequent first
words_in_head_DIVORCE = vectorizer.fit_transform(head_DIVORCE_prep)
tokens_and_counts = zip(
    vectorizer.get_feature_names_out(),
    np.asarray(words_in_head_DIVORCE.sum(axis=0)).ravel(),
)
tokens_head_DIVORCE = (
    pd.DataFrame(tokens_and_counts, columns=['Token', 'Count'])
    .sort_values("Count", ascending=False)
    .reset_index(drop=True)
)
tokens_head_DIVORCE

In [None]:
most_common_head_DIVORCE = tokens_head_DIVORCE.nlargest(columns="Count", n=10)
most_common_head_DIVORCE

In [None]:
least_common_head_DIVORCE = tokens_head_DIVORCE.nsmallest(columns="Count", n=10)
least_common_head_DIVORCE

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(10,8))
sns.barplot(ax=axes[0], data=least_common_head_DIVORCE, x="Token", y ="Count")
sns.barplot(ax=axes[1], data=most_common_head_DIVORCE, x="Token", y ="Count")
axes[0].set(ylabel='Counts', xlabel="Tokens", title="%d Least Frequent Tokens After Text Preprocess" % 10 )
axes[1].set(ylabel='Counts', xlabel="Tokens", title="%d Most Frequent Tokens After Text Preprocess" % 10 )
plt.tight_layout()

In [None]:
# Number of whitespace-separated words in every headline, per category.
# The raw headlines are used here, not the preprocessed ones.
head_WELLNESS_lengths, head_DIVORCE_lengths = (
    headlines.apply(lambda headline: len(headline.split()))
    for headlines in (head_WELLNESS_class, head_DIVORCE_class)
)

In [None]:
# Box plot comparing the headline word counts of the two categories
fig, ax = plt.subplots(figsize=(8, 6))
ax.boxplot([head_WELLNESS_lengths, head_DIVORCE_lengths])
ax.set_xticks([1, 2])
ax.set_xticklabels(['WELLNESS', 'DIVORCE'])
ax.set_xlabel('Category')
ax.set_ylabel('Sentence Length')
ax.set_title('Distribution of Sentence Lengths in Each Category')
plt.show()

### incorrect data and outliers
Outliers are represented as individual points outside the "whiskers" of the box plot. The whiskers extend to the data points that are within a specific distance from the first quartile (Q1) and the third quartile (Q3) of the data.  
There doesn't appear to be any obvious incorrect data.    
For Outliers, there are some outliers present. However, despite their presence, I have made the decision to retain these outliers as they still hold valuable information for the analysis.  

In [None]:
# N-grams analysis
def analyze_ngrams(text, n):
    """Tokenize `text` with nltk.word_tokenize and return its n-grams as a list of tuples."""
    return list(ngrams(nltk.word_tokenize(text), n))

# Flatten the bigrams of every preprocessed headline into one list per category
head_WELLNESS_ngrams = [
    bigram
    for headline in head_WELLNESS_prep
    for bigram in analyze_ngrams(headline, 2)
]
head_DIVORCE_ngrams = [
    bigram
    for headline in head_DIVORCE_prep
    for bigram in analyze_ngrams(headline, 2)
]

In [None]:
# Plotting N-grams
def plot_ngrams(ngrams, title):
    """Plot the 20 most frequent items of `ngrams` as an nltk.FreqDist chart titled `title`.

    Note: inside this function the `ngrams` parameter hides the `ngrams`
    helper imported from nltk; only the passed-in sequence is used here.
    """
    nltk.FreqDist(ngrams).plot(20, title=title)

plot_ngrams(head_WELLNESS_ngrams, "WELLNESS N-grams")
plot_ngrams(head_DIVORCE_ngrams, "DIVORCE N-grams")

In [None]:
# POS tagging
def analyze_pos_tags(text):
    """Tokenize `text` with nltk.word_tokenize and return pos_tag's (token, tag) pairs."""
    return pos_tag(nltk.word_tokenize(text))

# Flatten the (token, tag) pairs of every preprocessed headline into one list per category
head_WELLNESS_pos = [
    tagged_token
    for headline in head_WELLNESS_prep
    for tagged_token in analyze_pos_tags(headline)
]
head_DIVORCE_pos = [
    tagged_token
    for headline in head_DIVORCE_prep
    for tagged_token in analyze_pos_tags(headline)
]

In [None]:
# Plotting POS tags
def plot_pos_tags(pos_tags, title):
    """Plot the 20 most frequent tags found in `pos_tags`, a sequence of (token, tag) pairs."""
    tag_sequence = [tagged_token[1] for tagged_token in pos_tags]
    nltk.FreqDist(tag_sequence).plot(20, title=title)

plot_pos_tags(head_WELLNESS_pos, "WELLNESS POS Tags")
plot_pos_tags(head_DIVORCE_pos, "DIVORCE POS Tags")

In [None]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()

# Score the raw headlines rather than the preprocessed ones. VADER matches whole
# words against its lexicon and also uses punctuation, capitalisation and
# negations; preprocess_text lowercases, strips punctuation, drops stop words
# (which include negations such as "not") and stems tokens, so scoring its
# output loses most of that signal.
# NOTE: scores therefore differ from those obtained on the preprocessed text;
# re-check any written commentary about the sentiment distributions after re-running.
head_WELLNESS_sentiments = [sia.polarity_scores(text)["compound"] for text in head_WELLNESS_class]
head_DIVORCE_sentiments = [sia.polarity_scores(text)["compound"] for text in head_DIVORCE_class]

In [None]:
# Plotting Sentiments
def plot_sentiments(sentiments, category):
    """Show a 10-bin histogram of `sentiments`, using `category` as the legend label."""
    plt.hist(sentiments, bins=10, alpha=0.5, label=category)

    # Title and axis captions are independent of each other, so apply them in one pass.
    captions = (
        (plt.title, "Sentiment Analysis"),
        (plt.xlabel, "Sentiment Score"),
        (plt.ylabel, "Frequency"),
    )
    for apply_caption, caption in captions:
        apply_caption(caption)

    plt.legend(loc='upper right')
    plt.show()

plot_sentiments(head_WELLNESS_sentiments, "WELLNESS")
plot_sentiments(head_DIVORCE_sentiments, "DIVORCE")

### iv)  Comment for data_headline exploration.
The analysis of the two categories, WELLNESS and DIVORCE, reveals several noteworthy differences in their linguistic characteristics and textual patterns.  

Firstly, examining the most common words in each category indicates distinct thematic focuses. For the "Wellness" category, the most common words include "study," "health," "life," "way," "sleep," "new," "cancer," "make," "day," and "weight." These terms are often associated with topics related to general well-being, healthy lifestyle choices, and self-improvement. In the "Divorce" category, the most common words include "divorce," "marriage," "date," "woman," "ex," "photo," "cheat," "video," "parent," and "relationship." These terms are closely related to the process and aftermath of divorce, including discussions about marriage, separation, and the emotional aspects of ending a relationship.

Analyzing the sentence length in each category demonstrates potential variations in structure. It appears that the "Divorce" category tends to have slightly longer sentences compared to the "Wellness" category. Some sentence-length outliers are present. However, I have decided to retain these outliers as they still hold valuable information for the analysis.

Exploring N-grams, sequences of consecutive words, provided a deeper understanding of linguistic patterns. The top N-grams in the "wellness" category, such as "study find" and "lose weight", indicate a focus on research, general well-being and health. In contrast, the top N-grams in the "divorce" category include "date divorce" and "blend family". Both sets are very representative of their categories.

POS tagging provides insights into the syntactic structure and grammatical patterns present in the texts. Analyzing the distribution of POS tags in each category helps identify differences in the usage of nouns, verbs, adjectives, and other linguistic elements. The Part-of-Speech (POS) analysis reveals that the distribution of POS tags is relatively similar for both categories.

Sentiment analysis revealed the overall sentiment polarity of the texts in each category. Curiously, the sentiment scores in the "wellness" category were clustered between -0.25 and 0, indicating a slight negative and neutral sentiment. On the other hand, the sentiment scores in the "divorce" category tended to cluster in the range of 0 to 0.25, indicating a mixture of neutral to mildly positive sentiments.

In conclusion, the analysis of various textual features highlights distinct linguistic patterns and thematic differences between the WELLNESS and DIVORCE categories. These insights can be valuable in developing effective classification models or understanding the underlying themes and subjects discussed within each category.

### data_combined exploration

In [None]:
# Vectorize the comb_WELLNESS_class
words_in_comb_WELLNESS = vectorizer.fit_transform(comb_WELLNESS_prep)

In [None]:
# Materialise the (token, total count) pairs as a list: a bare zip is a one-shot
# iterator, so the cell that builds the DataFrame from it would yield an empty
# table if it were re-run on its own.
tokens_and_counts = list(zip(vectorizer.get_feature_names_out(), np.asarray(words_in_comb_WELLNESS.sum(axis=0)).ravel()))

In [None]:
tokens_comb_WELLNESS = pd.DataFrame(tokens_and_counts, columns=['Token', 'Count'])

In [None]:
tokens_comb_WELLNESS.sort_values("Count", ascending=False, inplace=True)
tokens_comb_WELLNESS.reset_index(inplace=True, drop=True)
tokens_comb_WELLNESS

In [None]:
most_common_comb_WELLNESS = tokens_comb_WELLNESS.nlargest(columns="Count", n=10)
most_common_comb_WELLNESS

In [None]:
least_common_comb_WELLNESS = tokens_comb_WELLNESS.nsmallest(columns="Count", n=10)
least_common_comb_WELLNESS

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(10,8))
sns.barplot(ax=axes[0], data=least_common_comb_WELLNESS, x="Token", y ="Count")
sns.barplot(ax=axes[1], data=most_common_comb_WELLNESS, x="Token", y ="Count")
axes[0].set(ylabel='Counts', xlabel="Tokens", title="%d Least Frequent Tokens After Text Preprocess" % 10 )
axes[1].set(ylabel='Counts', xlabel="Tokens", title="%d Most Frequent Tokens After Text Preprocess" % 10 )
plt.tight_layout()

In [None]:
# Token frequency table for the preprocessed DIVORCE combined text, most frequent first
words_in_comb_DIVORCE = vectorizer.fit_transform(comb_DIVORCE_prep)
tokens_and_counts = zip(
    vectorizer.get_feature_names_out(),
    np.asarray(words_in_comb_DIVORCE.sum(axis=0)).ravel(),
)
tokens_comb_DIVORCE = (
    pd.DataFrame(tokens_and_counts, columns=['Token', 'Count'])
    .sort_values("Count", ascending=False)
    .reset_index(drop=True)
)
tokens_comb_DIVORCE

In [None]:
most_common_comb_DIVORCE = tokens_comb_DIVORCE.nlargest(columns="Count", n=10)
most_common_comb_DIVORCE

In [None]:
least_common_comb_DIVORCE = tokens_comb_DIVORCE.nsmallest(columns="Count", n=10)
least_common_comb_DIVORCE

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(10,8))
sns.barplot(ax=axes[0], data=least_common_comb_DIVORCE, x="Token", y ="Count")
sns.barplot(ax=axes[1], data=most_common_comb_DIVORCE, x="Token", y ="Count")
axes[0].set(ylabel='Counts', xlabel="Tokens", title="%d Least Frequent Tokens After Text Preprocess" % 10 )
axes[1].set(ylabel='Counts', xlabel="Tokens", title="%d Most Frequent Tokens After Text Preprocess" % 10 )
plt.tight_layout()

In [None]:
# There is not much sense in analyzing the sentence length of the merged data

In [None]:
# N-grams analysis

# Flatten the bigrams of every preprocessed combined text into one list per category
comb_WELLNESS_ngrams = [
    bigram
    for document in comb_WELLNESS_prep
    for bigram in analyze_ngrams(document, 2)
]
comb_DIVORCE_ngrams = [
    bigram
    for document in comb_DIVORCE_prep
    for bigram in analyze_ngrams(document, 2)
]

In [None]:
# Plotting N-grams

plot_ngrams(comb_WELLNESS_ngrams, "WELLNESS N-grams")
plot_ngrams(comb_DIVORCE_ngrams, "DIVORCE N-grams")

In [None]:
# POS tagging

# Flatten the (token, tag) pairs of every preprocessed combined text into one list per category
comb_WELLNESS_pos = [
    tagged_token
    for document in comb_WELLNESS_prep
    for tagged_token in analyze_pos_tags(document)
]
comb_DIVORCE_pos = [
    tagged_token
    for document in comb_DIVORCE_prep
    for tagged_token in analyze_pos_tags(document)
]

In [None]:
# Plotting POS tags

plot_pos_tags(comb_WELLNESS_pos, "WELLNESS POS Tags")
plot_pos_tags(comb_DIVORCE_pos, "DIVORCE POS Tags")

In [None]:
# Sentiment Analysis
sia = SentimentIntensityAnalyzer()

# Score the raw combined text rather than the preprocessed text. VADER matches
# whole words against its lexicon and also uses punctuation, capitalisation and
# negations; preprocess_text lowercases, strips punctuation, drops stop words
# (which include negations such as "not") and stems tokens, so scoring its
# output loses most of that signal.
# NOTE: scores therefore differ from those obtained on the preprocessed text;
# re-check any written commentary about the sentiment distributions after re-running.
comb_WELLNESS_sentiments = [sia.polarity_scores(text)["compound"] for text in comb_WELLNESS_class]
comb_DIVORCE_sentiments = [sia.polarity_scores(text)["compound"] for text in comb_DIVORCE_class]

In [None]:
# Plotting Sentiments

plot_sentiments(comb_WELLNESS_sentiments, "WELLNESS")
plot_sentiments(comb_DIVORCE_sentiments, "DIVORCE")

### iv)  Comment for data_combined exploration.
During the exploration of the data_combined dataset, similar to the data_headline exploration, we can observe significant differences in the most common words, less common words, and n-grams between the two categories.

The advantage of the data_combined dataset is that it yields higher counts and greater variations. By combining the headline and short_description columns, we obtain a more extensive text representation that captures additional information and context from the articles. This increased amount of data leads to a richer and more comprehensive analysis, potentially enhancing the performance of subsequent tasks such as classification.

Furthermore, the Sentiment Analysis results reveal a more plausible pattern: the WELLNESS category exhibits more positive sentiment scores.

These differences indicate that combining the headline and short_description columns provides a more comprehensive representation of the articles, capturing additional information that can contribute to our analysis.

### data_combined has greater data variation and should be more useful for classification

# Task 2. Data Preparation & Modelling
4. Splitting the dataset into training, development and test sets.  
i) Choose an appropriate split. Comment on your choices.     
ii) Save your data as separate csv files.

5. Load csv files. Text preprocessing. Apply appropriate preprocessing steps to create a numeric representation of the documents, suitable for classification.
I will use the text preprocessing function from before.

### Use data_combined first, then try data_headline later

In [None]:
data_combined

In [None]:
X = data_combined['combined_text']
y = data_combined['category']

In [None]:
# Split the dataset into training, validation, and test sets.
# Resulting shares of the full data: 60% train, 20% validation, 20% test
# (the test set is held out first; 25% of the remaining 80% becomes validation).
# Both splits are stratified on the label because the categories are imbalanced,
# so every split keeps the same class proportions as the full dataset.
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_plus_valid,
    y_train_plus_valid,
    test_size=0.25,
    stratify=y_train_plus_valid,
    random_state=0,
)

The proportion of data segmentation is as follows:
Training set: 60% of the original data (75% of the 80% that remains after the test set is held out).
Validation set: 20% of the original data (the other 25% of that 80%).
Test set: 20% of the original data (held out first, before the training/validation split).
The choice of these percentages is a common practice in machine learning. Let me briefly explain why I split it this way:

Training set: The largest portion of the data is allocated to the training set (60%). This allows the model to learn patterns and relationships in the data more effectively because it has access to a large number of labeled examples.
Validation set: A smaller portion (20%) is assigned to the validation set. It is used to fine-tune the model and to evaluate its performance during training. The validation set helps to select the best model.
Test set: The test set (20%) is kept completely separate until model training and hyperparameter tuning are completed, and is used only in the final evaluation phase to obtain an unbiased estimate of performance on unseen data.
With this partitioning, we can ensure that the model is trained on a sufficiently large training set, validated on a separate data set for hyperparameter tuning, and evaluated on a completely separate test set to assess its generalization ability.

In [None]:
# Write each split (text column plus label) to its own CSV file, without the index
train_data, valid_data, test_data = (
    pd.DataFrame({'combined_text': split_texts, 'category': split_labels})
    for split_texts, split_labels in ((X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)

for split_frame, csv_name in (
    (train_data, 'train.csv'),
    (valid_data, 'valid.csv'),
    (test_data, 'test.csv'),
):
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Load the data from CSV files
train_data = pd.read_csv('train.csv')
valid_data = pd.read_csv('valid.csv')
test_data = pd.read_csv('test.csv')

In [None]:
# Add a 'preprocessed_text' column to every split by running preprocess_text on 'combined_text'
for split_frame in (train_data, valid_data, test_data):
    split_frame['preprocessed_text'] = split_frame['combined_text'].apply(preprocess_text)

In [None]:
# Split the data into features (X) and target variable (y)
X_train = train_data['preprocessed_text']
X_valid = valid_data['preprocessed_text']
X_test = test_data['preprocessed_text']
y_train = train_data['category']
y_valid = valid_data['category']
y_test = test_data['category']

In [None]:
# Create a numeric representation of the documents
vectorizer = CountVectorizer(stop_words='english')
X_train_matrix = vectorizer.fit_transform(train_data['preprocessed_text'])
X_valid_matrix = vectorizer.transform(valid_data['preprocessed_text'])
X_test_matrix = vectorizer.transform(test_data['preprocessed_text'])

In [None]:
# Encode the target variable into numeric labels
# label_encoder = LabelEncoder()
# y_train_encoded = label_encoder.fit_transform(y_train)
# y_valid_encoded = label_encoder.transform(y_valid)
# y_test_encoded = label_encoder.transform(y_test)

In [None]:
print(X_train_matrix.shape)
print(X_valid_matrix.shape)
print(X_test_matrix.shape)

# Task 2. Data Preparation & Modelling

6. Build binary classification models using two classifiers. Comment on your choices for the classifier and parameters used in each classifier.

In this task, I experimented with five different classifiers to classify news articles into two categories.
Find the two classifiers that performed the best.

### Five tested classifiers:   
1 Logistic Regression  
2 Random Forest  
3 Support Vector Machine (SVM)  
4 Naive Bayes  
5 K-Nearest Neighbors (KNN)  

### Choices for parameters
1 Logistic Regression:      
all default parameters, which generally work well for many classification tasks.  
2 Random Forest:  
set the number of estimators (n_estimators) to 100. Increasing the number of estimators can potentially improve the model's performance, but it comes with a higher computational cost. 100 is a common value that provides a good balance between accuracy and efficiency.  
3 Support Vector Machine (SVM):  
kernel='linear'. The linear kernel works well when the classes are well-separated in the feature space.   
4 Naive Bayes:  
the classifier's default settings are suitable for this task.  
5 K-Nearest Neighbors (KNN):  
the classifier's default settings are suitable for this task.


7. Build or apply an end-to-end classifier using deep learning. You can either train your own deep learning model from scratch or fine-tune an existing model. Save this model in an appropriate format.

### Deep Learning
Deep learning requires more computing power than my laptop can provide.   
I switched to Colab, but the training takes a long time and the session often has to reconnect. After running it for several days, I finally managed to save my first model.   
In the end I ran the deep learning training separately, and for the later tasks I will no longer adjust the model.  

### Two final choices of classifiers: Random Forest and Naive Bayes
After evaluating the classification reports, the two classifiers that stand out as the best performers are Random Forest and Naive Bayes. Both classifiers consistently demonstrate high precision, recall, and F1-score for both classes (DIVORCE and WELLNESS). They maintain a good balance in classifying articles from both categories.

Therefore, I have chosen Random Forest and Naive Bayes as the top two classifiers for this task, as they exhibit strong and consistent performance in classifying the news articles into the correct topical categories.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Initialize the classifiers with desired parameters
logreg = LogisticRegression()
# A random forest draws random bootstrap samples and feature subsets, so its seed
# is fixed to make the reported scores repeatable across runs; 0 matches the
# random_state already used for the train/validation/test split.
rf = RandomForestClassifier(n_estimators=100, random_state=0)
svm_classifier = SVC(kernel='linear')
naive_bayes = MultinomialNB()
knn = KNeighborsClassifier()

In [None]:
# Train the Logistic Regression classifier
logreg.fit(X_train_matrix, y_train)

# Train the Random Forest classifier
rf.fit(X_train_matrix, y_train)

# Train the SVM classifier
svm_classifier.fit(X_train_matrix, y_train)

# Train the Naive Bayes classifier
naive_bayes.fit(X_train_matrix, y_train)

# Train the K-Nearest Neighbors classifier
knn.fit(X_train_matrix, y_train)

In [None]:
# Make predictions on the validation set
logreg_pred = logreg.predict(X_valid_matrix)
rf_pred = rf.predict(X_valid_matrix)
svm_pred = svm_classifier.predict(X_valid_matrix)
nb_pred = naive_bayes.predict(X_valid_matrix)
knn_pred = knn.predict(X_valid_matrix)

In [None]:
# Validation-set classification report for each of the five classifiers
validation_reports = (
    ("Logistic Regression Classification Report:", logreg_pred),
    ("Random Forest Classification Report:", rf_pred),
    ("SVM Classification Report:", svm_pred),
    ("Naive Bayes Classification Report:", nb_pred),
    ("KNN Classification Report:", knn_pred),
)
for report_heading, predicted_labels in validation_reports:
    print(report_heading)
    print(classification_report(y_valid, predicted_labels))

### Check train set

In [None]:
# Predict on the training data with the two selected classifiers and report the scores
rf_train_pred = rf.predict(X_train_matrix)
nb_train_pred = naive_bayes.predict(X_train_matrix)

train_reports = (
    ("Random Forest Classification Report - Train Set:", rf_train_pred),
    ("Naive Bayes Classification Report - Train Set:", nb_train_pred),
)
for report_heading, predicted_labels in train_reports:
    print(report_heading)
    print(classification_report(y_train, predicted_labels))

### Task 2. Data Preparation & Modelling
7. Build or apply an end-to-end classifier using deep learning. For this task, you can either train your own deep learning model from scratch or fine-tune an existing model. Save this model in an appropriate format.

In [None]:
# Use Colab to run, my computer is not able to run

! pip install transformers
from transformers import AutoTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

In [None]:
# Load the tokenizer of the 'distilbert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize each split with truncation and padding enabled
train_encodings, valid_encodings, test_encodings = (
    tokenizer(texts.tolist(), truncation=True, padding=True)
    for texts in (X_train, X_valid, X_test)
)

In [None]:
# Learn the category -> integer mapping from the training labels only,
# then apply that same mapping to every split
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_valid_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_valid, y_test)
)

In [None]:
def _as_tf_dataset(encodings, encoded_labels):
    """Slice tokenizer output and integer labels into a dataset of (features, label) pairs."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), encoded_labels))


# One dataset per split
train_dataset = _as_tf_dataset(train_encodings, y_train_encoded)
valid_dataset = _as_tf_dataset(valid_encodings, y_valid_encoded)
test_dataset = _as_tf_dataset(test_encodings, y_test_encoded)

In [None]:
# One output label per category known to the label encoder
num_labels = len(label_encoder.classes_)

# Load the pre-trained DistilBERT checkpoint for sequence classification
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
)

The pre-trained DistilBERT model can be fine-tuned on specific downstream tasks such as text classification, sentiment analysis, question answering, and more. By leveraging the pre-trained knowledge from the large corpus, the DistilBERT model can be adapted and specialized for specific NLP tasks with less training data and computational resources compared to training from scratch.

In [None]:
# Training configuration:
#   optimizer - Adam with learning rate 5e-5
#   loss      - sparse categorical cross-entropy computed on logits
#   metric    - sparse categorical accuracy, reported under the name 'accuracy'
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

Optimizer: The optimizer is responsible for updating the model's parameters during training to minimize the loss. In this case, the code uses the Adam optimizer, which is a popular optimization algorithm known for its efficiency in deep learning.
Loss function: The loss function measures the discrepancy between the predicted output of the model and the true labels. It provides a measure of how well the model is performing during training.
Metrics: Metrics are used to evaluate the performance of the model during training. They provide additional information about the model's accuracy or performance beyond the loss function. In this case, the code specifies the Sparse Categorical Accuracy as the metric. It computes the accuracy of the model's predictions by comparing them to the true labels.

In [None]:
# Attach the optimizer, loss and metric to the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [None]:
# Fine-tune the model.
# You can skip this cell and go directly to the Load Model step.
# Be careful: on Google Colab this step takes about half a day to complete.

# The preferred settings were BATCH_SIZE = 16 and EPOCHS = 5, but there was not
# enough computational capacity: the run kept restarting / disconnecting / failing
# and never produced a result, so the settings were tuned down to the values below.
BATCH_SIZE = 32
EPOCHS = 3

# Use a shuffle buffer as large as the training set: a buffer smaller than the
# dataset only shuffles within a sliding window instead of over all examples.
shuffle_buffer_size = len(y_train_encoded)

model.fit(
    x=train_dataset.shuffle(shuffle_buffer_size).batch(BATCH_SIZE),
    validation_data=valid_dataset.batch(BATCH_SIZE),
    epochs=EPOCHS
)

accuracy

The output:
Epoch 1/3
149/149 [==============================] - 3264s 22s/step - loss: 0.2170 - accuracy: 0.9125 - val_loss: 0.1347 - val_accuracy: 0.9446
Epoch 2/3
149/149 [==============================] - 3206s 22s/step - loss: 0.0958 - accuracy: 0.9694 - val_loss: 0.1077 - val_accuracy: 0.9585
Epoch 3/3
149/149 [==============================] - 3182s 21s/step - loss: 0.0519 - accuracy: 0.9828 - val_loss: 0.1215 - val_accuracy: 0.9597
<keras.callbacks.History at 0x7a6ddccaa5f0>

High Accuracy: The model achieves high accuracy on both the training and validation datasets. In the last epoch, the training accuracy reaches 98.28%, and the validation accuracy is 95.97%. High accuracy on the validation set indicates that the model is generalizing well to unseen data, which is a positive sign.  
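Decreasing Loss: The training loss steadily decreases with each epoch, indicating that the model is effectively learning from the training data and minimizing its error. The validation loss decreases from 0.1347 to 0.1077 in the second epoch but rises slightly to 0.1215 in the third while the training loss keeps falling; this is an early sign that the model may be starting to overfit, so longer training should be monitored (for example with early stopping).  
Consistent Improvement: The training accuracy and training loss improve in every epoch, and the validation accuracy also improves each epoch (94.46% → 95.85% → 95.97%), although the gain in the last epoch is small. This behavior indicates that the model continues to learn with additional training, with diminishing returns on the validation set.  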
Reasonable Training Time: The training time per epoch seems reasonable, considering the loss and accuracy improvements achieved. Long training times can be a concern, but it depends on the complexity of the model and the available computing resources.  

Overall, based on these accuracy values, the model appears to be performing well and achieving a high level of accuracy in classifying news articles into their respective topical categories.

In [None]:
# Optional: you can skip this cell and go directly to the Load Model step.
# Write the trained model to './' (the same folder as the Jupyter Notebook)
model.save_pretrained(save_directory='./')

In [None]:
# Restore the model from the directory it was saved to
model_dir = './'
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(model_dir)

# Print the model summary, then show the configuration as the cell output
loaded_model.summary()
loaded_model.config

In [None]:
# Run the reloaded model over the validation set in batches of 16
y_pred_valid = loaded_model.predict(valid_dataset.batch(16))

# Take the index of the largest logit per example and map it back to its category
valid_class_ids = tf.argmax(y_pred_valid.logits, axis=1)
y_pred_valid_labels = label_encoder.inverse_transform(valid_class_ids)

In [None]:
# Build the validation-set report for the deep learning model, then print it
# under its title
classification_report_valid = classification_report(
    y_true=y_valid,
    y_pred=y_pred_valid_labels,
)
print(
    "Classification Report - Deep Learning (Validation Set):",
    classification_report_valid,
    sep="\n",
)

### Deep learning Error analysis

In [None]:
# Confusion matrix for the deep learning model on the validation set.
# Rows are true categories and columns are predicted categories. The label order
# is fixed explicitly (the label encoder's class order) so that the heatmap ticks
# can show the category names instead of bare 0/1 indices.
class_names_valid = list(label_encoder.classes_)
cm_valid = confusion_matrix(y_valid, y_pred_valid_labels, labels=class_names_valid)

_, ax_valid = plt.subplots()
sns.heatmap(
    cm_valid,
    annot=True,
    fmt='g',
    ax=ax_valid,
    xticklabels=class_names_valid,
    yticklabels=class_names_valid,
)
ax_valid.set_xlabel('Predicted labels')
ax_valid.set_ylabel('True labels')
ax_valid.set_title('Confusion Matrix - Deep Learning')
plt.show()

# Task 3. Evaluation

8. Choose a primary metric that will be used to evaluate your models. Justify your choice. Comment on what is a good benchmark for this task.

### Imbalanced dataset
When dealing with an imbalanced dataset like this, accuracy alone may not provide an accurate representation of the classifier's performance. It is crucial to consider additional metrics to evaluate classification models on imbalanced datasets.

The classification report provides detailed insights into the performance of each classifier. The important metrics include precision, recall, and F1-score. Precision measures how many of the instances predicted as positive are actually positive, while recall measures how many of the actual positive instances the classifier captures. The F1-score is the harmonic mean of precision and recall and therefore balances the two.

Given that the categories in the dataset are imbalanced, with a larger number of articles in the "WELLNESS" category compared to the "DIVORCE" category, accuracy alone may not be the most informative metric. Instead, it is important to consider both precision and recall to assess the performance of the classifiers.

If the goal is to minimize false positives (incorrectly classifying an article as "DIVORCE"), precision would be a crucial metric. On the other hand, if the goal is to minimize false negatives (incorrectly classifying an article as "WELLNESS"), recall becomes more important.

To strike a balance between precision and recall, the F1-score (harmonic mean of precision and recall) could be a suitable primary metric for evaluation. It provides a single measure that considers both precision and recall, making it useful for imbalanced datasets where both false positives and false negatives are important.

A good benchmark for this task could be based on domain expertise or existing research in the field. This benchmark could be used to compare the models' F1-scores and determine their effectiveness in classifying the news articles.

9. Evaluate the performance of each model developed on Task 2 (items 6 and 7) on your train and validation sets. How does the performance on the train set compare to the validation set? Comment on the performance of the classifiers/models.
In general, for all models except KNN, the performance on the validation set is similar to the performance on the training set, which suggests that these models generalize well.

### Logistic Regression:
Achieves high precision, recall, and F1-scores for both categories.
The recall for the "DIVORCE" category is slightly lower on the validation set (0.85).

### Random Forest:
Achieves high precision, recall, and F1-scores for both categories.
Slightly lower for DIVORCE, but all scores are still above 0.9. Very high for WELLNESS.
Shows strong performance in classifying both "DIVORCE" and "WELLNESS" articles.

### SVM:
Achieves high precision, recall, and F1-scores for both categories.
A bit low for DIVORCE recall which is 0.87.

### Naive Bayes:
Achieves high precision, recall, and F1-scores for both categories.
Slightly lower for DIVORCE, but all scores are still above 0.9. Very high for WELLNESS.
Shows strong performance in classifying both "DIVORCE" and "WELLNESS" articles.

### KNN:
Shows lower performance compared to other classifiers.

### Deep Learning:
Achieves high precision, recall, and F1-scores for both categories.
Shows strong performance in classifying both "DIVORCE" and "WELLNESS" articles.

### For the following task, I will use the two most suitable classifiers: Random Forest and Naive Bayes

10. Perform an error analysis.

### Random Forest Error analysis

In [None]:
# Confusion matrix for the Random Forest predictions on the validation set.
# Rows are true categories and columns are predicted categories. The label order
# (sorted union of true and predicted labels) is passed explicitly so that the
# heatmap ticks can show the category names instead of bare 0/1 indices.
class_names_rf = sorted(set(y_valid) | set(rf_pred))
cm_rf = confusion_matrix(y_valid, rf_pred, labels=class_names_rf)

_, ax_rf = plt.subplots()
sns.heatmap(
    cm_rf,
    annot=True,
    fmt='g',
    ax=ax_rf,
    xticklabels=class_names_rf,
    yticklabels=class_names_rf,
)
ax_rf.set_xlabel('Predicted labels')
ax_rf.set_ylabel('True labels')
ax_rf.set_title('Confusion Matrix - Random Forest')
plt.show()

In [None]:
# Error analysis for Random Forest: put the text, the true category and the
# predicted category of every validation row side by side
valid_data_rf = pd.DataFrame({
    'combined_text': X_valid,
    'category': y_valid,
    'predicted_category': rf_pred,
})

# Keep only the rows whose prediction differs from the true category and
# export them to CSV
is_wrong_rf = valid_data_rf['category'].ne(valid_data_rf['predicted_category'])
misclassified_instances_rf = valid_data_rf.loc[is_wrong_rf]
misclassified_instances_rf.to_csv('error_analysis_rf_valid.csv', index=False)

In [None]:
# Display the validation rows that Random Forest misclassified
# (text, true category, predicted category)
misclassified_instances_rf

### Naive Bayes Error analysis

In [None]:
# Confusion matrix for the Naive Bayes predictions on the validation set.
# Rows are true categories and columns are predicted categories. The label order
# (sorted union of true and predicted labels) is passed explicitly so that the
# heatmap ticks can show the category names instead of bare 0/1 indices.
class_names_nb = sorted(set(y_valid) | set(nb_pred))
cm_nb = confusion_matrix(y_valid, nb_pred, labels=class_names_nb)

_, ax_nb = plt.subplots()
sns.heatmap(
    cm_nb,
    annot=True,
    fmt='g',
    ax=ax_nb,
    xticklabels=class_names_nb,
    yticklabels=class_names_nb,
)
ax_nb.set_xlabel('Predicted labels')
ax_nb.set_ylabel('True labels')
ax_nb.set_title('Confusion Matrix - Naive Bayes')
plt.show()

In [None]:
# Error analysis for Naive Bayes: put the text, the true category and the
# predicted category of every validation row side by side
valid_data_nb = pd.DataFrame({
    'combined_text': X_valid,
    'category': y_valid,
    'predicted_category': nb_pred,
})

# Keep only the rows whose prediction differs from the true category and
# export them to CSV
is_wrong_nb = valid_data_nb['category'].ne(valid_data_nb['predicted_category'])
misclassified_instances_nb = valid_data_nb.loc[is_wrong_nb]
misclassified_instances_nb.to_csv('error_analysis_nb_valid.csv', index=False)

In [None]:
# Display the validation rows that Naive Bayes misclassified
# (text, true category, predicted category)
misclassified_instances_nb

10. Perform an error analysis for each model tested on the previous item. Comment on your results. Consider things like: did the different models classify the same sentences incorrectly? What have you learned from this analysis?

Insights from the Confusion Matrix:  

The confusion matrix shows that the Random Forest model / the Naive Bayes model performed well overall, with a large number of true positives (TP) and true negatives (TN).
The number of false positives (FP) and false negatives (FN) is relatively low, indicating that the model's misclassifications are limited.

Misclassified Instances:  
Some patterns and observations from the misclassifications:

Overlapping Themes: Some misclassified instances seem to contain overlapping themes or topics between the "WELLNESS" and "DIVORCE" categories. This overlap could make the classification challenging for the model.

Contextual Ambiguity: In some cases, the model might have misclassified sentences due to contextual ambiguity or subtle nuances that could affect the category determination.

Uncommon Phrases: The model could struggle with sentences that contain uncommon phrases, domain-specific terminology, or slang not present in the training data.

Similar Sentence Structure: Some misclassifications might be due to sentences with similar structures but different meanings, leading the model to make incorrect predictions.

Emotional Language: Sentences with emotionally charged or ambiguous language might lead to misclassifications, as the model might have difficulty understanding the underlying sentiment.

Limited Data: Instances with fewer examples in the training data could be prone to misclassification, as the model may not have enough exposure to learn the patterns in those instances.

11. Apply at least one change to each classifier/model developed on Task 2 and redo your evaluation. Think of a change that can help improve the metric you are using to evaluate your models. This change can either be a change of a parameter, or a different preprocessing or any other change you may find interesting to implement after doing an error analysis. Depending on your primary metric, you may want to consider strategies to address the imbalance in your dataset. Save these models in an appropriate format. Comment on your choices and results. Could you achieve the benchmark you expected for this task?

### For the previous code, I considered a variety of changes and kept the best-performing version. Here are some of the adjustments I tried; the experiments shown below are the options that performed worse.

###  Change to classifier/model parameter
In order to improve the performance of the classifier/model for unbalanced datasets, I queried how to parameterize it for such data.

Random Forest Classifier:
We can adjust the class_weight parameter in the Random Forest classifier to give more importance to the minority class (DIVORCE) during training. This can help mitigate the impact of class imbalance.

Naive Bayes Classifier:
For the Naive Bayes classifier, we can use the ComplementNB variant. It estimates each class's parameters from the complement of that class (i.e., from the samples of all other classes), which makes it handle imbalanced data more effectively.

These parameter changes did not significantly change the performance of the classifier

For the deep learning model, this parameter change can significantly improve the performance, but it is strongly recommended not to run it. When I ran it on Google Colab, it needed to run for a whole day, and the result cannot be saved once the connection is interrupted.
model.fit(
    x=train_dataset.shuffle(1000).batch(16),
    validation_data=valid_dataset.batch(16),
    epochs=5
)

### different preprocessing
The text preprocessing can significantly improve model performance.
I set up the preprocessor function at the beginning and each step improves the performance.

### other change
data_combined instead of data_headline can significantly improve model performance.

Results have met the expected level.

### Try different parameter

In [None]:
# Random Forest with class weights: class_weight='balanced' weights each class
# inversely to its frequency in the training labels, so rarer classes count more.
# random_state is fixed so the scores are reproducible and small differences from
# the earlier model are not just seed-to-seed noise.
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=0)

# Train the Random Forest classifier
rf.fit(X_train_matrix, y_train)

# Naive Bayes.
# NOTE: class_prior=None is MultinomialNB's default. The class priors are then
# estimated from the training data, so NO class re-weighting is applied and this
# model is equivalent to MultinomialNB(). To actually counter the imbalance one
# would use ComplementNB or pass explicit priors.
naive_bayes = MultinomialNB(class_prior=None)
naive_bayes.fit(X_train_matrix, y_train)

In [None]:
# Re-predict the validation set with the re-trained models
# (this replaces the earlier rf_pred / nb_pred values)
rf_pred, nb_pred = (
    classifier.predict(X_valid_matrix) for classifier in (rf, naive_bayes)
)

In [None]:
# Print the validation report of each re-trained classifier under its title
for title, predictions in (
    ("Random Forest Classification Report:", rf_pred),
    ("Naive Bayes Classification Report:", nb_pred),
):
    print(title)
    print(classification_report(y_valid, predictions))

Random Forest:

The original Random Forest model achieved an overall accuracy of 96%, with precision, recall, and F1-score for both classes (DIVORCE and WELLNESS) in the high 90s range. This is a good performance.
After changing the Random Forest classifier to use class weights (class_weight='balanced'), there was a slight decrease in performance in terms of recall for the DIVORCE class. The recall for the DIVORCE class dropped from 0.90 to 0.89.
The weighted average F1-score for the Random Forest classifier remained the same after introducing class weights, indicating that the change did not have a substantial impact on the overall performance.

Naive Bayes:

Similarly, the original Naive Bayes model achieved an overall accuracy of 96%, with precision, recall, and F1-score for both classes (DIVORCE and WELLNESS) in the high 90s range. This is also a good performance.
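After re-initialising the Naive Bayes classifier with class_prior=None, the performance remained virtually the same. This is expected: class_prior=None is the default setting of MultinomialNB (the class priors are estimated from the training data), so this configuration is equivalent to the original model rather than a class-weighted one. There were no notable improvements or declines in precision, recall, or F1-scores for either class.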

The data might already be well-separated in the feature space, allowing both classifiers to perform well without much influence from class weights.
It's possible that the original Random Forest and Naive Bayes models were already adequately handling class imbalance or were not significantly impacted by it in this specific dataset.
The initial performance of both classifiers was already high, and class weights might not have been necessary in this particular scenario.


### Try data_headline
We can see that the performance of the classifier is significantly lower, and after this, I will only use the data_combined.

In [None]:
# Display the headline-only DataFrame ('headline' and 'category' columns)
data_headline

In [None]:
# Headline-only experiment: use only the 'headline' text as the feature
from sklearn.base import clone

Xh = data_headline['headline']
yh = data_headline['category']

# Split the dataset into training, validation, and test sets
# (80/20 train+valid vs test, then 75/25 train vs valid)
Xh_train_plus_valid, Xh_test, yh_train_plus_valid, yh_test = train_test_split(Xh, yh, random_state=0, test_size=0.2, train_size=0.8)
Xh_train, Xh_valid, yh_train, yh_valid = train_test_split(Xh_train_plus_valid, yh_train_plus_valid, random_state=0, test_size=0.25, train_size=0.75)

# Apply preprocessing to the text data
Xh_train = Xh_train.apply(preprocess_text)
Xh_valid = Xh_valid.apply(preprocess_text)

# Create a numeric representation of the documents.
# A dedicated vectorizer is used so that the shared `vectorizer` object is not
# replaced by one that was fitted on headlines.
vectorizer_h = CountVectorizer(stop_words='english')
Xh_train_matrix = vectorizer_h.fit_transform(Xh_train)
Xh_valid_matrix = vectorizer_h.transform(Xh_valid)

# Fit unfitted copies of the classifiers (same parameters) instead of refitting
# the shared `rf` / `naive_bayes` objects in place; otherwise those objects would
# no longer match X_train_matrix / X_valid_matrix if an earlier cell is re-run.
rf_h = clone(rf).fit(Xh_train_matrix, yh_train)
nb_h = clone(naive_bayes).fit(Xh_train_matrix, yh_train)
rf_predh = rf_h.predict(Xh_valid_matrix)
nb_predh = nb_h.predict(Xh_valid_matrix)

In [None]:
# Print the headline-only validation report of each classifier under its title
for title, predictions in (
    ("Random Forest Classification Report:", rf_predh),
    ("Naive Bayes Classification Report:", nb_predh),
):
    print(title)
    print(classification_report(yh_valid, predictions))

### Try without preprocessing

In [None]:
# Display the headline-only DataFrame ('headline' and 'category' columns)
data_headline

In [None]:
# Headline-only experiment WITHOUT text preprocessing: the raw headlines are
# vectorized directly
from sklearn.base import clone

Xh = data_headline['headline']
yh = data_headline['category']

# Split the dataset into training, validation, and test sets
# (80/20 train+valid vs test, then 75/25 train vs valid)
Xh_train_plus_valid, Xh_test, yh_train_plus_valid, yh_test = train_test_split(Xh, yh, random_state=0, test_size=0.2, train_size=0.8)
Xh_train, Xh_valid, yh_train, yh_valid = train_test_split(Xh_train_plus_valid, yh_train_plus_valid, random_state=0, test_size=0.25, train_size=0.75)

# Create a numeric representation of the documents.
# A dedicated vectorizer is used so that the shared `vectorizer` object is not
# replaced by one that was fitted on raw headlines.
vectorizer_h_raw = CountVectorizer(stop_words='english')
Xh_train_matrix = vectorizer_h_raw.fit_transform(Xh_train)
Xh_valid_matrix = vectorizer_h_raw.transform(Xh_valid)

# Fit unfitted copies of the classifiers (same parameters) instead of refitting
# the shared `rf` / `naive_bayes` objects in place; otherwise those objects would
# no longer match X_train_matrix / X_valid_matrix if an earlier cell is re-run.
rf_h_raw = clone(rf).fit(Xh_train_matrix, yh_train)
nb_h_raw = clone(naive_bayes).fit(Xh_train_matrix, yh_train)
rf_predh = rf_h_raw.predict(Xh_valid_matrix)
nb_predh = nb_h_raw.predict(Xh_valid_matrix)

In [None]:
# Print the validation report of each classifier for the run without preprocessing
for title, predictions in (
    ("Random Forest Classification Report:", rf_predh),
    ("Naive Bayes Classification Report:", nb_predh),
):
    print(title)
    print(classification_report(yh_valid, predictions))

12.Merge your train and validation sets and perform cross validation using the classifiers from item 11. Comment on your results.

In [None]:
# Merge the training and validation sets
X_train_plus_valid = pd.concat([train_data['preprocessed_text'], valid_data['preprocessed_text']])
y_train_plus_valid = pd.concat([train_data['category'], valid_data['category']])

In [None]:
# Create a numeric representation of the documents
vectorizer = CountVectorizer(stop_words='english')
X_train_plus_valid_matrix = vectorizer.fit_transform(X_train_plus_valid)

In [None]:
# Initialize the classifiers with desired parameters
rf = RandomForestClassifier(n_estimators=100)
naive_bayes = MultinomialNB()

In [None]:
from sklearn.model_selection import cross_val_predict, cross_val_score

# 5-fold cross-validation of each classifier on the merged train+validation
# matrix; Random Forest first, then Naive Bayes
cross_val_rf, cross_val_nb = (
    cross_val_score(classifier, X_train_plus_valid_matrix, y_train_plus_valid, cv=5)
    for classifier in (rf, naive_bayes)
)

In [None]:
# Print the cross-validation scores
print("Random Forest Cross-Validation Scores:", cross_val_rf)
print("Naive Bayes Cross-Validation Scores:", cross_val_nb)

Cross-validation scores provide an estimate of the models' performance on unseen data and can serve as a good indicator of their generalization ability.

The Random Forest model shows relatively consistent performance across different folds of the data, with cross-validation scores ranging from approximately 94.9% to 95.9%.
The average cross-validation score is around 95.4%, indicating that the model performs reasonably well on unseen data, achieving an accuracy of about 95.4%.

The Naive Bayes model also demonstrates consistent performance across different folds, with cross-validation scores ranging from approximately 93.9% to 95.2%.
The average cross-validation score is around 94.6%, suggesting that the Naive Bayes model is slightly less accurate compared to the Random Forest model, achieving an average accuracy of about 94.6%.


13. Choose the best model from the previous item, load it using the files created on item 11 and apply it to the test set (test.csv). Make sure you are preprocessing the test set exactly the same way you preprocessed the data you used to train the model.

The Random Forest classifier has slightly higher cross-validation scores compared to Naive Bayes. The deep learning models also performed well, but often caused the program to crash. Therefore, I chose the Random Forest model as the best model.

To ensure that the test set is preprocessed in exactly the same way as the data used to train the model, the same preprocessing was already applied to it earlier; here the data only needs to be loaded.

14. Retrain the best model from Task 3 (item 11) using the train and validation datasets and now apply to the test set. Did training the model with more data make any difference? Comment on your results.

In [None]:
# Create a single CountVectorizer and fit it on the combined training and validation set
vectorizer = CountVectorizer(stop_words='english')
X_train_plus_valid_matrix = vectorizer.fit_transform(X_train_plus_valid)

X_train_matrix = vectorizer.transform(X_train)
X_valid_matrix = vectorizer.transform(X_valid)

X_test_matrix = vectorizer.transform(X_test)

In [None]:
# Train the Random Forest classifier on the training dataset alone
rf_train = RandomForestClassifier(n_estimators=100)
rf_train.fit(X_train_matrix, y_train)

In [None]:
# Train the Random Forest classifier on the combined training and validation dataset
rf_train_valid = RandomForestClassifier(n_estimators=100)
rf_train_valid.fit(X_train_plus_valid_matrix, y_train_plus_valid)

In [None]:
# Display the test-set matrix produced by vectorizer.transform(X_test)
X_test_matrix

In [None]:
# Apply the Random Forest model to the test set
y_pred_test_rf_train_valid = rf_train_valid.predict(X_test_matrix)
y_pred_test_rf_train = rf_train.predict(X_test_matrix)

In [None]:
# Evaluate the models on the test set
accuracy_rf_train = accuracy_score(y_test, y_pred_test_rf_train)
accuracy_rf_train_valid = accuracy_score(y_test, y_pred_test_rf_train_valid)

In [None]:
# Report both test-set accuracies, one per line
for label, accuracy in (
    ("Test Set Accuracy - Random Forest trained on Train Data:", accuracy_rf_train),
    ("Test Set Accuracy - Random Forest trained on Train + Validation Data:", accuracy_rf_train_valid),
):
    print(label, accuracy)

In [None]:
# Test-set report for the forest trained on the training data only
print(
    "Random Forest Classification Report - Train Data:",
    classification_report(y_test, y_pred_test_rf_train),
    sep="\n",
)

In [None]:
# Test-set report for the forest trained on training + validation data
print(
    "Random Forest Classification Report - Train + Validation Data:",
    classification_report(y_test, y_pred_test_rf_train_valid),
    sep="\n",
)

Test Set Accuracy: The accuracy on the Test set increased from approximately 95.09% (trained on Train data) to 95.53% (trained on Train + Validation data). This suggests that the model trained on more data is better at making correct predictions on unseen examples.
Precision, Recall, and F1-score: For both classes (DIVORCE and WELLNESS), the precision, recall, and F1-scores remain high in both cases. This indicates that the model is good at correctly identifying instances of each class, with the combined dataset model slightly outperforming the Train data model.
Overall Performance: The macro and weighted average F1-scores for the Train + Validation data model are slightly higher than those for the Train data model. This means the model trained on more data achieves a better balance between class-specific and overall performance.
Class Imbalance: Since the performance metrics for both classes are high, it appears that the model handles the class imbalance well, which is evident from the similar precision, recall, and F1-scores for both classes.
Overall, the increase in accuracy and slight improvement in other performance metrics suggest that training the model with more data (Train + Validation) has positively affected its performance. It showcases how leveraging additional data for training can lead to better generalization and robustness of the model, making it more effective in making accurate predictions on unseen data (Test set). However, the improvement might be considered modest, as the increase in accuracy is not substantial, but any improvement in real-world applications is valuable.