# NewsMTSC

In [3]:
import pandas as pd
import random

# Load the per-sentiment NewsMTSC files (JSON Lines: one record per line),
# in the order negative, neutral, positive.
negative_df, neutral_df, positive_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        "data/processed_dataset/negative/negative_newsmtsc_s.jsonl",
        "data/processed_dataset/neutral/neutral_newsmtsc_s.jsonl",
        "data/processed_dataset/positive/positive_newsmtsc_s.jsonl",
    )
)

# Stack the three frames; each frame keeps its own row labels.
combined_df = pd.concat([negative_df, neutral_df, positive_df])

# The article ID is the part of "primary_gid" before the first underscore.
combined_df["article_id"] = combined_df["primary_gid"].map(lambda gid: gid.partition("_")[0])

# Grouping is only used to obtain the distinct article IDs.
grouped_df = combined_df.groupby("article_id")

# Shuffle the article IDs with a fixed seed so the split is reproducible.
article_ids = list(grouped_df.groups)
random.seed(42)
random.shuffle(article_ids)

# Cut points: the first 80 % of the shuffled IDs go to train, IDs up to the
# 90 % mark go to dev, and the remainder goes to test.
n_articles = len(article_ids)
train_split_index = int(n_articles * 0.80)
dev_split_index = int(n_articles * 0.90)
train_article_ids, dev_article_ids, test_article_ids = (
    article_ids[:train_split_index],
    article_ids[train_split_index:dev_split_index],
    article_ids[dev_split_index:],
)

# Splitting on article ID keeps all records of one article in the same split.
article_of_row = combined_df["article_id"]
train_df = combined_df[article_of_row.isin(train_article_ids)]
dev_df = combined_df[article_of_row.isin(dev_article_ids)]
test_df = combined_df[article_of_row.isin(test_article_ids)]

# Write each split as JSON Lines without the helper "article_id" column.
for split_df, out_path in (
    (train_df, "data/processed_dataset/newsmtsc/train.jsonl"),
    (dev_df, "data/processed_dataset/newsmtsc/dev.jsonl"),
    (test_df, "data/processed_dataset/newsmtsc/test.jsonl"),
):
    split_df.drop(columns="article_id").to_json(out_path, orient="records", lines=True, force_ascii=False)

# ABSA-PyTorch

### Positive, Negative and Neutral classes for 3-class problem

In [1]:
import pandas as pd
import random

# Load the negative / neutral / positive ABSA-PyTorch files (JSON Lines).
negative_df = pd.read_json(
    "data/processed_dataset/negative/negative_absa_pytorch.jsonl", lines=True
)
neutral_df = pd.read_json(
    "data/processed_dataset/neutral/neutral_absa_pytorch.jsonl", lines=True
)
positive_df = pd.read_json(
    "data/processed_dataset/positive/positive_absa_pytorch.jsonl", lines=True
)

# Stack the three classes and derive the article ID, which is the part of the
# "id" column before the first underscore.
combined_df = pd.concat([negative_df, neutral_df, positive_df]).assign(
    article_id=lambda frame: frame["id"].apply(lambda record_id: record_id.split("_")[0])
)

# Grouping is only used to obtain the distinct article IDs.
grouped_df = combined_df.groupby("article_id")

# Seeded shuffle of the article IDs, so the split is reproducible.
article_ids = list(grouped_df.groups)
random.seed(42)
random.shuffle(article_ids)

# 80 % / 90 % cut points over the shuffled article IDs.
n_articles = len(article_ids)
train_split_index = int(n_articles * 0.80)
dev_split_index = int(n_articles * 0.90)

# Train: before the 80 % mark; dev: between the marks; test: the remainder.
train_article_ids = article_ids[:train_split_index]
dev_article_ids = article_ids[train_split_index:dev_split_index]
test_article_ids = article_ids[dev_split_index:]

# Select rows by article ID so an article never straddles two splits.
# NOTE(review): this cell does not write its splits to disk, and the
# ambivalent cell that follows reassigns combined_df, train_df, dev_df and
# test_df, so these 3-class splits only exist until that cell runs.
train_df, dev_df, test_df = (
    combined_df[combined_df["article_id"].isin(split_ids)]
    for split_ids in (train_article_ids, dev_article_ids, test_article_ids)
)

### Ambivalent class for 4-class problem

In [2]:
import pandas as pd
import random

# Load the ambivalent-class ABSA-PyTorch records (JSON Lines).
ambivalent_df = pd.read_json("data/processed_dataset/ambivalent/ambivalent_absa_pytorch.jsonl", lines=True)

# Only one class here, so nothing is concatenated: combined_df is the same
# object as ambivalent_df, and the column added below appears on both names.
combined_df = ambivalent_df

# The article ID is the part of the "id" column before the first underscore.
combined_df["article_id"] = combined_df["id"].apply(lambda record_id: record_id.partition("_")[0])

# Grouping is only used to obtain the distinct article IDs.
grouped_df = combined_df.groupby("article_id")

# Seeded shuffle of the article IDs, so the split is reproducible.
article_ids = list(grouped_df.groups)
random.seed(42)
random.shuffle(article_ids)

# 80 % / 90 % cut points over the shuffled article IDs.
train_split_index, dev_split_index = (
    int(len(article_ids) * fraction) for fraction in (0.80, 0.90)
)

# Train: before the 80 % mark; dev: between the marks; test: the remainder.
train_article_ids = article_ids[:train_split_index]
dev_article_ids = article_ids[train_split_index:dev_split_index]
test_article_ids = article_ids[dev_split_index:]

# Select rows by article ID so an article never straddles two splits.
in_train = combined_df["article_id"].isin(train_article_ids)
in_dev = combined_df["article_id"].isin(dev_article_ids)
in_test = combined_df["article_id"].isin(test_article_ids)
train_df = combined_df[in_train]
dev_df = combined_df[in_dev]
test_df = combined_df[in_test]

In [3]:
# Write whatever train_df / dev_df / test_df currently hold as JSON Lines
# (one record per line), without the helper "article_id" column.
for split_df, out_path in (
    (train_df, "data/processed_dataset/pyabsa/train_a.jsonl"),
    (dev_df, "data/processed_dataset/pyabsa/dev_a.jsonl"),
    (test_df, "data/processed_dataset/pyabsa/test_a.jsonl"),
):
    split_df.drop(columns="article_id").to_json(out_path, orient="records", lines=True, force_ascii=False)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Load the per-class embedding tables; the CSV files have no header row.
positive_embeddings_df = pd.read_csv('data/processed_dataset/positive/embeddings.csv', header=None)
negative_embeddings_df = pd.read_csv('data/processed_dataset/negative/embeddings.csv', header=None)
neutral_embeddings_df = pd.read_csv('data/processed_dataset/neutral/embeddings.csv', header=None)

# Row labels of the current splits, kept under their original names.
# They are NOT used for the selection below: combined_df is built with
# pd.concat without ignore_index, so the same label can occur once per class
# and cannot identify a row on its own.
train_indices = train_df.index
dev_indices = dev_df.index
test_indices = test_df.index

# Stack the embedding frames (not the text frames) in the same class order
# that the 3-class cell uses for combined_df: negative, neutral, positive.
combined_embeddings = pd.concat(
    [negative_embeddings_df, neutral_embeddings_df, positive_embeddings_df],
    ignore_index=True,
)

# The selection is positional, so the two tables must line up row for row.
# Assumes row i of each embeddings.csv belongs to row i of the same-class
# JSONL file - TODO confirm against the script that produced the embeddings.
# The check also fails fast when combined_df was last assigned by a cell
# other than the 3-class split (for example the ambivalent one).
if len(combined_embeddings) != len(combined_df):
    raise ValueError(
        f"embeddings have {len(combined_embeddings)} rows but combined_df has "
        f"{len(combined_df)}; re-run the 3-class split cell before this one"
    )

# Select embeddings for training set: a boolean mask over row positions.
# (Plain `frame[labels]` would select columns, not rows.)
train_mask = combined_df["article_id"].isin(train_article_ids).to_numpy()
train_embeddings = combined_embeddings.loc[train_mask]

In [None]:
# Load each class's embedding table (no header row) and tag it with its
# integer class label: negative = 0, neutral = 1, positive = 2.
# These assignments rebind positive_df / negative_df / neutral_df, which the
# split cells earlier in the notebook use for the text records.
positive_df = pd.read_csv('data/processed_dataset/positive/embeddings.csv', header=None).assign(label=2)
negative_df = pd.read_csv('data/processed_dataset/negative/embeddings.csv', header=None).assign(label=0)
neutral_df = pd.read_csv('data/processed_dataset/neutral/embeddings.csv', header=None).assign(label=1)

# One table with a fresh 0..n-1 row index, stacked positive, negative, neutral.
merged_df = pd.concat([positive_df, negative_df, neutral_df], ignore_index=True)

# Labels (y) and features (X: every column except 'label').
y = merged_df['label']
X = merged_df.drop(columns='label')

# Seeded 80 % / 20 % train/test split.
# NOTE(review): rows are split individually here, whereas the JSONL splits
# earlier in the notebook keep all rows of one article together - confirm
# that this difference is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, all created with the same random seed.
classifiers = dict([
    ('XGBoost', XGBClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(random_state=42)),
])