In [10]:
from transformers import AutoTokenizer, AutoModel
import json
import torch
import torch.nn.functional as F
import pandas as pd

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

with open('data/processed_dataset/negative/negative_absa_pytorch.jsonl', 'r') as f:
    for line in f:
        try:
            record = json.loads(line)
            encoded_input = tokenizer(record['sentence'], text_pair=record['aspect'], padding=True, truncation=True, return_tensors='pt')
            
            with torch.no_grad():
                model_output = model(**encoded_input)
            
            sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
            sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)
            sentence_embedding_2d = sentence_embedding.squeeze().tolist()
            
            num_columns = len(sentence_embedding_2d)
            column_names = [f'{i}' for i in range(num_columns)]
            df = pd.DataFrame([sentence_embedding_2d], columns=column_names)
            
            df.to_csv('data/processed_dataset/negative/embeddings.csv', mode='a', header=False, index=False)
            
        except Exception as e:
            print(e)
            continue

In [39]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import json
import random

In [18]:
positive_ids = []
negative_ids = []
neutral_ids = []

In [21]:
with open ('data/processed_dataset/neutral/neutral_absa_pytorch.jsonl', 'r') as f:
    # get article id from each line
    for record in f:
        record = json.loads(record)
        article_id = record['id']
        neutral_ids.append(article_id)

In [29]:
positive_df = pd.read_csv('data/processed_dataset/positive/embeddings.csv', header=None)
negative_df = pd.read_csv('data/processed_dataset/negative/embeddings.csv', header=None)
neutral_df = pd.read_csv('data/processed_dataset/neutral/embeddings.csv', header=None)

In [33]:
positive_df = pd.read_csv('data/processed_dataset/positive/embeddings.csv', header=None)
negative_df = pd.read_csv('data/processed_dataset/negative/embeddings.csv', header=None)
neutral_df = pd.read_csv('data/processed_dataset/neutral/embeddings.csv', header=None)
# add column names
positive_df.columns = [f'{i}' for i in range(384)]

In [None]:
positive_df

In [60]:
# Read the embeddings.csv files for each class
positive_df = pd.read_csv('data/processed_dataset/positive/embeddings.csv', header=None)
negative_df = pd.read_csv('data/processed_dataset/negative/embeddings.csv', header=None)
neutral_df = pd.read_csv('data/processed_dataset/neutral/embeddings.csv', header=None)
# add column names
positive_df.columns = [f'{i}' for i in range(384)]
negative_df.columns = [f'{i}' for i in range(384)]
neutral_df.columns = [f'{i}' for i in range(384)]
# add id column

positive_df['article_id'] = positive_ids
negative_df['article_id'] = negative_ids
neutral_df['article_id'] = neutral_ids

positive_df['label'] = 2
negative_df['label'] = 0
neutral_df['label'] = 1

# Merge the DataFrames
combined_df = pd.concat([positive_df, negative_df, neutral_df], ignore_index=True)

grouped_df = combined_df.groupby("article_id")

# Shuffle the article IDs
article_ids = list(grouped_df.groups.keys())
random.seed(42)
random.shuffle(article_ids)

# Calculate the split indices
train_split_index = int(len(article_ids) * 0.80)
dev_split_index = int(len(article_ids) * 0.90)
# Split the article IDs into training, development, and testing sets
train_article_ids = article_ids[:train_split_index]
dev_article_ids = article_ids[train_split_index:dev_split_index]
test_article_ids = article_ids[dev_split_index:]

# Create training, development, and testing DataFrames based on the article IDs
train_df = combined_df[combined_df["article_id"].isin(train_article_ids)]
dev_df = combined_df[combined_df["article_id"].isin(dev_article_ids)]
test_df = combined_df[combined_df["article_id"].isin(test_article_ids)]


In [61]:
X_train, y_train = train_df.drop(columns=["article_id", "label"]), train_df["label"]
X_dev, y_dev = dev_df.drop(columns=["article_id", "label"]), dev_df["label"]
X_test, y_test = test_df.drop(columns=["article_id", "label"]), test_df["label"]

In [6]:
# Read the embeddings.csv files for each class
positive_df = pd.read_csv('data/processed_dataset/positive/embeddings.csv', header=None)
negative_df = pd.read_csv('data/processed_dataset/negative/embeddings.csv', header=None)
neutral_df = pd.read_csv('data/processed_dataset/neutral/embeddings.csv', header=None)

# Add class labels to each DataFrame
positive_df['label'] = 2
negative_df['label'] = 0
neutral_df['label'] = 1

# Merge the DataFrames
merged_df = pd.concat([positive_df, negative_df, neutral_df], ignore_index=True)

# Split the merged DataFrame into features (X) and labels (y)
X = merged_df.drop('label', axis=1)
y = merged_df['label']

In [None]:
# Create train-test split with 80% train and 20% test data, and a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from itertools import product


classifiers = {
    'XGBoost': {
        'model': XGBClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.1, 0.3],
            'max_depth': [3, 5]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 5],
            'min_samples_split': [5, 10]
        }
    },
    'SVM': {
        'model': SVC(random_state=42),
        'params': {
            'C': [1, 10],
            'kernel': ['linear', 'rbf', 'poly'],
        }
    }
}

# # Perform k-fold cross-validation for each classifier
best_params = {}
for name, classifier in classifiers.items():
    model = classifier['model']
    params = classifier['params']
    # create combinations of hyperparameters
    params = [{k: v for k, v in zip(params.keys(), values)} for values in product(*params.values())]
    for param in params:
        model.set_params(**param)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_dev)
        accuracy = accuracy_score(y_dev, y_pred)
        print(f"Accuracy for {name} and params {param}: {accuracy}")

    # grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy')
    # grid_search.fit(X_train, y_train)
    
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

In [53]:
best_params['XGBoost'] = {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 200}
best_params['Random Forest'] = {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
best_params['SVM'] = {'C': 10, 'kernel': 'poly'}

In [64]:
X_train2 = pd.concat([X_train, X_dev])
y_train2 = pd.concat([y_train, y_dev])
model = XGBClassifier(random_state=42)
model.fit(X_train2, y_train2)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Accuracy: 0.7238605898123325
F1 Score: 0.7161189313272656


In [63]:
for name, classifier in classifiers.items():
    model = classifier['model']
    model.set_params(**best_params[name])
    # merge training and development data
    X_train2 = pd.concat([X_train, X_dev])
    y_train2 = pd.concat([y_train, y_dev])
    model.fit(X_train2, y_train2)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"\n{name} Classifier (with best parameters):")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")


XGBoost Classifier (with best parameters):
Accuracy: 0.6783
Precision: 0.6774
Recall: 0.6783
F1-score: 0.6752

Random Forest Classifier (with best parameters):
Accuracy: 0.6622
Precision: 0.6643
Recall: 0.6622
F1-score: 0.6445

SVM Classifier (with best parameters):
Accuracy: 0.7399
Precision: 0.7382
Recall: 0.7399
F1-score: 0.7385


In [55]:
# Define the classifiers
classifiers = {
    'XGBoost': XGBClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
    }

for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f'{name} Classifier')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')

XGBoost Classifier
Accuracy: 0.6729
Precision: 0.6731
Recall: 0.6729
F1 Score: 0.6670
Random Forest Classifier
Accuracy: 0.6649
Precision: 0.6637
Recall: 0.6649
F1 Score: 0.6515
SVM Classifier
Accuracy: 0.7319
Precision: 0.7316
Recall: 0.7319
F1 Score: 0.7263


In [45]:
# Define the classifiers
classifiers = {
    'XGBoost': XGBClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42)
    }

for name, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f'{name} Classifier')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    

XGBoost Classifier
Accuracy: 0.7239
Precision: 0.7317
Recall: 0.7239
F1 Score: 0.7161
Random Forest Classifier
Accuracy: 0.6890
Precision: 0.6922
Recall: 0.6890
F1 Score: 0.6780
SVM Classifier
Accuracy: 0.7292
Precision: 0.7338
Recall: 0.7292
F1 Score: 0.7215


In [11]:
df = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'Random State'])

for i in range(10):
    print(f"Random State: {i}")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    
    for name, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        # create a new DataFrame with the results and concatenate it with the existing DataFrame
        new_row = pd.DataFrame({
            'Classifier': [name],
            'Accuracy': [accuracy],
            'Precision': [precision],
            'Recall': [recall],
            'F1-score': [f1],
            'Random State': [i]
        })
        df = pd.concat([df, new_row], ignore_index=True)

Random State: 0


  df = pd.concat([df, new_row], ignore_index=True)


Random State: 1
Random State: 2
Random State: 3
Random State: 4
Random State: 5
Random State: 6
Random State: 7
Random State: 8
Random State: 9


In [12]:
df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1-score,Random State
0,XGBoost,0.804582,0.802556,0.804582,0.801733,0
1,Random Forest,0.780323,0.779837,0.780323,0.772709,0
2,SVM,0.787062,0.783315,0.787062,0.781497,0
3,XGBoost,0.791105,0.790177,0.791105,0.789213,1
4,Random Forest,0.778976,0.784769,0.778976,0.775157,1
5,SVM,0.760108,0.760228,0.760108,0.756177,1
6,XGBoost,0.814016,0.814767,0.814016,0.809016,2
7,Random Forest,0.792453,0.798062,0.792453,0.785641,2
8,SVM,0.792453,0.795234,0.792453,0.786317,2
9,XGBoost,0.819407,0.826593,0.819407,0.817405,3


In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the classifiers and their hyperparameters
classifiers = {
    'XGBoost': {
        'model': XGBClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.3],
            'max_depth': [3, 5, 7]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10]
        }
    },
    'SVM': {
        'model': SVC(random_state=42),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf', 'poly'],
            'degree': [2, 3, 4]
        }
    }
}

# Perform k-fold cross-validation for each classifier
best_params = {}
for name, classifier in classifiers.items():
    model = classifier['model']
    params = classifier['params']
    
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {grid_search.best_params_}")

KeyboardInterrupt: 

In [None]:
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

# Train the models with the best parameters on the full training data and evaluate on the test set
    for name, classifier in classifiers.items():
        model = classifier['model']
        model.set_params(**best_params[name])
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        print(f"\n{name} Classifier (with best parameters):")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-score: {f1:.4f}")

In [None]:
print(f"\n{name} Classifier (with best parameters):")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-score: {f1:.4f}")