In [None]:
import random
import pandas as pd
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from bornrule import BornClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Seed Python's built-in `random` module so random.shuffle() calls are repeatable.
# NOTE: this does not seed NumPy's RNG.
random.seed(42)

# Silence every FutureWarning, and UserWarnings whose message begins with
# "Variables are collinear", to keep the experiment output readable.
warnings.simplefilter("ignore", FutureWarning)
warnings.filterwarnings("ignore", "Variables are collinear", UserWarning)

def _flatten_pairs(pair_list):
    """Flatten sentence pairs into parallel sentence and label lists.

    Args:
        pair_list: Iterable of ``(original_sentence, original_label,
            paraphrased_sentence, paraphrased_label)`` tuples.

    Returns:
        A tuple ``(X, y)``. Each pair contributes its original sentence
        followed by its paraphrase to ``X`` and the matching labels, in the
        same order, to ``y``.
    """
    X, y = [], []
    for orig_sent, orig_label, para_sent, para_label in pair_list:
        X.extend((orig_sent, para_sent))
        y.extend((orig_label, para_label))
    return X, y


def _compute_avg_f1(report):
    """Return the mean of the macro-avg and weighted-avg F1 scores.

    Args:
        report: Dictionary produced by
            ``classification_report(..., output_dict=True)``.
    """
    macro_f1 = report["macro avg"]["f1-score"]
    weighted_f1 = report["weighted avg"]["f1-score"]
    return (macro_f1 + weighted_f1) / 2.0


def main(csv_path=r""):
    """Train, tune and compare several text classifiers on sentence pairs.

    The CSV is filtered to rows whose original and paraphrased sentences both
    carry one of the allowed expert labels. Rows are shuffled and split 80/20
    at the pair level, so an original sentence and its paraphrase always land
    on the same side of the train/test split. Each classifier is tuned with a
    3-fold grid search (macro F1), evaluated on both splits, and the reports
    are printed with the best classifier (by average test F1) marked with
    ``**``.

    Args:
        csv_path: Path of the CSV file to load. It must contain the columns
            ``original_sentence``, ``paraphrased_sentence`` and the two
            ``*_classification_experts_classification`` label columns.
    """
    # Seed for estimators that draw random numbers internally. Seeding the
    # stdlib `random` module alone does not make them reproducible.
    seed = 42

    orig_label_col = 'original_sentence_classification_experts_classification'
    para_label_col = 'paraphrased_sentence_classification_experts_classification'

    df = pd.read_csv(csv_path)

    # Keep only rows where both classification columns hold an allowed class.
    allowed_classes = ["Formal/Polite", "Informal", "Derogatory", "Taboo"]
    df = df[
        df[orig_label_col].isin(allowed_classes) &
        df[para_label_col].isin(allowed_classes)
    ]

    # One tuple per row:
    # (original_sentence, original_label, paraphrased_sentence, paraphrased_label)
    pairs = list(zip(
        df['original_sentence'],
        df[orig_label_col],
        df['paraphrased_sentence'],
        df[para_label_col],
    ))

    # Shuffle the pairs so that training/test splits are random.
    random.shuffle(pairs)

    # Train/test split on pairs (each original and its paraphrase stay together).
    train_count = int(0.8 * len(pairs))
    train_pairs = pairs[:train_count]
    test_pairs = pairs[train_count:]

    X_train, y_train = _flatten_pairs(train_pairs)
    X_test, y_test = _flatten_pairs(test_pairs)

    # Bag-of-words features; the vocabulary is learned from the training set only.
    vectorizer = CountVectorizer()
    X_train_vect = vectorizer.fit_transform(X_train)
    X_test_vect = vectorizer.transform(X_test)

    # Dense copies for the classifiers flagged as needing dense input.
    X_train_dense = X_train_vect.toarray()
    X_test_dense = X_test_vect.toarray()

    # Each tuple is (name, classifier instance, requires_dense).
    # "requires_dense" selects the dense arrays instead of the sparse matrices.
    classifiers = [
        ("BornClassifier", BornClassifier(a=0.5, b=1.0, h=1.0), False),
        ("Nearest Neighbors", KNeighborsClassifier(), False),
        ("Linear SVM", LinearSVC(dual=False, max_iter=5000), False),
        ("Random Forest", RandomForestClassifier(random_state=seed), True),
        ("Neural Net", MLPClassifier(max_iter=1000, random_state=seed), True),
        ("Naive Bayes", MultinomialNB(), False),
    ]

    # Parameter grids for hyperparameter tuning, keyed by classifier name.
    param_grids = {
        "BornClassifier": {"a": [0.1,0.3,0.5,0.7,0.9], "b": [0.3,0.5, 1.5, 2.0,1], "h": [0.3,0.5, 1.5, 2.0,1]},
        "Nearest Neighbors": {"n_neighbors": [2,3, 5, 7, 9,10,15,20]},
        "Linear SVM": {"C": [0.01,0.05,0.1, 1, 10,50]},
        "Random Forest": {"n_estimators": [50, 100, 200,300], "max_depth": [None, 10, 20,50], "min_samples_split": [2, 5,10]},
        "Neural Net": {"hidden_layer_sizes": [(50,), (100,), (500,)], "activation": ["relu", "tanh"], "alpha": [0.0001, 0.001,0.01]},
        "Naive Bayes": {"alpha": [0.1,0.5, 1.0, 1.5]},
    }

    # Key: classifier name; value: dict with 'train_report', 'test_report', 'avg_f1'.
    results = {}

    # Best classifier so far, ranked by average test F1 (mean of macro and weighted F1).
    best_avg_f1 = -1
    best_classifier_name = ""

    # Train and evaluate each classifier.
    for name, clf, requires_dense in classifiers:
        print("=" * 80)
        print(f"Classifier: {name}")

        # Choose the appropriate input (dense or sparse).
        if requires_dense:
            X_train_used, X_test_used = X_train_dense, X_test_dense
        else:
            X_train_used, X_test_used = X_train_vect, X_test_vect

        param_grid = param_grids.get(name)
        if param_grid:
            print(f"Performing grid search for {name}...")
            # NOTE(review): the 3 folds are drawn over the flattened rows, so an
            # original sentence and its paraphrase can fall into different
            # folds. Consider a group-aware splitter if that leakage matters.
            grid_search = GridSearchCV(clf, param_grid, cv=3, n_jobs=-1, scoring='f1_macro')
            grid_search.fit(X_train_used, y_train)
            print(f"Best parameters for {name}: {grid_search.best_params_}")
            # With the default refit=True, best_estimator_ has already been
            # refit on the whole training set, so no second fit is needed.
            clf = grid_search.best_estimator_
        else:
            # No grid defined: fit the classifier with its configured parameters.
            clf.fit(X_train_used, y_train)

        # Evaluate on the training set.
        y_train_pred = clf.predict(X_train_used)
        train_report_str = classification_report(y_train, y_train_pred, zero_division=0)

        # Evaluate on the test set (text for display, dict for scoring).
        y_test_pred = clf.predict(X_test_used)
        test_report_str = classification_report(y_test, y_test_pred, zero_division=0)
        test_report = classification_report(y_test, y_test_pred, output_dict=True, zero_division=0)

        avg_f1 = _compute_avg_f1(test_report)

        results[name] = {
            'train_report': train_report_str,
            'test_report': test_report_str,
            'avg_f1': avg_f1
        }

        # Update the best classifier if this one has a higher average F1.
        if avg_f1 > best_avg_f1:
            best_avg_f1 = avg_f1
            best_classifier_name = name

    print("=" * 80)
    print("\nResults Summary:\n")
    # Print results for each classifier; mark the best classifier's name with **.
    for name, res in results.items():
        display_name = f"**{name}**" if name == best_classifier_name else name
        print(f"Classifier: {display_name}")
        print("-" * 40)
        print("Training Set Results:")
        print(res['train_report'])
        print("Test Set Results:")
        print(res['test_report'])
        print("\n")

# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


Classifier: BornClassifier
Performing grid search for BornClassifier...
Best parameters for BornClassifier: {'a': 0.5, 'b': 1, 'h': 0.3}
Classifier: Nearest Neighbors
Performing grid search for Nearest Neighbors...
Best parameters for Nearest Neighbors: {'n_neighbors': 2}
Classifier: Linear SVM
Performing grid search for Linear SVM...
Best parameters for Linear SVM: {'C': 0.1}
Classifier: RBF SVM
Performing grid search for RBF SVM...
Best parameters for RBF SVM: {'C': 10, 'gamma': 'scale'}
Classifier: Random Forest
Performing grid search for Random Forest...
Best parameters for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Classifier: Neural Net
Performing grid search for Neural Net...
Best parameters for Neural Net: {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (100,)}
Classifier: AdaBoost
Performing grid search for AdaBoost...
Best parameters for AdaBoost: {'learning_rate': 1.5, 'n_estimators': 100}
Classifier: Naive Bayes
Performing g