In [None]:
# import libraries
import re
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [None]:
# ------------------------------------------------------------------------------
# 1. Custom Transformer for text cleaning
# ------------------------------------------------------------------------------
class TextCleaner(BaseEstimator, TransformerMixin):
    r"""
    Stateless text-cleaning transformer.

    For every document it:
    - lowercases all characters,
    - replaces the non-breaking space (\xa0) and the DEL control
      character (\x7f) with a plain space,
    - replaces every character that is not a-z, 0-9 or whitespace
      with a space.

    Missing values (None / NaN) are treated as empty strings and
    non-string values are converted with ``str()``.
    """

    # Compiled once at class level; matches everything that must be blanked out.
    _NON_ALNUM = re.compile(r'[^a-z0-9\s]')

    def fit(self, X, y=None):
        """Do nothing and return ``self``; cleaning has no learned state."""
        return self

    def transform(self, X, y=None):
        """
        Clean a collection of documents.

        Parameters
        ----------
        X : pandas Series or other iterable of str
            Raw documents. A Series is used as-is (its index is kept);
            any other iterable is wrapped in a new Series.
        y : ignored

        Returns
        -------
        pandas.Series
            One cleaned string per input document.

        Raises
        ------
        ValueError
            If ``X`` is a single string rather than a collection of documents.
        """
        if isinstance(X, str):
            # Iterating a bare string would silently treat each character
            # as its own document.
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )

        # Accept lists / arrays too, not only a pandas Series.
        texts = X if isinstance(X, pd.Series) else pd.Series(list(X), dtype=object)

        cleaned = (
            texts
            .fillna('')                                     # missing text -> empty document
            .astype(str)                                    # guard against non-string cells
            .str.lower()                                    # lowercase text
            .str.replace('\xa0', ' ', regex=False)          # non-breaking space -> space
            .str.replace('\x7f', ' ', regex=False)          # DEL control character -> space
            .apply(lambda txt: self._NON_ALNUM.sub(' ', txt))  # blank out non-alphanumerics
        )
        return cleaned

In [None]:
# ------------------------------------------------------------------------------
# 2. Custom Transformer for stylometric features
# ------------------------------------------------------------------------------
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """
    Extract three stylometric features per document:
    - Average word length (mean length of whitespace-separated tokens)
    - Type-token ratio (distinct tokens / total tokens)
    - Punctuation ratio (characters that are neither word characters
      nor whitespace, divided by the total character count)

    Apply this to raw text: once punctuation has been stripped (as
    TextCleaner does), the punctuation ratio is always zero.
    """

    # Anything that is not a word character (letter, digit, underscore)
    # and not whitespace is counted as punctuation.
    _PUNCT = re.compile(r'[^\w\s]')

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the features have no learned state."""
        return self

    def transform(self, X, y=None):
        """
        Compute the feature matrix.

        Parameters
        ----------
        X : iterable of str
            Documents. Missing entries (None / NaN) are treated as empty
            documents; other non-string entries are converted with ``str()``.
        y : ignored

        Returns
        -------
        numpy.ndarray of shape (n_documents, 3)
            Columns are [average word length, type-token ratio,
            punctuation ratio]. An empty document yields a row of zeros,
            and an empty input yields an array of shape (0, 3).
        """
        rows = []
        for doc in X:
            if not isinstance(doc, str):
                doc = '' if pd.isna(doc) else str(doc)

            tokens = doc.split()                     # split text on whitespace
            if tokens:
                n_tokens = len(tokens)
                avg_len = sum(len(tok) for tok in tokens) / n_tokens
                ttr = len(set(tokens)) / n_tokens
            else:
                # No tokens: define both ratios as zero instead of dividing by zero.
                avg_len = 0.0
                ttr = 0.0

            punct_ratio = len(self._PUNCT.findall(doc)) / len(doc) if doc else 0.0
            rows.append([avg_len, ttr, punct_ratio])

        # reshape keeps the result 2-D even when there are no documents,
        # so it can always be stacked next to other feature blocks.
        return np.array(rows, dtype=float).reshape(-1, 3)

In [None]:
# ------------------------------------------------------------------------------
# 3. Function to load data from a CSV file
# ------------------------------------------------------------------------------
def load_data(path: str) -> pd.DataFrame:
    """
    Load labeled data from a CSV file.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, guaranteed to contain 'text' and 'author' columns.

    Raises
    ------
    ValueError
        If either the 'text' or the 'author' column is absent.
    """
    df = pd.read_csv(path)

    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, which would silently disable this validation.
    missing = [col for col in ('text', 'author') if col not in df.columns]
    if missing:
        raise ValueError(
            f"CSV file must contain 'text' and 'author' columns (missing: {missing})"
        )
    return df

In [None]:
# ------------------------------------------------------------------------------
# 4. Main function: training, evaluation, and model saving
# ------------------------------------------------------------------------------
def _build_pipeline():
    """Assemble the model: TF-IDF on cleaned text + stylometrics on raw text + classifier."""
    return Pipeline([
        ('features', FeatureUnion([
            # The cleaner lives inside the TF-IDF branch only. If it ran before
            # the union, the stylometric branch would receive lowercased,
            # punctuation-free text and its punctuation ratio would always be 0.
            ('tfidf', Pipeline([
                ('cleaner', TextCleaner()),
                ('vectorizer', TfidfVectorizer(
                    ngram_range=(1, 2),   # use unigrams and bigrams
                    max_features=20000    # limit number of features
                )),
            ])),
            ('stylometric', StylometricFeatures()),
        ])),
        ('classifier', LogisticRegression(
            max_iter=1000,
            random_state=42
        ))
    ])


def _print_evaluation(title, y_true, y_pred):
    """Print a titled classification report followed by the confusion matrix."""
    print(title)
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


def main(data_path='author_data.csv', model_path='author_classifier.pkl'):
    """
    Train, evaluate and persist the author classifier.

    Parameters
    ----------
    data_path : str, default 'author_data.csv'
        CSV file with 'text' and 'author' columns.
    model_path : str, default 'author_classifier.pkl'
        Destination of the fitted pipeline written with ``joblib.dump``.

    The data is split 70% / 15% / 15% into train / validation / test sets,
    stratified by author. Reports for the validation and test sets are
    printed to stdout.

    NOTE: the saved pipeline contains the custom transformer classes defined
    in this file; they presumably must be importable/defined wherever the
    file is loaded again - verify before deploying.
    """
    # Set random seed for reproducibility
    np.random.seed(42)

    # 4.1 Load the data; rows without text or label cannot be used for
    # training or for the stratified split, so drop them up front.
    raw = load_data(data_path)
    data = raw.dropna(subset=['text', 'author'])
    n_dropped = len(raw) - len(data)
    if n_dropped:
        print(f"Dropped {n_dropped} rows with missing text or author")
    X = data['text']
    y = data['author']

    # 4.2 Split data into train (70%), validation (15%), and test (15%)
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, stratify=y, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
    )

    # 4.3 Build the pipeline: feature extraction + classifier
    pipeline = _build_pipeline()

    # 4.4 Train the model
    pipeline.fit(X_train, y_train)

    # 4.5 Evaluate on validation set
    _print_evaluation("=== Validation Results ===", y_val, pipeline.predict(X_val))

    # 4.6 Evaluate on test set
    _print_evaluation("\n=== Test Results ===", y_test, pipeline.predict(X_test))

    # 4.7 Save the trained pipeline to disk
    joblib.dump(pipeline, model_path)
    print(f"\nModel saved to {model_path}")

In [None]:
# Script entry point: run the full train / evaluate / save workflow when this
# file is executed as the main module.
if __name__ == '__main__':
    main()