In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

from my_utils import *
from my_utils import identity, load_model, read_data

In [2]:

# Per-domain baseline artefacts, each dict keyed by domain name.
source_domains = ["wiki", "news", "religious"]
models = {}
vectorizers = {}
results = {}
txts = {}
golds = {}

for domain in source_domains:
    # Dev split of this domain; texts and gold labels are kept for later cells.
    txts_dev, golds_dev = read_data(f"../langid4/data/domain.0.{domain}.dev")
    txts[domain], golds[domain] = txts_dev, golds_dev

    # Stored classifier and its matching vectorizer for this domain.
    nb = load_model(f"../models/naive_bayes/{domain}")
    vectorizer = load_model(f"../models/naive_bayes/vectorizers/{domain}")
    models[domain], vectorizers[domain] = nb, vectorizer

    x_dev = vectorizer.transform(txts_dev)

    # Predictions are normalised to plain strings before being tabulated.
    pred = [str(label) for label in nb.predict(x_dev)]

    df = pd.DataFrame({"txt":txts_dev,"gold":golds_dev,"pred":pred})
    results[domain] = df


### Cross-domain feature selection approach

This approach works, but it yields slightly lower accuracy than the baseline model that uses the 'combined' data.

In [3]:
def get_common_features(vectorizers):
    """Return the feature names shared by every vectorizer, in sorted order.

    Args:
        vectorizers: Mapping of domain name to a fitted vectorizer that
            exposes ``get_feature_names_out()``.

    Returns:
        A sorted list of the feature names present in all vectorizers, or an
        empty list when ``vectorizers`` is empty.
    """
    all_feature_sets = [set(vec.get_feature_names_out()) for vec in vectorizers.values()]

    # set.intersection() needs at least one operand, so guard the empty mapping.
    if not all_feature_sets:
        return []

    # Sort so that anything derived from this list (e.g. column indices) does
    # not depend on set iteration order, which is not stable across runs.
    return sorted(set.intersection(*all_feature_sets))

def create_domain_agnostic_vectorizer(vectorizers):
    """Build a CountVectorizer whose vocabulary is the features shared by all domains.

    The new vectorizer inherits the CountVectorizer-compatible settings
    (analyzer, n-gram range, tokenizer, ...) of the first source vectorizer,
    so texts are split into features the same way the shared vocabulary was
    produced. With source vectorizers that use default settings this is
    identical to a plain ``CountVectorizer()``.

    Args:
        vectorizers: Mapping of domain name to fitted vectorizer.

    Returns:
        An unfitted ``CountVectorizer`` with a fixed vocabulary. The
        vocabulary is passed through the constructor, so ``vocabulary_`` is
        materialised on the first ``transform`` / ``get_feature_names_out``.
    """
    # Sorted so that every run assigns the same column index to a feature.
    common_features = sorted(get_common_features(vectorizers))
    vocabulary = {feature: idx for idx, feature in enumerate(common_features)}

    # Copy only the parameters CountVectorizer itself understands; anything
    # else the source estimator carries is ignored.
    # NOTE(review): assumes all source vectorizers share the same settings.
    reference = next(iter(vectorizers.values()), None)
    supported = CountVectorizer().get_params()
    params = {}
    if reference is not None:
        params = {
            name: value
            for name, value in reference.get_params().items()
            if name in supported
        }
    params["vocabulary"] = vocabulary

    return CountVectorizer(**params)

def train_domain_agnostic_model(vectorizers, models):
    """Fit a MultinomialNB on the pooled source domains using only shared features.

    Texts and labels are read from the notebook-level ``txts`` and ``golds``
    dicts for every domain present in ``vectorizers``.

    NOTE(review): ``txts`` / ``golds`` are filled from ``.dev`` files and the
    evaluation cells also score on ``.dev`` files, so for the source domains
    the model is evaluated on data it was fitted on - confirm this is intended.

    Args:
        vectorizers: Mapping of domain name to fitted vectorizer; its keys
            select which domains contribute training data.
        models: Unused; kept so existing calls keep working.

    Returns:
        Tuple ``(domain_agnostic_vectorizer, domain_agnostic_model)``.

    Raises:
        KeyError: If a domain in ``vectorizers`` has no entry in ``txts`` or
            ``golds``.
    """
    domain_agnostic_vectorizer = create_domain_agnostic_vectorizer(vectorizers)

    combined_texts = []
    combined_labels = []

    # Iterate over the domains that were passed in rather than the global
    # `source_domains`: other cells rebind that list to include "combined",
    # which has no entry in `txts` / `golds`.
    for domain in vectorizers:
        combined_texts.extend(txts[domain])
        combined_labels.extend(golds[domain])

    x_combined = domain_agnostic_vectorizer.transform(combined_texts)

    domain_agnostic_model = MultinomialNB()
    domain_agnostic_model.fit(x_combined, combined_labels)

    return domain_agnostic_vectorizer, domain_agnostic_model


# Build the shared-vocabulary vectorizer and the classifier fitted on top of it.
vectorizer, model = train_domain_agnostic_model(vectorizers, models)


In [None]:
# Quick smoke test: vectorize a single sentence and display the label that
# `model` predicts for it.
transformed = vectorizer.transform(["Hello, my name's jose"])

model.predict(transformed)

In [None]:
# Number of features in the domain-agnostic vectorizer's vocabulary.
len(vectorizer.get_feature_names_out())

In [None]:
# Number of features in the "news" vectorizer's vocabulary.
len(vectorizers["news"].get_feature_names_out())

In [None]:
# Score the shared-vocabulary model (`vectorizer` / `model`) on the dev split
# of every test domain and collect the accuracies in `scores`.
#
# `source_domains` is deliberately not rebound in this cell: `txts` / `golds`
# only hold the three domains loaded in the first code cell, so adding
# "combined" here would raise KeyError in any later code that indexes those
# dicts by `source_domains`.
test_domains = ["wiki", "news", "religious", "rights", "social"]
seed_number = 0
scores = []


for test_domain in test_domains:
    dev_path = f"../langid4/data/domain.{seed_number}.{test_domain}.dev"
    txts_dev, golds_dev = read_data(dev_path)
    x_dev = vectorizer.transform(txts_dev)

    y_pred = model.predict(x_dev)
    accuracy = accuracy_score(golds_dev, y_pred)
    scores.append(accuracy)
    print(f"Agnostic / {test_domain}: {accuracy}")

print(scores)

#                wiki   news   reli   comb
# avgs           87.64, 86.54, 85.18, 91.82
# avgs_glot500   83.14, 81.82, 81.16, 89.48

# agnostic avg   80.44

### Feature weight based approach

In [None]:
def aggregate_weights(vectorizers, models):
    """Average each feature's log-probability over all classes and domains.

    Args:
        vectorizers: Mapping of domain name to a vectorizer exposing
            ``get_feature_names_out()``.
        models: Mapping of domain name to a model exposing
            ``class_log_prior_`` and ``feature_log_prob_``.

    Returns:
        Dict mapping feature name to the mean of every
        ``feature_log_prob_[class][feature]`` value collected for that name,
        in order of first appearance.
    """
    collected = {}

    for domain, vec in vectorizers.items():
        names = vec.get_feature_names_out()
        clf = models[domain]
        n_classes = len(clf.class_log_prior_)
        log_probs = clf.feature_log_prob_

        # One entry per (class, feature) pair, classes in the outer loop.
        for row in range(n_classes):
            for col, name in enumerate(names):
                value = log_probs[row][col]
                collected.setdefault(name, []).append(value)

    return {name: np.mean(values) for name, values in collected.items()}

def create_domain_agnostic_weights(vectorizers, models, top_n=1000):
    """Fit a MultinomialNB restricted to the highest-ranked features.

    Features are ranked by the absolute value of their averaged
    log-probability (see ``aggregate_weights``) and the first ``top_n`` form
    the vocabulary of a new vectorizer. Training texts and labels are read
    from the notebook-level ``txts`` and ``golds`` dicts for every domain
    present in ``vectorizers``.

    NOTE(review): log-probabilities are non-positive, so ranking by absolute
    value favours the features with the lowest average log-probability -
    confirm that this is the intended notion of importance.

    Args:
        vectorizers: Mapping of domain name to fitted vectorizer; its keys
            select which domains contribute training data.
        models: Mapping of domain name to fitted model, used for the feature
            log-probabilities.
        top_n: Number of top-ranked features to keep (default 1000, the value
            that was previously hard-coded). ``None`` keeps every feature.

    Returns:
        Tuple ``(domain_agnostic_vectorizer, domain_agnostic_model)``.

    Raises:
        IndexError: If ``vectorizers`` is empty.
        KeyError: If a domain in ``vectorizers`` has no entry in ``txts`` or
            ``golds``.
    """
    avg_feature_weights = aggregate_weights(vectorizers, models)

    important_features = sorted(
        avg_feature_weights.items(),
        key=lambda item: abs(item[1]),
        reverse=True,
    )
    selected_features = [feat for feat, _ in important_features[:top_n]]
    vocabulary = {feature: idx for idx, feature in enumerate(selected_features)}

    # Rebuild the same kind of vectorizer with the same settings (analyzer,
    # n-gram range, tokenizer, ...) as the first source vectorizer, swapping in
    # the selected vocabulary. A bare `__class__()` would fall back to default
    # tokenisation, which need not match how the selected features were made.
    # NOTE(review): assumes all source vectorizers share the same settings.
    reference = list(vectorizers.values())[0]
    params = reference.get_params()
    params["vocabulary"] = vocabulary
    domain_agnostic_vectorizer = type(reference)(**params)

    combined_texts = []
    combined_labels = []

    # Iterate over the domains that were passed in rather than the global
    # `source_domains`: an earlier cell rebinds that list to include
    # "combined", which has no entry in `txts` / `golds` and raised KeyError.
    for domain in vectorizers:
        combined_texts.extend(txts[domain])
        combined_labels.extend(golds[domain])

    domain_agnostic_model = MultinomialNB()
    x_combined = domain_agnostic_vectorizer.transform(combined_texts)
    domain_agnostic_model.fit(x_combined, combined_labels)

    return domain_agnostic_vectorizer, domain_agnostic_model


# Build the weight-selected vectorizer and the classifier fitted on top of it.
vectorizer_b, model_b = create_domain_agnostic_weights(vectorizers, models)



In [None]:
# Score the weight-selected model (`vectorizer_b` / `model_b`) on the dev split
# of every test domain and collect the accuracies in `scores`.

# NOTE(review): `source_domains` is rebound here but never read in this cell.
# "combined" has no entry in the `txts` / `golds` dicts built by the first
# code cell, so code that indexes them by `source_domains` will raise KeyError
# once this has run - confirm a later cell needs it, otherwise drop it.
source_domains = ["wiki", "news", "religious", "combined"]
test_domains = ["wiki", "news", "religious", "rights", "social"]
seed_number = 0
scores = []


for test_domain in test_domains:
    dev_path = f"../langid4/data/domain.{seed_number}.{test_domain}.dev"
    txts_dev, golds_dev = read_data(dev_path)
    x_dev = vectorizer_b.transform(txts_dev)

    y_pred = model_b.predict(x_dev)
    accuracy = accuracy_score(golds_dev, y_pred)
    scores.append(accuracy)
    print(f"Agnostic / {test_domain}: {accuracy}")

print(scores)