In [24]:
import joblib
import time
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

class ImprovedLanguageClassifier:
    """Character n-gram text classifier wrapped in a scikit-learn ``Pipeline``.

    The pipeline has two steps: a character-level vectorizer (TF-IDF or raw
    counts) followed by either a ``LinearSVC`` or a ``MultinomialNB``.

    Args:
        ngram_range: ``(min_n, max_n)`` character n-gram sizes passed to the
            vectorizer.
        alpha: Smoothing parameter passed to ``MultinomialNB`` (unused when
            ``model_type == 'svm'``).
        max_features: Vocabulary size cap passed to the vectorizer.
        use_tfidf: If true, use ``TfidfVectorizer`` (with ``sublinear_tf``);
            otherwise use ``CountVectorizer``.
        model_type: ``'svm'`` selects ``LinearSVC``; any other value selects
            ``MultinomialNB``.
        class_prior: Passed to ``MultinomialNB`` (unused when
            ``model_type == 'svm'``).

    Attributes:
        vectorizer: The vectorizer instance used as the first pipeline step.
        classifier: The estimator instance used as the second pipeline step.
        model: The assembled ``Pipeline``.
        languages: Initialised to ``None``; not assigned anywhere else in
            this class.
        model_type: The ``model_type`` argument as given.
        class_prior: The ``class_prior`` argument as given.
    """

    def __init__(self,
                 ngram_range=(1, 4),
                 alpha=0.05,
                 max_features=10000,
                 use_tfidf=True,
                 model_type='naive_bayes',
                 class_prior=None):

        if use_tfidf:
            self.vectorizer = TfidfVectorizer(
                analyzer='char',
                ngram_range=ngram_range,
                lowercase=True,
                max_features=max_features,
                sublinear_tf=True
            )
        else:
            self.vectorizer = CountVectorizer(
                analyzer='char',
                ngram_range=ngram_range,
                lowercase=True,
                max_features=max_features
            )

        # Only 'svm' selects the SVM; every other value falls back to
        # Naive Bayes. predict_proba() relies on the same rule.
        if model_type == 'svm':
            self.classifier = LinearSVC(C=1.0, class_weight='balanced')
        else:
            self.classifier = MultinomialNB(alpha=alpha, class_prior=class_prior)

        self.model = Pipeline([
            ('vectorizer', self.vectorizer),
            ('classifier', self.classifier)
        ])

        self.languages = None
        self.model_type = model_type
        self.class_prior = class_prior

    def predict(self, X):
        """Return the pipeline's predicted label for each item in ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return one row of per-class scores in [0, 1] for each item in ``X``.

        For the Naive Bayes pipeline this is the pipeline's own
        ``predict_proba``. For the SVM pipeline, which is queried through
        ``decision_function`` instead, the rows are pseudo-probabilities
        obtained by applying a softmax to the decision-function scores.
        """
        # Dispatch on the same condition __init__ uses to pick the estimator.
        # The previous check (== 'naive_bayes') sent any other non-'svm'
        # model_type to decision_function, which the Naive Bayes pipeline
        # built for that value does not provide.
        if self.model_type != 'svm':
            return self.model.predict_proba(X)

        decision_values = self.model.decision_function(X)
        if decision_values.ndim == 1:
            # A 1-D result is a single signed score per sample; expand it to
            # two columns so the softmax yields one value per class.
            decision_values = np.column_stack([-decision_values, decision_values])
        return self._softmax(decision_values)

    @staticmethod
    def _softmax(scores):
        """Row-wise softmax of a 2-D score array."""
        # Subtracting the row maximum keeps np.exp from overflowing and does
        # not change the result.
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Load a previously saved classifier and classify a single word, reporting
# the predicted label, the top class score, and how long inference took.
model_path = "models/improved_language_classifier.joblib"
word_to_classify = "acliq"

# NOTE(review): joblib.load unpickles arbitrary Python objects, so only load
# model files from a trusted source. Presumably the saved object is an
# ImprovedLanguageClassifier, which would be why the class is defined in this
# cell before loading -- confirm against the training notebook.
model = joblib.load(model_path)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
predicted_language = model.predict([word_to_classify])[0]
probabilities = model.predict_proba([word_to_classify])[0]
# Confidence is the largest class score, expressed as a percentage.
confidence = np.max(probabilities) * 100
end_time = time.perf_counter()

elapsed_ms = (end_time - start_time) * 1000

print(f"Word: '{word_to_classify}'")
print(f"Language: {predicted_language}")
print(f"Confidence: {confidence:.2f}%")
print(f"Classification time: {elapsed_ms:.2f} ms")

Word: 'acliq'
Language: az
Confidence: 67.99%
Classification time: 3.48 ms
