In [None]:
pip install pandas==2.2.3 scikit-learn==1.6.1 matplotlib==3.10.1 joblib==1.4.2 seaborn==0.13.2

In [None]:
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import time

class ImprovedLanguageClassifier:
    """Character n-gram language classifier wrapped around a scikit-learn Pipeline.

    The pipeline has two steps: a character-level vectorizer (TF-IDF or raw
    counts) named ``'vectorizer'`` and a classifier named ``'classifier'``
    (``LinearSVC`` when ``model_type == 'svm'``, ``MultinomialNB`` otherwise).

    Attributes:
        vectorizer: The vectorizer instance used as the first pipeline step.
        classifier: The classifier instance used as the second pipeline step.
        model: The ``Pipeline`` combining both steps.
        languages: Sorted list of labels seen in the last ``fit`` call
            (``None`` before fitting).
        model_type: The ``model_type`` value passed to the constructor.
        class_prior: Class prior used by the Naive Bayes classifier; either
            the value supplied by the caller or the one derived in ``fit``.
    """

    def __init__(self,
                 ngram_range=(1, 4),
                 alpha=0.05,
                 max_features=10000,
                 use_tfidf=True,
                 model_type='naive_bayes',
                 class_prior=None):
        """Build the vectorizer, the classifier and the pipeline.

        Args:
            ngram_range: ``(min_n, max_n)`` range of character n-grams.
            alpha: Smoothing parameter passed to ``MultinomialNB``.
            max_features: Upper bound on the vocabulary size.
            use_tfidf: Use ``TfidfVectorizer`` (sublinear TF) when true,
                ``CountVectorizer`` otherwise.
            model_type: ``'svm'`` selects ``LinearSVC``; any other value
                selects ``MultinomialNB``.
            class_prior: Optional explicit class prior for ``MultinomialNB``.
                When ``None``, ``fit`` derives one from the training labels.
        """
        if use_tfidf:
            self.vectorizer = TfidfVectorizer(
                analyzer='char',
                ngram_range=ngram_range,
                lowercase=True,
                max_features=max_features,
                sublinear_tf=True
            )
        else:
            self.vectorizer = CountVectorizer(
                analyzer='char',
                ngram_range=ngram_range,
                lowercase=True,
                max_features=max_features
            )

        if model_type == 'svm':
            self.classifier = LinearSVC(C=1.0, class_weight='balanced')
        else:
            self.classifier = MultinomialNB(alpha=alpha, class_prior=class_prior)

        self.model = Pipeline([
            ('vectorizer', self.vectorizer),
            ('classifier', self.classifier)
        ])

        self.languages = None
        self.model_type = model_type
        self.class_prior = class_prior
        # Remember whether the prior must be derived from the data, so that a
        # later refit recomputes it instead of reusing a stale value.
        self._auto_class_prior = class_prior is None

    def fit(self, X, y):
        """Fit the pipeline on texts ``X`` with labels ``y``.

        For the Naive Bayes model without a caller-supplied prior, a class
        prior inversely proportional to each label's frequency in ``y`` is
        computed (normalised to sum to 1, ordered by sorted label) and applied
        to the classifier before fitting.

        Returns:
            ``self``, to allow call chaining.
        """
        # Instances restored from files written before this flag existed fall
        # back to the previous rule (derive only when no prior is stored yet).
        auto_prior = getattr(self, '_auto_class_prior', self.class_prior is None)

        # Branch on 'svm' exactly like __init__ does, so every non-SVM
        # model_type is consistently treated as Naive Bayes.
        if auto_prior and self.model_type != 'svm':
            class_counts = pd.Series(y).value_counts()
            total = len(y)
            class_weights = {cls: total / count for cls, count in class_counts.items()}
            sum_weights = sum(class_weights.values())
            self.class_prior = np.array(
                [class_weights[cls] / sum_weights for cls in sorted(class_weights)]
            )
            self._auto_class_prior = True
            # Pipeline.named_steps returns a fresh mapping on every access, so
            # assigning into it never reaches the real step. set_params updates
            # the classifier that the pipeline will actually fit.
            self.model.set_params(classifier__class_prior=self.class_prior)

        self.model.fit(X, y)
        self.languages = list(sorted(set(y)))
        return self

    def predict(self, X):
        """Return the pipeline's predicted label for each text in ``X``."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return per-class scores for each text in ``X``.

        For Naive Bayes this is the pipeline's ``predict_proba``. For the SVM
        the decision-function values are passed through a softmax; those
        numbers sum to 1 per row but are not calibrated probabilities.
        """
        if self.model_type != 'svm':
            return self.model.predict_proba(X)

        decision_values = self.model.decision_function(X)
        if decision_values.ndim == 1:
            # Binary case: a single margin per sample; build two columns.
            decision_values = np.column_stack([-decision_values, decision_values])

        # Subtract the row maximum before exponentiating for numerical stability.
        exp_scores = np.exp(decision_values - np.max(decision_values, axis=1, keepdims=True))
        proba = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        return proba

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and compare against ``y_test``.

        Returns:
            dict with keys ``'accuracy'``, ``'report'`` (text classification
            report), ``'confusion_matrix'`` and ``'y_pred'``.
        """
        y_pred = self.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        return {
            'accuracy': accuracy,
            'report': report,
            'confusion_matrix': cm,
            'y_pred': y_pred
        }

    def save_model(self, filepath):
        """Serialize this object to ``filepath`` with joblib.

        The parent directory is created when ``filepath`` contains one.
        """
        directory = os.path.dirname(filepath)
        # A bare file name has an empty dirname, and os.makedirs('') raises.
        if directory:
            os.makedirs(directory, exist_ok=True)
        joblib.dump(self, filepath)
        print(f"Model saved to {filepath}")

    @classmethod
    def load_model(cls, filepath):
        """Load and return an object previously written by ``save_model``."""
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load files from a trusted source.
        return joblib.load(filepath)

    def get_model_size(self):
        """Return the number of features in the fitted vectorizer's vocabulary."""
        return len(self.model.named_steps['vectorizer'].get_feature_names_out())


def find_optimal_parameters(X_train, y_train, X_test, y_test):
    """Grid-search a char-level TF-IDF + MultinomialNB pipeline.

    Runs a 5-fold cross-validated grid search (accuracy scoring, all cores)
    over the n-gram range, vocabulary size and smoothing strength, prints the
    best parameters together with the cross-validation and test accuracies,
    and returns the winner.

    Args:
        X_train, y_train: Data the grid search is fitted on.
        X_test, y_test: Data used only to report the best estimator's score.

    Returns:
        Tuple ``(best_params, best_estimator)`` taken from the grid search.
    """
    print("Finding optimal parameters...")

    char_tfidf = TfidfVectorizer(analyzer='char', lowercase=True, sublinear_tf=True)
    candidate_pipeline = Pipeline([
        ('vectorizer', char_tfidf),
        ('classifier', MultinomialNB()),
    ])

    # Keys use the "<step>__<param>" convention to address pipeline steps.
    search_space = dict(
        vectorizer__ngram_range=[(1, 3), (1, 4), (1, 5)],
        vectorizer__max_features=[5000, 10000],
        classifier__alpha=[0.001, 0.01, 0.05, 0.1, 0.5],
    )

    searcher = GridSearchCV(
        candidate_pipeline,
        search_space,
        cv=5,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
    )
    searcher.fit(X_train, y_train)

    winning_params = searcher.best_params_
    winner = searcher.best_estimator_

    print(f"Best parameters: {winning_params}")
    print(f"Best cross-validation accuracy: {searcher.best_score_:.4f}")
    print(f"Test set accuracy: {winner.score(X_test, y_test):.4f}")

    return winning_params, winner


def main():
    """Run the end-to-end training workflow and print progress to stdout.

    Steps: read the CSV, drop rows with missing values, blank words and
    duplicates, print the label distribution, make a stratified 80/20 split,
    train an ``ImprovedLanguageClassifier``, save it, evaluate it on the test
    split and classify a few hand-picked words. Any exception is reported
    (message plus traceback) instead of propagating.
    """
    try:
        print("Loading dataset...")
        raw = pd.read_csv('cleaned_dataset_with_translit_error.csv')
        print(f"Loaded dataset with {len(raw)} records")

        # Cleaning: missing values first, then blank words, then duplicates.
        without_nulls = raw.dropna()
        has_text = without_nulls['word'].str.strip() != ''
        data = without_nulls[has_text].drop_duplicates()
        n_records = len(data)
        print(f"After cleaning {n_records} records remain")

        print("\nLanguage distribution in dataset:")
        for lang, count in data['language'].value_counts().items():
            print(f"{lang}: {count} words ({count/n_records*100:.1f}%)")

        words = data['word']
        labels = data['language']
        X_train, X_test, y_train, y_test = train_test_split(
            words,
            labels,
            test_size=0.2,
            random_state=42,
            stratify=labels,
        )

        print(f"\nData split into training ({len(X_train)} words) and test ({len(X_test)} words) sets")

        print("\nTraining improved model...")
        improved_classifier = ImprovedLanguageClassifier(
            ngram_range=(1, 4),
            alpha=0.05,
            max_features=10000,
            use_tfidf=True,
        )

        started_at = time.time()
        improved_classifier.fit(X_train, y_train)
        elapsed = time.time() - started_at
        print(f"Training completed in {elapsed:.2f} seconds")

        improved_classifier.save_model("models/improved_language_classifier.joblib")

        print("\nEvaluating improved model on test set:")
        evaluation = improved_classifier.evaluate(X_test, y_test)

        print(f"Accuracy: {evaluation['accuracy']:.4f}")
        print("\nClassification report:")
        print(evaluation['report'])

        print("\nTesting on problematic words:")
        for word in ("kişi", "qadın", "koynek", "hello", "привет"):
            sample = [word]
            lang = improved_classifier.predict(sample)[0]
            # Confidence is the largest per-class score, shown as a percentage.
            top_prob = max(improved_classifier.predict_proba(sample)[0]) * 100
            print(f"Word: '{word}' -> Language: {lang} (confidence: {top_prob:.2f}%)")

    except Exception as err:
        # Report the failure with its traceback rather than letting it propagate.
        print(f"Error: {err}")
        import traceback
        traceback.print_exc()


# Run the workflow when executed as a script or as a notebook cell.
if __name__ == "__main__":
    main()

Loading dataset...
Loaded dataset with 2140589 records
After cleaning 2140588 records remain

Language distribution in dataset:
ru: 1528909 words (71.4%)
en: 463533 words (21.7%)
az: 148146 words (6.9%)

Data split into training (1712470 words) and test (428118 words) sets

Training improved model...
