In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/south-african-language-identification-hack-2023/sample_submission.csv
/kaggle/input/south-african-language-identification-hack-2023/test_set.csv
/kaggle/input/south-african-language-identification-hack-2023/train_set.csv


# Importing dependencies

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
%matplotlib inline
import nltk
import re
import string
nltk.download('stopwords')
from nltk.corpus import stopwords
%timeit
!pip install nlppreprocess
#Loading Datasets,
#Performing an EDA,
#Data Processing and Engineering,
#Model development through the use of Classifiers and F1 scores,
#Training Model,
#Submission of Model to Kaggle.

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Collecting nlppreprocess
  Downloading nlppreprocess-1.0.2-py3-none-any.whl (5.1 kB)
Installing collected packages: nlppreprocess
Successfully installed nlppreprocess-1.0.2
[0m

# Setting Constants

In [3]:
Const = 31 #random state

# Importing Data

In [4]:
df = pd.read_csv("/kaggle/input/south-african-language-identification-hack-2023/train_set.csv")
df_proxy = df.copy() #creating a backup dataset for EDA
df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


Train data has 2 rows and 33000 columns

In [5]:
df.shape

(33000, 2)

There are no null entries in the dataset.

In [6]:
df.isnull().sum()

lang_id    0
text       0
dtype: int64

The dataset has two columns titled `lang_id` and `text` respectively.

In [7]:
df.columns


Index(['lang_id', 'text'], dtype='object')

Setting generic variables.
 - Setting `lang_id` as `label` 
 - Setting `text` as `feature`


In [8]:
label = 'lang_id'
ft = 'text'

There are 11 different classificaton groups.

In [9]:
class_count = df[label].unique()
len(class_count)

11

In [10]:
df[label].describe()

count     33000
unique       11
top         xho
freq       3000
Name: lang_id, dtype: object

# Data Preprocessing


In [11]:
df_proxy['len'] = [len(text) for text in df_proxy[ft]]


In [12]:
stopword=set(stopwords.words('english'))  

def clean(text):
    text = str(text).lower() 
    text = re.sub(r'[-]',' ',text)
    text = re.sub(r'[_]', ' ', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub('[0-9]+', '', text)
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text =" ".join(text)
    return text

df_proxy['clean'] = df_proxy['text'].apply(clean)


In [13]:
df_proxy.head()

Unnamed: 0,lang_id,text,len,clean
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,220,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,252,dha iya kuba nobulumko bokubeka umsebenzi naph...
2,eng,the province of kwazulu-natal department of tr...,264,province kwazulu natal department transport in...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,217,netefata gore ba file dilo ka moka te le dumel...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,239,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [14]:
df['clean'] = df[ft].apply(clean)

# Feature Engineering

In [15]:
# Importing Label Encoder, Cross Validator, Hyperparameter Tuners and Train Test Split  
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import KFold

## Importing Models

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import RidgeClassifier

## Preparing The Data For Model Training

In [17]:
# Importing metrics, pipeline and vectorizers
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
# Setting indepedent and target variables separetely
X = df[ft]
y = df[label]

In [19]:
# Encoding the y varialble
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [20]:
# Storing names of models in a list for evaluation
clf_names = ['RidgeClassifier',
          'MultinomialNB']
# Storing models in a list for evaluation
clf_list = [RidgeClassifier(random_state=Const),MultinomialNB()]

In [21]:
# Splitting the data into a train and test set 
# stratifying according to the dependant variable to preserve the proportion of different classes in the 
X_train , X_test , y_train , y_test = train_test_split(X , y, stratify=y,
                                                       test_size =0.5, 
                                                       random_state=Const)

In [22]:
def performance_tester(X_train, y_train, X_test, y_test, classifiers, clf_name):
    model_stats = {}  # Initialize dictionary to store performance statistics

    for name, clf in zip(clf_name, classifiers):

        model = Pipeline([('tfidf', TfidfVectorizer(min_df=0.05, max_df=0.9,
                                    ngram_range=(1, 5),
                                    analyzer='char')),
                          ('clf', clf)])

        model.fit(X_train, y_train)  # Train the model
        result = %timeit -q -o model.fit(X_train, y_train) # Logging a runtime for each model
        model_pred = model.predict(X_test)  # Make predictions on the testing data

        # Compute performance metrics and store in model_stats dictionary
        model_stats[name] = {
            'F1-Macro': metrics.f1_score(y_test, model_pred, average='macro'),
            'F1-Accuracy': metrics.f1_score(y_test, model_pred, average='micro'),
            'F1-Weighted': metrics.f1_score(y_test, model_pred, average='weighted'),
            'Run-Time': result.best

        }

    return pd.DataFrame.from_dict(model_stats, orient='index')

In [23]:
performance_result = performance_tester(X_train ,y_train, X_test, y_test, clf_list, clf_names)
performance_result = performance_result.sort_values('F1-Weighted', ascending=False)

In [24]:
performance_result

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted,Run-Time
RidgeClassifier,0.995577,0.995576,0.995577,28.763383
MultinomialNB,0.993154,0.993152,0.993154,12.871521


In [25]:
performance_result.shape


(2, 4)

In [26]:
#p_proxy = performance_result.copy()
#p_cols = [col for col in p_proxy.columns != "Run-Time"]


# Hyperparameter Tuning

In [27]:
def param_tuner(X_train, y_train, classifiers, clf_name):
    best_params = {}

    for name, clf in zip(clf_name, classifiers):
        model = Pipeline([('tfidf', TfidfVectorizer(max_df=0.9, ngram_range=(1, 5), analyzer='char')),
                          ('clf', clf)])
        model.fit(X_train, y_train)  # Training

        # Get models performing parameters
        params = model.get_params()
        model_params = {}

        for key in params:
            if key.startswith("clf"):
                if len(key) < 5:
                    model_params['model'] = params[key]
            else:
                model_params[key[5:]] = params[key]
        
        best_params[name] = model_params
    
    return best_params

In [28]:
best_params = param_tuner(X_train, y_train, clf_list, clf_names)

## Conducting Grid Search Cross Validation


### Applying Ridge Classifier

In [29]:
#model
rclf = RidgeClassifier()

In [30]:
vect = TfidfVectorizer(max_df=0.9, ngram_range=(1, 5), analyzer= 'char')
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [31]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=Const)

In [32]:
best_params['RidgeClassifier']

{'y': None,
 '': TfidfVectorizer(analyzer='char', max_df=0.9, ngram_range=(1, 5)),
 'se': False,
 'model': RidgeClassifier(random_state=31),
 '__analyzer': 'char',
 '__binary': False,
 '__decode_error': 'strict',
 '__dtype': numpy.float64,
 '__encoding': 'utf-8',
 '__input': 'content',
 '__lowercase': True,
 '__max_df': 0.9,
 '__max_features': None,
 '__min_df': 1,
 '__ngram_range': (1, 5),
 '__norm': 'l2',
 '__preprocessor': None,
 '__smooth_idf': True,
 '__stop_words': None,
 '__strip_accents': None,
 '__sublinear_tf': False,
 '__token_pattern': '(?u)\\b\\w\\w+\\b',
 '__tokenizer': None,
 '__use_idf': True,
 '__vocabulary': None}

In [33]:
alpha = list(np.linspace(0.1,0.5, 5))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator = rclf,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [34]:
grid_search.fit(X_train, y_train)
rclf_prediction = grid_search.predict(X_test)
rclf_cv_score = grid_search.best_score_
rclf_test_score = grid_search.score(X_test, y_test)

In [35]:
print(f'Ridge Classifier Cross-Validation Score: {rclf_cv_score}')
print(f'Ridge Test score: {rclf_test_score}')
grid_search.best_params_    
grid_search.best_estimator_

Ridge Classifier Cross-Validation Score: 0.9992731224217275
Ridge Test score: 0.9990908480065919


### Applying Multinomial Naïve Bayes

In [36]:
#model
mnbclf = MultinomialNB()

In [37]:
best_params['MultinomialNB']

{'y': None,
 '': TfidfVectorizer(analyzer='char', max_df=0.9, ngram_range=(1, 5)),
 'se': False,
 'model': MultinomialNB(),
 '__analyzer': 'char',
 '__binary': False,
 '__decode_error': 'strict',
 '__dtype': numpy.float64,
 '__encoding': 'utf-8',
 '__input': 'content',
 '__lowercase': True,
 '__max_df': 0.9,
 '__max_features': None,
 '__min_df': 1,
 '__ngram_range': (1, 5),
 '__norm': 'l2',
 '__preprocessor': None,
 '__smooth_idf': True,
 '__stop_words': None,
 '__strip_accents': None,
 '__sublinear_tf': False,
 '__token_pattern': '(?u)\\b\\w\\w+\\b',
 '__tokenizer': None,
 '__use_idf': True,
 '__vocabulary': None}

In [38]:
alpha = list(np.linspace(0.1,0.01,5))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= mnbclf,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [39]:
grid_search.fit(X_train, y_train)
mnbclf_prediction = grid_search.predict(X_test)
mnbclf_cv_score = grid_search.best_score_
mnbclf_test_score = grid_search.score(X_test, y_test)

In [40]:
print(f'Multinomial Naive Bayes Cross-Validation Score: {mnbclf_cv_score}')
print(f'Test score: {mnbclf_test_score}')
grid_search.best_params_    
grid_search.best_estimator_

Multinomial Naive Bayes Cross-Validation Score: 0.9997576755911808
Test score: 0.9995150703903073


### Implementing Classifier Stacks

In [41]:
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

In [42]:
df['clean'] = df[ft].apply(clean)
df.head()

Unnamed: 0,lang_id,text,clean
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,dha iya kuba nobulumko bokubeka umsebenzi naph...
2,eng,the province of kwazulu-natal department of tr...,province kwazulu natal department transport in...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,netefata gore ba file dilo ka moka te le dumel...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [43]:
X = df['clean']
y = df['lang_id']

y = encoder.fit_transform(y)



In [44]:
X_train , X_test , y_train , y_test = train_test_split(X, y,  stratify=y, test_size=0.4, random_state =1)

In [45]:
vect = TfidfVectorizer( max_df=0.9, ngram_range=(1, 5), analyzer= 'char')
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [46]:
multiNB1 = MultinomialNB(alpha=0.1)

estimators1 = [('multiNB1', multiNB1)]
final_est = RidgeClassifier(alpha=0.3)

In [47]:
clf_stack1 = StackingClassifier(estimators = estimators1,
                           final_estimator = final_est,
                           passthrough = True)

In [48]:
clf_stack1.fit(X_train , y_train)

In [49]:
pred_stack1 = clf_stack1.predict(X_test)

In [50]:
model_stats = {}
model_stats['clf_Stack1'] = {
        'F1-Macro':metrics.f1_score(y_test, pred_stack1, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, pred_stack1, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, pred_stack1, average='weighted')}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
clf_Stack1,0.999848,0.999848,0.999848


In [51]:
multiNB1 = MultinomialNB(alpha=0.1)
multiNB2 = MultinomialNB(alpha=0.1)

estimators2 = [('multiNB1', multiNB1), ('multiNB2', multiNB2)]
final_est = RidgeClassifier(alpha=0.3)

In [52]:
clf_stack2 = StackingClassifier(estimators = estimators2,
                           final_estimator = final_est,
                           passthrough = True)

In [53]:
clf_stack2.fit(X_train , y_train)

In [54]:
pred_stack2 = clf_stack2.predict(X_test)

In [55]:
model_stats['clf_Stack2'] = {
        'F1-Macro':metrics.f1_score(y_test, pred_stack2, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, pred_stack2, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, pred_stack2, average='weighted')}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
clf_Stack1,0.999848,0.999848,0.999848
clf_Stack2,0.999848,0.999848,0.999848


In [56]:
dt = pd.read_csv("/kaggle/input/south-african-language-identification-hack-2023/test_set.csv")
dt.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [57]:
dt['clean'] = dt[ft].apply(clean)
dt.head()

Unnamed: 0,index,text,clean
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",mmasepala fa maemo kgethegileng letlelela kgat...
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,tshivhumbeo tshi fana na ngano dza vhathu
3,4,Kube inja nelikati betingevakala kutsi titsini...,kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.,winste op buitelandse valuta


In [58]:
X = dt['clean']
vectorize = vect.transform(X)
dt[label] = clf_stack1.predict(vectorize)

In [59]:
submission = dt[['index', 'lang_id']]
submission.to_csv('Submission.csv',index=False)
submission

Unnamed: 0,index,lang_id
0,1,6
1,2,2
2,3,8
3,4,5
4,5,0
...,...,...
5677,5678,1
5678,5679,3
5679,5680,4
5680,5681,4


In [60]:
dt.clean

0       mmasepala fa maemo kgethegileng letlelela kgat...
1       uzakwaziswa ngokufaneleko nakungafuneka eminye...
2               tshivhumbeo tshi fana na ngano dza vhathu
3       kube inja nelikati betingevakala kutsi titsini...
4                            winste op buitelandse valuta
                              ...                        
5677                                  mark ballot private
5678    ge ka kgetha ka bowena go se omie mofani ka ti...
5679    e ka kopo etsa kgetho ya hao ka hloko hobane h...
5680    tb ke bokudi ba pmb mme morero tla lefella tlh...
5681                 vakatjhela iwebhusayidi yethu ku www
Name: clean, Length: 5682, dtype: object

# Solution Reference

[Frank6496 on GitHub](https://github.com/Frank6496/Language-Identification-Hackathon)