South African Language Identification
```



### Import Standard Libraries

In [67]:
%timeit
!pip install nlppreprocess



In [68]:
import re
import string

import pandas as pd
import numpy as np

### Load Dataset

In [69]:
# Read the training and test CSV files from the working directory.
data = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

# Working copy of the training frame; it receives the intermediate
# preprocessing columns (length, clean_punc, lower, Text_nonstop).
data_copy = data.copy()

In [70]:
data.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


### Data Preprocessing

In [71]:
data.lang_id.describe()

count     33000
unique       11
top         xho
freq       3000
Name: lang_id, dtype: object

In [72]:
# Number of characters in each raw text sample
data_copy['length'] = data_copy['text'].apply(len)

In [73]:
data.describe()

Unnamed: 0,lang_id,text
count,33000,33000
unique,11,29948
top,xho,ngokwesekhtjheni yomthetho ophathelene nalokhu...
freq,3000,17


## Data Engineering / Cleaning


In [74]:
#Remove Punctuations
def _remove_punc(x):
    """
    Func removes punctuation and ASCII character
    using string.punctuation function
    
    Args:
        data: pandas dataframe
    Return:
        Dataframe: clean tweets
    """
    x = re.sub(r'[-]',' ',x)
    x = re.sub(r'[_]', ' ', x)
    x = re.sub(r'[^\w\s]','',x)
    x = re.sub('[0-9]+', '', x)
    x = re.sub(r'[^\x00-\x7f]',r'', x)
    return x
#Apply the function to the dataset
data_copy['clean_punc'] = data_copy['text'].apply(_remove_punc)

In [75]:
def _lower(x):
    return x.lower()
data_copy['lower'] = data_copy['clean_punc'].apply(_lower)

In [76]:
from nlppreprocess import NLP
nlp = NLP()
nlp.process('couldnt')

'could not'

In [77]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\X475905\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
#Remove Stopwords
stopword = nltk.corpus.stopwords.words('english')
def remove_stopwords(x):
    """
    Remove stop-words (and numbers) from one text to reduce noise.

    The text is run through an nlppreprocess ``NLP`` processor configured
    with replace_words=True, remove_stopwords=True, remove_numbers=True and
    remove_punctuations=False.

    The processor is built once, on the first call, and reused afterwards:
    this function is applied row by row to whole DataFrame columns, and
    constructing a new processor for every row is wasted work because its
    configuration never changes. Re-running the defining cell creates a
    fresh function object and therefore a fresh processor.

    Args:
        x: str, text of one sample
    Return:
        str: the processed text
    """
    # NOTE(review): reuse assumes NLP.process() keeps no per-call state — confirm.
    processor = getattr(remove_stopwords, '_processor', None)
    if processor is None:
        processor = NLP(replace_words=True, remove_stopwords=True,
                        remove_numbers=True, remove_punctuations=False)
        remove_stopwords._processor = processor
    return processor.process(x)
    
data_copy['Text_nonstop'] = data_copy['lower'].apply(lambda x: remove_stopwords(x))

In [79]:
def _analyzer(x):
    """
    Run the full cleaning chain on one text: punctuation/digit/non-ASCII
    stripping first, then lowercasing, then stop-word removal.
    """
    return remove_stopwords(_lower(_remove_punc(x)))

In [80]:
data_copy.head()

Unnamed: 0,lang_id,text,length,clean_punc,lower,Text_nonstop
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,220,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,252,i dha iya kuba nobulumko bokubeka umsebenzi na...,i dha iya kuba nobulumko bokubeka umsebenzi na...,dha iya kuba nobulumko bokubeka umsebenzi naph...
2,eng,the province of kwazulu-natal department of tr...,264,the province of kwazulu natal department of tr...,the province of kwazulu natal department of tr...,province kwazulu natal department transport in...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,217,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,239,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [81]:
data['cleaned'] = data['text'].apply(_analyzer)
test['cleaned'] = test['text'].apply(_analyzer)

### Feature Engineering

In [82]:
# Split into X (independent / feature) and y (dependent / target) variables
X = data['cleaned']
y = data['lang_id']

#### Label Encoding

In [83]:
# from sklearn.preprocessing import LabelEncoder
# lab_enc = LabelEncoder()
# y = lab_enc.fit_transform(y)

### Train Test Split

In [84]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y, stratify=y,
                                                       test_size =0.4, 
                                                       random_state=42)

In [85]:
# Models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier


In [86]:
alg = [LogisticRegression(random_state =42 , max_iter=5000) , 
       MultinomialNB(), LinearSVC(random_state=42), 
       SGDClassifier(random_state=42), RidgeClassifier(random_state=42)]

In [87]:
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

In [88]:
def _performace_assesment(*args , **kwargs):
  model_stats = {}
  for clf in alg:
    model = Pipeline([('tfidf', TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')),
                      ('clf' , clf)
                      ])
    
    model.fit(X_train, y_train) #Training
    model_pred = model.predict(X_test) #Testing

    # Dictionary of Models Performances
    model_stats[clf.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, model_pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, model_pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, model_pred, average='weighted')}
  return pd.DataFrame.from_dict(model_stats, orient='index')


In [89]:
performance = _performace_assesment(alg , X_train , X_test , y_train , y_test)
performance.to_csv('performance.csv')
dataframe = pd.read_csv('performance.csv', index_col = 0)
dataframe.sort_values('F1-Weighted', ascending=False)

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
MultinomialNB,0.999394,0.999394,0.999394
RidgeClassifier,0.999167,0.999167,0.999167
LinearSVC,0.999167,0.999167,0.999167
SGDClassifier,0.999091,0.999091,0.999091
LogisticRegression,0.998183,0.998182,0.998183


## Hyperparameter Tuning

In [90]:
def _param_tuning(*args , **kwargs):
  best_params = {}

  for clf in alg:
    model = Pipeline([('tfidf', TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')),
                      ('clf' , clf)])
    model.fit(X_train, y_train) #Training
    
    #Get models performing parameters
    params = model.get_params()
    model_name = clf.__class__.__name__ 
    model_name = {}
    for key in params:
      if key.startswith("clf"):
        if len(key) < 5:
          model_name['model'] = params[key]
        else:
            model_name[key[5:]] = params[key]
    best_params[clf.__class__.__name__] = model_name
  return best_params

In [91]:
best_params = _param_tuning(alg, X_train, y_train)

In [92]:
#Best parameters
best_params

{'LogisticRegression': {'model': LogisticRegression(max_iter=5000, random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 5000,
  'multi_class': 'auto',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': 42,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'MultinomialNB': {'model': MultinomialNB(),
  'alpha': 1.0,
  'class_prior': None,
  'fit_prior': True},
 'LinearSVC': {'model': LinearSVC(random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': True,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'loss': 'squared_hinge',
  'max_iter': 1000,
  'multi_class': 'ovr',
  'penalty': 'l2',
  'random_state': 42,
  'tol': 0.0001,
  'verbose': 0},
 'SGDClassifier': {'model': SGDClassifier(random_state=42),
  'alpha': 0.0001,
  'average': False,
  'class_weight': None,
  'early_stopping': False,
  'epsilon': 0.1,
  'eta0': 0.0,
  'fit_intercept': 

### GridSearchCV

#### Applying MultinomialNB()

In [93]:
#model
model1 = MultinomialNB()

In [94]:
# Fit the TF-IDF vocabulary on the training split, then reuse it for the test split.
# NOTE: X_train / X_test are rebound here from text to vectorised matrices, so this
# cell cannot be re-run without first re-running the train/test split cell.
# NOTE(review): scikit-learn documents stop_words as applying only when
# analyzer == 'word'; with analyzer='char' it presumably has no effect — confirm.
Vectorize = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')
X_train = Vectorize.fit_transform(X_train)
X_test = Vectorize.transform(X_test)

In [95]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=42)

In [96]:
best_params[alg[1].__class__.__name__]

{'model': MultinomialNB(),
 'alpha': 1.0,
 'class_prior': None,
 'fit_prior': True}

In [97]:
alpha = list(np.linspace(0.1,0.02,4))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= model1,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [98]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [99]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
# Only the last expression of a cell is displayed, so the best parameters are printed explicitly.
print(f'Best params: {grid_search.best_params_}')
grid_search.best_estimator_

Cross-validation score: 0.9996969686252897
Test score: 0.9995455174533855


MultinomialNB(alpha=0.04666666666666667)

#### Applying RidgeClassifier()

In [100]:
#model
model2 = RidgeClassifier()

In [101]:
best_params[alg[4].__class__.__name__]

{'model': RidgeClassifier(random_state=42),
 'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'normalize': 'deprecated',
 'positive': False,
 'random_state': 42,
 'solver': 'auto',
 'tol': 0.001}

In [102]:
alpha = list(np.linspace(0.15,0.4, 5))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= model2,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [103]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [104]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
# Only the last expression of a cell is displayed, so the best parameters are printed explicitly.
print(f'Best params: {grid_search.best_params_}')
grid_search.best_estimator_

Cross-validation score: 0.9994949477404799
Test score: 0.9991666977456909


RidgeClassifier(alpha=0.2125)

In [105]:
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline


In [106]:
# X = data['cleaned']
# y = data['lang_id']

# lab_enc = LabelEncoder()
# y = lab_enc.fit_transform(y)

In [107]:
X_train , X_test , y_train , y_test = train_test_split(X, y,  stratify=y, test_size=0.4, random_state =1)

In [108]:
vect = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(2, 6), analyzer= 'char')
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [109]:
multiNB1 = MultinomialNB(alpha=0.1)
multiNB2 = MultinomialNB(alpha=0.1)

estimators = [('multiNB1', multiNB1), ('multiNB2', multiNB2)]
final_est = RidgeClassifier(alpha=0.2125)

In [110]:
stacking_NB2 = StackingClassifier(estimators = estimators,
                           final_estimator = final_est,
                           passthrough = True)


In [111]:
stacking_NB2.fit(X_train , y_train)

StackingClassifier(estimators=[('multiNB1', MultinomialNB(alpha=0.1)),
                               ('multiNB2', MultinomialNB(alpha=0.1))],
                   final_estimator=RidgeClassifier(alpha=0.2125),
                   passthrough=True)

In [112]:
pred = stacking_NB2.predict(X_test)


In [113]:
model_stats = {}
model_stats[stacking_NB2.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, pred, average='weighted')}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
StackingClassifier,0.999773,0.999773,0.999773


In [114]:
count_vec = CountVectorizer(ngram_range=(3,7), analyzer= 'char')
X_train , X_test , y_train , y_test = train_test_split(X, y, stratify=y,test_size=0.05, random_state =1)
X_train = count_vec.fit_transform(X_train)
X_test = count_vec.transform(X_test)

In [115]:
multiNB1 = MultinomialNB(alpha=0.1)
multiNB2 = MultinomialNB(alpha=0.1)
multiNB3 = MultinomialNB(alpha=0.1)

estimators = [('multiNB1', multiNB1), ('multiNB2', multiNB2), ('multiNB3', multiNB3)]
final_est = RidgeClassifier(alpha=0.2125)

In [116]:
stacking_NB3 = StackingClassifier(estimators = estimators,
                           final_estimator = final_est,
                           passthrough = True)

In [None]:
stacking_NB3.fit(X_train , y_train)

In [None]:
pred = stacking_NB3.predict(X_test)

In [None]:
model_stats = {}
model_stats[stacking_NB3.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, pred, average='weighted')}
pd.DataFrame.from_dict(model_stats, orient='index')

### Kaggle Submission

In [None]:
# Vectorise the cleaned test-set texts with `vect`, the TF-IDF vectoriser that was
# fitted before stacking_NB2 was trained.
# NOTE: this rebinds X (previously the training texts) and Vectorize (previously a
# TfidfVectorizer instance) to test-set objects.
X = test['cleaned']
Vectorize = vect.transform(X)

In [None]:
test['lang_id'] = stacking_NB2.predict(Vectorize)

In [None]:
submission = test[['index', 'lang_id']]
submission.to_csv('Submission.csv',index=False)
submission

In [None]:
test.cleaned

In [None]:
#Reference
#-Sello monabala-kaggle