# South African Language Identification

## Imports

In [24]:
!pip install tweet-preprocessor

Collecting tweet-preprocessor
  Downloading tweet_preprocessor-0.6.0-py3-none-any.whl (27 kB)
Installing collected packages: tweet-preprocessor
Successfully installed tweet-preprocessor-0.6.0


In [25]:
import pandas as pd

#pandas and visualizations
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# text cleaning and transformation
import nltk
import re
import string
import preprocessor as p
import string

from collections import Counter
from nltk import bigrams
from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, sent_tokenize, FreqDist
from wordcloud import STOPWORDS
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer 
from wordcloud import WordCloud
# training the model
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Assessing the model
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
#resampling module
from sklearn.utils import resample
#hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

## Load Datasets

In [26]:
# Read the three CSV files from the working directory (train, test, sample submission).
df_train, df_test, df_sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train_set.csv', 'test_set.csv', 'sample_submission.csv')
)

## Preprocessing

In [27]:
# Report the shape of each frame, then preview the first rows of both.
print('Shape of Train Dataset:', df_train.shape)

# This line reports df_test, so it is labelled as the test set.
print('Shape of Test Dataset:', df_test.shape)

display(df_train.head())

display(df_test.head())

Shape of Train Dataset: (33000, 2)
Shape of Train Dataset: (5682, 2)


Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [28]:
# Print a summary of df_train: index range, columns with non-null counts and dtypes, memory usage.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [29]:
# Count missing values per column (isna is the same method as isnull).
df_train.isna().sum()

lang_id    0
text       0
dtype: int64

In [30]:
# Characters that are deleted outright from every message.
_STRIP_CHARS = ".;:!'?,\"|()[]%$><{}"

# Remove special characters.
# The pattern is built with re.escape so the source no longer contains invalid string
# escapes, which newer Python versions warn about in non-raw literals.
rnp = re.compile("[" + re.escape(_STRIP_CHARS) + "]")
# Separators that are replaced by a single space. Raw string, same pattern text as before.
rwp = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")


# Clean dataset
def clean_msg(df):
    """Return a cleaned copy of every message in ``df``.

    Parameters
    ----------
    df : iterable of str
        The messages to clean. Despite the name, this is iterated element by
        element (the notebook passes a DataFrame text column).

    Returns
    -------
    list of str
        One cleaned string per input message, in input order.

    Each message is passed through ``p.clean`` (tweet-preprocessor), lower-cased,
    stripped of the characters matched by ``rnp``, and then has every match of
    ``rwp`` replaced by a space.
    """
    # Inside this function rnp has already removed ':', '<' and '>', so only the '-' and
    # '/' alternatives of rwp can still match here.
    # NOTE(review): the notebook's displayed output shows "netefatša" becoming "netefata",
    # so one of these steps (apparently p.clean) drops non-ASCII letters - confirm that
    # this is intended for language identification.
    return [
        rwp.sub(" ", rnp.sub("", p.clean(msg).lower()))
        for msg in df
    ]

In [36]:
# Clean every training message and wrap the resulting list in a single-column DataFrame.
df_train_clean = pd.DataFrame(clean_msg(df_train["text"]))
df_train_clean


Unnamed: 0,0
0,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,i dha iya kuba nobulumko bokubeka umsebenzi na...
2,the province of kwazulu natal department of tr...
3,o netefata gore o ba file dilo ka moka te le d...
4,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...
32995,popo ya dipolateforomo tse ke go tlisa boetele...
32996,modise mosadi na o ntse o sa utlwe hore thaban...
32997,closing date for the submission of completed t...
32998,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [38]:
# Overwrite the raw text with the cleaned text (column 0 is the only column of df_train_clean).
df_train["text"] = df_train_clean[0]
df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo siseko wenza amalungiselelo kumaziko ax...
1,xho,i dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu natal department of tr...
3,nso,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


## Models

In [39]:
#Splitting features and target variables
X = df_train['text']
y = df_train['lang_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [40]:
# TF-IDF over character 4-grams and 5-grams.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(4, 5))
# Count vectorizer with default settings.
cf = CountVectorizer()

### Logistic Regression Model

In [47]:
# Logistic regression with balanced class weights.
lr = LogisticRegression(C=1, class_weight='balanced', max_iter=1000)

# Pipeline does not clone its steps, so passing the shared `tfidf` object would make every
# pipeline in the notebook refit and reuse one vectorizer. Build an unfitted copy with
# identical settings so this model owns its own fitted vocabulary.
clf_lr = Pipeline([
    ('tfidf', TfidfVectorizer(**tfidf.get_params())),
    ('clf', lr),
])
clf_lr.fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)

# Per-language precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       586
         eng       1.00      1.00      1.00       594
         nbl       1.00      1.00      1.00       604
         nso       1.00      1.00      1.00       596
         sot       1.00      1.00      1.00       584
         ssw       1.00      1.00      1.00       628
         tsn       1.00      1.00      1.00       608
         tso       1.00      1.00      1.00       602
         ven       1.00      1.00      1.00       622
         xho       1.00      1.00      1.00       594
         zul       0.99      0.99      0.99       582

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



### Random Forest Classifier

In [44]:
# Random forest of 100 trees of depth 5. random_state is fixed (same seed as the
# train/test split) so that re-running the cell reproduces the same forest and report.
rf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=100)

# Pipeline does not clone its steps, so passing the shared `tfidf` object would make every
# pipeline in the notebook refit and reuse one vectorizer. Build an unfitted copy with
# identical settings so this model owns its own fitted vocabulary.
clf_rf = Pipeline([
    ('tfidf', TfidfVectorizer(**tfidf.get_params())),
    ('clf', rf),
])
clf_rf.fit(X_train, y_train)
y_pred_rf = clf_rf.predict(X_test)

# Per-language precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       586
         eng       0.98      1.00      0.99       594
         nbl       0.88      0.74      0.80       604
         nso       0.97      0.99      0.98       596
         sot       0.99      0.99      0.99       584
         ssw       0.99      0.96      0.98       628
         tsn       0.99      0.97      0.98       608
         tso       1.00      1.00      1.00       602
         ven       1.00      1.00      1.00       622
         xho       0.77      0.84      0.80       594
         zul       0.70      0.75      0.73       582

    accuracy                           0.93      6600
   macro avg       0.93      0.93      0.93      6600
weighted avg       0.93      0.93      0.93      6600



### KNN Classifier


In [49]:
# 5-nearest-neighbours classifier using the Minkowski metric with p=2.
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)

# Pipeline does not clone its steps, so passing the shared `tfidf` object would make every
# pipeline in the notebook refit and reuse one vectorizer. Build an unfitted copy with
# identical settings so this model owns its own fitted vocabulary.
clf_knn = Pipeline([
    ('tfidf', TfidfVectorizer(**tfidf.get_params())),
    ('clf', knn),
])
clf_knn.fit(X_train, y_train)
y_pred_knn = clf_knn.predict(X_test)

# Per-language precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       586
         eng       0.99      1.00      1.00       594
         nbl       0.89      0.97      0.92       604
         nso       0.97      0.97      0.97       596
         sot       0.96      0.97      0.97       584
         ssw       0.97      0.97      0.97       628
         tsn       0.97      0.95      0.96       608
         tso       1.00      1.00      1.00       602
         ven       1.00      1.00      1.00       622
         xho       0.97      0.95      0.96       594
         zul       0.96      0.88      0.92       582

    accuracy                           0.97      6600
   macro avg       0.97      0.97      0.97      6600
weighted avg       0.97      0.97      0.97      6600



## Submission File

In [53]:
# The models were trained on text that had been passed through clean_msg, so the test
# text must go through the same cleaning before prediction; otherwise the character
# n-grams seen at inference (punctuation, hyphens, ...) differ from those seen in training.
test_text_clean = clean_msg(df_test['text'])
test_pred = clf_knn.predict(test_text_clean)
# NOTE(review): clf_lr scored higher than clf_knn on the hold-out split in the reports
# above - consider using it for the submission.

# One row per test example: the original index plus the predicted language id.
prediction = pd.DataFrame({'index': df_test['index'], 'lang_id': test_pred})

prediction.to_csv('Submission.csv', index=False)
