# Importing the libraries

In [292]:
import os
import nltk
import nltk.corpus
import re
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
import string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer

# Importing the datasets

In [293]:
# Read the labelled training set and the test set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train_set.csv", "test_set.csv"))

In [294]:
train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [295]:
test.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


In [296]:
train.lang_id.value_counts()

nso    3000
ven    3000
sot    3000
ssw    3000
nbl    3000
tsn    3000
afr    3000
eng    3000
xho    3000
tso    3000
zul    3000
Name: lang_id, dtype: int64

# DATA PROCESSING

In [297]:
train.isnull().sum()

lang_id    0
text       0
dtype: int64

In [298]:
train.dropna(inplace=True)

In [299]:
# Show every ASCII punctuation character on a single line, separated by spaces.
print(" ".join(string.punctuation), end=" ")
# Translation table that maps each punctuation code point to None (i.e. deletes it).
translate_table = str.maketrans("", "", string.punctuation)

! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~ 

In [None]:
# Load spaCy's small English pipeline; preprocess() uses it to tokenise and lemmatise.
nlp = spacy.load('en_core_web_sm')
# spaCy's English stop-word collection, consulted by preprocess().
# NOTE(review): this rebinds `stopwords`, hiding the name imported from
# nltk.corpus in the imports cell.
stopwords = STOP_WORDS


def preprocess(train):
    """Lemmatise a text and keep only alphabetic lemmas that are not stop words.

    Parameters
    ----------
    train
        The text to process. Despite the name this is a single text value,
        not the training DataFrame; it is handed straight to the spaCy
        pipeline `nlp`.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives the filter).
    """
    # Run the spaCy pipeline with the NER and parser components switched off.
    parsed = nlp(train, disable=['ner', 'parser'])

    kept = []
    for token in parsed:
        lemma = token.lemma_
        # Skip anything that is not purely alphabetic or that is a stop word.
        if not lemma.isalpha() or lemma in stopwords:
            continue
        kept.append(lemma)

    return ' '.join(kept)

#apply preprocessing to the training text
train['text']= train['text'].apply(preprocess)

In [None]:
y = train['lang_id']
X = train['text']

In [None]:
# TF-IDF features over unigrams and bigrams. Terms that occur in fewer than two
# documents are dropped and scikit-learn's built-in English stop-word list is removed.
# NOTE(review): the vectorizer is fitted on every row of X before the
# train/validation split, so the held-out rows also shape the vocabulary and
# IDF weights.
vectorizer = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)
X_vectorized = vectorizer.fit_transform(X)

## Splitting the datasets

In [None]:
# Hold out 20% of the vectorised rows for validation, using a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the shape of each split, in the same order as they were unpacked.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
test.head()

## Loading in the ML models

### Multinomial Naive Bayes

In [None]:
MNB = MultinomialNB()

In [None]:
# Fit on the training split, then predict labels for the held-out split.
MNB_pred = MNB.fit(X_train, y_train).predict(X_test)

In [None]:
f1_score(y_test, MNB_pred, average="macro")

In [None]:
# Run the test text through the same preprocess() that was applied to the
# training text before vectorising it. The vectorizer's vocabulary was built
# from preprocessed text, so transforming raw text would hand the model
# features that do not line up with what it was trained on.
testx = test['text'].apply(preprocess)
test_vect = vectorizer.transform(testx)

In [None]:
y_pred = MNB.predict(test_vect)

In [None]:
test['lang_id'] = y_pred

In [None]:
test.head()

In [None]:
#params = {'C': [0.1, 0.5, 1, 5, 10]}
#MNB = GridSearchCV(MultinomialNB(max_iter=2000, multi_class='ovr'),
                   #param_grid=params,
                   #scoring=make_scorer(f1_score, average='macro'))

In [None]:
#MNB = MNB.fit(X_train, y_train)

In [None]:
#MNB = MultinomialNB(random_state=0,
                #C=MNB.params['C'])
#MNB.fit(X_train, y_train)
#y_pred = MNB.predict(X_val)

#MNB_tuned = MultinomialNB(random_state=random_state)
#MNB_tuned.fit(X_train, y_train)
#y_pred_tuned = MNB_tuned.predict(X_test)

In [None]:
test[['index','lang_id']].to_csv('MNB2.csv', index=False)

In [None]:
### Linear Support Vector Machine

In [None]:
# Linear SVM with a fixed seed: fit on the training split, then predict the
# held-out split.
LSVC = LinearSVC(random_state=0)
LSVC_pred = LSVC.fit(X_train, y_train).predict(X_test)


In [None]:
f1_score(y_test,LSVC_pred, average="macro")

In [None]:
# Run the test text through the same preprocess() that was applied to the
# training text before vectorising it. The vectorizer's vocabulary was built
# from preprocessed text, so transforming raw text would hand the model
# features that do not line up with what it was trained on.
testx = test['text'].apply(preprocess)
test_vect = vectorizer.transform(testx)

In [None]:
y_pred = LSVC.predict(test_vect)

In [None]:
test['lang_id'] = y_pred

In [None]:
test.head()

In [291]:
#params = {'C': [0.1, 0.5, 1, 5, 10]}
#LSVC = GridSearchCV(LinearSVC(max_iter=4000, multi_class='ovr'),
                   #param_grid=params,
                   #scoring=make_scorer(f1_score, average='macro'))


IndentationError: unexpected indent (<ipython-input-291-3be0c8e2f725>, line 3)

In [None]:
#LSVC = LSVC.fit(X_train, y_train)

In [None]:
# Disabled experiment: refit LinearSVC with the best C found by the (also
# disabled) grid search. Every line stays commented out so the cell parses.
#svc = LinearSVC(random_state=0,
                #C=LSVC.best_params_['C'])
#svc.fit(X_train, y_train)
#y_pred = svc.predict(X_val)

#svc_tuned = LinearSVC(random_state=random_state)
#svc_tuned.fit(X_train, y_train)
#y_pred_tuned = svc_tuned.predict(X_test)

In [None]:

test[['index','lang_id']].to_csv('LSVC3.csv', index=False)

In [None]:
### Gradient boosting classifier (took too long to train, so it is left disabled)

In [None]:
#GBC = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)

#GBC.fit(X_train, y_train)

In [None]:
#pred_GBC = GBC.predict(X_test)

In [None]:
#test['lang_id'] = y_pred

In [None]:
#test.head()