In [25]:
import io
import json
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Load the Kaggle language-detection corpus and shuffle all rows before the
# positional train/validation split performed later in the notebook.
# A fixed random_state makes the shuffle (and everything derived from the row
# order: the split, the label order, the reported metrics) reproducible on
# "Restart Kernel -> Run All".
df = pd.read_csv("/kaggle/input/language-detection/Language Detection.csv")
df = df.sample(frac=1, random_state=42)

In [3]:
# Preview the first rows of the shuffled frame (columns: Text, Language).
df.head()

Unnamed: 0,Text,Language
7708,sei libero sabato prossimo?,Italian
6111,Первые 5 разделов в это время имели следующий ...,Russian
3679,"À partir de l'été 2002, tous les sites sont pr...",French
1504,വിക്കിപീഡിയയിൽ രജിസ്റ്റർ ചെയ്യുന്നതിലൂടെയാണ് വ...,Malayalam
7993,"İngilizce konuşulan bir ülkeye, bunu tamamen k...",Turkish


In [4]:
# Class balance: number of rows per language label, most frequent first.
df['Language'].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [5]:
# Character count of every text, stored as a helper column for the length
# statistics examined in the following cells.
df['Text Length'] = df['Text'].map(len)

In [6]:
# Summary statistics of the character lengths (count, mean, quartiles, max).
df['Text Length'].describe()

count    10337.000000
mean       124.055625
std        253.688194
min          1.000000
25%         43.000000
50%        100.000000
75%        166.000000
max      19088.000000
Name: Text Length, dtype: float64

In [7]:
# Show the 20 longest texts to see which rows are responsible for the extreme
# maximum length.
df.sort_values(by='Text Length', ascending=False).head(20)

Unnamed: 0,Text,Language,Text Length
1997,तब से विकिमीडिया ने कई अन्य परियोजनाएं शुरू की...,Hindi,19088
1979,विकि-शब्दकोष (एक मुक्त शब्दकोष एवं समानांतर को...,Hindi,8752
2041,एक सेकंड पर लटकाओ। बस एक पल मेरे साथ सहन करो। ...,Hindi,8019
1984,वह टिप्पणी देते हैं कि नॉन-फिक्शन के कई पारंपर...,Hindi,4068
1988,के कोरियाई होस्टिंग सुविधा सियोल में थीं।[104]...,Hindi,3678
1996,"पहले से ही, राष्ट्रपति प्रविष्टियों को प्रतिदि...",Hindi,3093
2012,"नहीं, आपने एक अद्भुत काम किया है। यदि आप किसी ...",Hindi,2849
1981,बोट्स नामक कंप्यूटर प्रोग्राम के निर्माण के बा...,Hindi,2750
2005,अगर आप किसी से पूछते हैं कि आप रोने के लिए परे...,Hindi,2236
2889,[220][221][222] O MediaWiki tem documentação s...,Portugeese,1965


In [8]:
# Remove every Hindi row from the frame in place: per the outputs above, Hindi
# has only 63 samples and accounts for the longest outlier texts.
hindi_index = df.index[df['Language'] == 'Hindi']
df.drop(index=hindi_index, inplace=True)

In [9]:
# Positional train/validation split of the already-shuffled frame.
train_ratio = 0.8
val_ratio = 0.2

# The two ratios must describe a complete partition of the data.
assert abs(train_ratio + val_ratio - 1.0) < 1e-9, "train_ratio + val_ratio must equal 1"

# Derive a single cut point from train_ratio so both slices always agree and
# changing the ratio above actually changes the split.
split_at = int(len(df) * train_ratio)
train = df.iloc[:split_at]
val = df.iloc[split_at:]

# Sanity check: the slices cover df exactly, with no overlap and no gap.
assert len(train) + len(val) == len(df)
len(train), len(val)

(8219, 2055)

In [10]:
# Characters stripped from the text before tokenisation: a set of symbols,
# tab/newline and all digits. The characters ! ' , . < > ? are not in this
# list, so they are kept in the text.
TOKEN_FILTERS = '"#$%&()*+-/:;=@[\\]^_`{|}~\t\n0123456789'

# Words not seen while fitting are mapped to the '<UNK>' token.
tokenizer = Tokenizer(filters=TOKEN_FILTERS, oov_token='<UNK>')

In [11]:
# Plain Python lists of the raw texts for each split, as consumed by the
# tokenizer calls below.
train_sentences = train['Text'].tolist()
val_sentences = val['Text'].tolist()

In [12]:
# Build the vocabulary from the training sentences only; validation texts are
# not used here, so words that appear only there map to the OOV token.
tokenizer.fit_on_texts(train_sentences)

In [26]:
# Persist the fitted tokenizer next to the model so the same word index can be
# reused later.
tokenizer_json = tokenizer.to_json()

# The JSON string returned by to_json() is passed through json.dumps once more,
# so the file content is a JSON string literal wrapping the tokenizer JSON.
# ensure_ascii=False keeps non-ASCII characters unescaped.
serialized_tokenizer = json.dumps(tokenizer_json, ensure_ascii=False)
with open('general_tokenizer.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized_tokenizer)

In [13]:
# Size of the embedding table: one row per entry in the fitted word index,
# plus one extra because the Keras Tokenizer numbers words from 1 and index 0
# is left for the padding value.
VOCAB_SIZE = len(tokenizer.word_index) + 1

In [14]:
# Convert each split's sentences into lists of integer word ids using the
# vocabulary fitted on the training data.
train_sequences, val_sequences = map(
    tokenizer.texts_to_sequences,
    (train_sentences, val_sentences),
)

In [15]:
# Sequence length fed to the model: the 95th percentile of the
# whitespace-separated word count per text, truncated to an integer.
words_per_text = df['Text'].str.split().str.len()
MAX_LENGTH = int(words_per_text.quantile(0.95))

In [16]:
# Bring every sequence to exactly MAX_LENGTH ids: shorter ones are padded at
# the end and longer ones are cut at the end. Both splits share the settings.
_pad_options = dict(maxlen=MAX_LENGTH, padding='post', truncating='post')

train_inputs = pad_sequences(train_sequences, **_pad_options)
val_inputs = pad_sequences(val_sequences, **_pad_options)

In [17]:
# Label set and label -> class-index mapping.
# unique() returns values in order of first appearance, which depends on the
# row shuffle; sorting first makes the class indices stable across runs so the
# saved model's output positions can always be mapped back to languages.
languages = df['Language'].sort_values().unique()
d = {lang: i for i, lang in enumerate(languages)}

In [18]:
# Translate language names to class indices, then expand the indices into
# one-hot rows with one column per known language.
n_classes = len(languages)

train_integer_encoded = [d[label] for label in train['Language'].tolist()]
val_integer_encoded = [d[label] for label in val['Language'].tolist()]

train_outputs = tf.keras.utils.to_categorical(train_integer_encoded, num_classes=n_classes)
val_outputs = tf.keras.utils.to_categorical(val_integer_encoded, num_classes=n_classes)

In [19]:
# Sequence classifier: token embeddings -> bidirectional GRU -> softmax over
# the language classes.
# The output width is taken from the label set rather than a literal so it
# always matches the one-hot targets, which are built with
# num_classes=len(languages).
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, input_length=MAX_LENGTH),
    Bidirectional(GRU(units=32, dropout=0.3)),
    Dense(len(languages), activation='softmax')
])

In [20]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
# `metrics` is passed as a list, the documented form; a bare string is only
# tolerated by some Keras versions and is rejected by newer ones.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [21]:
# Training hyperparameters.
EPOCHS = 15
BATCH_SIZE = 50

# Train on the padded training sequences and evaluate on the validation split
# after every epoch; the per-epoch metrics are kept in `history`.
history = model.fit(
    x=train_inputs,
    y=train_outputs,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(val_inputs, val_outputs),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [22]:
# Export the trained model to the 'language-detection-model' directory.
model.save('language-detection-model')
# Bundle the exported directory into a single zip archive using a shell
# command (IPython `!` syntax; requires the `zip` binary to be available).
!zip -r ./language-detection-model.zip ./language-detection-model

  adding: language-detection-model/ (stored 0%)
  adding: language-detection-model/variables/ (stored 0%)
  adding: language-detection-model/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: language-detection-model/variables/variables.index (deflated 63%)
  adding: language-detection-model/saved_model.pb (deflated 90%)
  adding: language-detection-model/fingerprint.pb (stored 0%)
  adding: language-detection-model/assets/ (stored 0%)
  adding: language-detection-model/keras_metadata.pb (deflated 89%)
