First, let's import the necessary libraries and load the data:

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import save_model

2023-04-09 13:11:29.340473: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-09 13:11:30.593842: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/extras/CUPTI/lib64
2023-04-09 13:11:30.593969: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/l

Next, we need to preprocess the data by converting the text into numerical sequences that can be fed into the model. We'll use the Tokenizer class from Keras to do this:

In [2]:
# Load the dataset
data = pd.read_csv('Language Detection.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [4]:
data.describe()

Unnamed: 0,Text,Language
count,10337,10337
unique,10267,17
top,Jag är ledsen.,English
freq,3,1385


In [5]:
data['Text'].str.len().mean()

124.05562542323692

In [6]:
# Convert the text to numerical sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Text'])

X = tokenizer.texts_to_sequences(data['Text'])
X = pad_sequences(X, maxlen=100, padding='post')

We also need to one-hot encode the labels (which are currently in string format) to numerical values:

In [7]:
# One-hot encode the labels
labels = pd.get_dummies(data['Language'])

Now, we can split the data into training and testing sets:

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels['Italian'], test_size=0.2, random_state=42)

We'll use a simple LSTM model with an embedding layer and a few dense layers:

In [9]:
# Build the model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=100))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

2023-04-09 13:11:35.202365: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:42] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.


Finally, we can train the model and evaluate its performance:

In [10]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')

  return t[start:end]


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.11488939076662064, Test accuracy: 0.9550290107727051


In [11]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
#loss, accuracy = model.evaluate(X_test, y_test)
#print(f'Test loss: {loss}, Test accuracy: {accuracy}')

Epoch 1/10
  4/207 [..............................] - ETA: 4s - loss: 0.0752 - accuracy: 0.9609

  return t[start:end]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')

Epoch 1/10
  1/207 [..............................] - ETA: 8s - loss: 0.0365 - accuracy: 1.0000

  return t[start:end]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.10357607901096344, Test accuracy: 0.9705029129981995


We will now save the model for future inference use.

In [13]:
save_model(model, 'my_model.h5')

We have now loaded our model to test it.

In [14]:
from tensorflow.keras.models import load_model

# Load the saved model
loaded_model = load_model('my_model.h5')

If the probability of the prediction is higher than 0.5, we label the sentence as Italian. Otherwise, we label it as not Italian.

In [16]:
# Define a sentence to test
sentence = "Io sono una casalinga che lavora inoltre a casa come articolista. Mio marito è invece un operaio."
# Convert the sentence to a numerical sequence using the tokenizer
x_test = tokenizer.texts_to_sequences([sentence])

# Pad the sequence so it has the same length as the training sequences
x_test = pad_sequences(x_test, maxlen=100, padding='post')

# Make a prediction using the model
prediction = model.predict(x_test)

# Print the predicted value
print(prediction)

[[0.78383523]]
