# Language Detection using NLP tools.

## 1> Langdetect
Works best on larger portions of text. It uses a non-deterministic approach under the hood, which means you can get different results for the same text sample unless you fix the seed (`DetectorFactory.seed = 0`) before the first detection.

In [17]:
# Dependencies (install once; %pip targets the running kernel's environment):
# %pip install langdetect
# %pip install textblob

from langdetect import DetectorFactory, detect
from textblob import TextBlob

# langdetect is non-deterministic by default, so the same text can yield
# different answers between runs. Fixing the seed before the first detect()
# call makes the result repeatable.
DetectorFactory.seed = 0

# Example text. Earlier revisions assigned a Spanish tweet to this name twice
# and then overwrote it before use; only the value actually detected is kept.
example_tweet = "Bonjour tout le monde"

# Detect language using langdetect (returns a language code such as 'fr')
detected_language = detect(example_tweet)
print(f'Language of text { example_tweet} is: {detected_language}')





Language of text Bonjour tout le monde is: fr


## 2>Textblob

In [19]:
# # Proceed with TextBlob analysis
# from textblob import TextBlob
# b = TextBlob("bonjour")
# b.detect_language()

# Note: This approach requires internet access, because TextBlob detects the language by calling Google Translate's language-detection API.

## 3> Polyglot.
#### Able to detect texts with mixed languages.



In [25]:
# !pip install polyglot
# !apt-get install -y python3-icu
# !apt-get install -y libicu-dev
# !pip install pyicu
# !pip install pycld2

from polyglot.detect import Detector

# Sample that mixes English prose with simplified and traditional Chinese.
mixed_text = u"""
China (simplified Chinese: 中国; traditional Chinese: 中國),
officially the People's Republic of China (PRC), is a sovereign state
located in East Asia.
"""

# Build the detector once, then print every language entry it reports.
detector = Detector(mixed_text)
for detected in detector.languages:
    print(detected)

name: English     code: en       confidence:  87.0 read bytes:  1154
name: Chinese     code: zh_Hant  confidence:   5.0 read bytes:  1755
name: un          code: un       confidence:   0.0 read bytes:     0


## 4>  chardet
#### chardet can also report the language when the input contains bytes in the range (127, 255]:

In [28]:
# !pip install chardet
import chardet

# chardet inspects raw bytes, so the sample is encoded to cp1251 first.
# The detect() call is kept as the cell's last expression so that the cell
# displays the real detection result rather than a pasted dict literal.
chardet.detect("Я люблю вкусные пампушки".encode('cp1251'))
# Expected: {'encoding': 'windows-1251', 'confidence': 0.9637267119204621, 'language': 'Russian'}



{'encoding': 'windows-1251',
 'confidence': 0.9637267119204621,
 'language': 'Russian'}

## 5> guess_language
Can detect the language of very short samples by using a spell checker with dictionaries.



In [29]:
!pip install guess_language-spirit

Collecting guess_language-spirit
  Downloading guess_language-spirit-0.5.3.tar.bz2 (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/81.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: guess_language-spirit
  Building wheel for guess_language-spirit (setup.py) ... [?25l[?25hdone
  Created wheel for guess_language-spirit: filename=guess_language_spirit-0.5.3-py3-none-any.whl size=121197 sha256=09293b91e46d81707c99f471b0c673e2d14279ce4f0aa15aa8b8f75dbb68c6d3
  Stored in directory: /root/.cache/pip/wheels/e5/32/34/62e25b4c55d2802bb7d6540aed1fe171722a4f3bd854986d89
Successfully built guess_language-spirit
Installing c

## 6> langid
langid.py can be used both as a Python module and as a command-line tool.

In [30]:
! pip install langid

Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941172 sha256=e742ca9ba6ab66b6fe11f0c82d6900f66cffc40589d9267934300ae32aaea082
  Stored in directory: /root/.cache/pip/wheels/23/c8/c6/eed80894918490a175677414d40bd7c851413bbe03d4856c3c
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6


In [31]:
import langid

# classify() returns a (language_code, score) tuple; leaving the call as the
# last expression makes the notebook display it.
# Expected: ('en', -54.41310358047485)
sample_text = "This is a test"
langid.classify(sample_text)

('en', -54.41310358047485)

## 7> FastText
fastText is a text classifier that can recognize 176 languages when used with its pre-trained language identification model:

https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz

In [34]:
# !pip install fasttext



In [39]:
from pathlib import Path
from urllib.request import urlretrieve

import fasttext
import numpy as np

# Pre-trained language identification model (compressed variant).
MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
MODEL_PATH = Path("lid.176.ftz")

# Fetch the model on first use so the cell works on a fresh kernel/checkout.
# Download to a temporary name and rename afterwards, so an interrupted
# download never leaves a truncated file that looks like a valid model.
if not MODEL_PATH.exists():
    partial_path = MODEL_PATH.with_suffix(".part")
    urlretrieve(MODEL_URL, str(partial_path))
    partial_path.replace(MODEL_PATH)

# Load the pre-trained language identification model
model = fasttext.load_model(str(MODEL_PATH))

# Predict the top 2 languages for the given text
text = "الشمس تشرق"
predictions = model.predict(text, k=2)

# predict() returns a pair: the labels and their matching probabilities.
labels, probabilities = predictions
print(labels)          # Output: ('__label__ar', '__label__fa')
print(probabilities)   # Output: array([0.98124713, 0.01265871])

# Display the results in a more readable format
for label, probability in zip(labels, probabilities):
    # Strip fastText's label prefix to leave just the language code.
    language = label.replace('__label__', '')
    print(f"Language: {language}, Probability: {probability}")


('__label__ar', '__label__fa')
[0.98124713 0.01265871]
Language: ar, Probability: 0.9812471270561218
Language: fa, Probability: 0.012658712454140186




## 8> pyCLD3
pycld3 is a neural network model for language identification. This package contains the inference code and a trained model.

In [42]:
# Install cld3 if not already installed
!pip install pycld3

# Import the cld3 module
import cld3

# Example text in a different language
text = "影響包含對氣候的變化以及自然資源的枯竭程度"

# Detect the language of the text
result = cld3.get_language(text)

# Print the detected language and its probability
print(f"Language: {result.language}, Probability: {result.probability}")


Collecting pycld3
  Using cached pycld3-0.22.tar.gz (726 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycld3
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for pycld3 (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for pycld3[0m[31m
[0m[?25h  Running setup.py clean for pycld3
Failed to build pycld3
[31mERROR: Could not build wheels for pycld3, which is required to install pyproject.toml-based projects[0m[31m
[0m

ModuleNotFoundError: No module named 'cld3'