In [1]:
!pip install sentence_transformers

In [2]:
from termcolor import colored
from sentence_transformers import SentenceTransformer, models, util
import torch

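<h3>SentenceTransformers Multilingual Models</h3>
<p>Reference: <a href="https://www.sbert.net/examples/training/multilingual/README.html#multilingual-models">https://www.sbert.net/examples/training/multilingual/README.html#multilingual-models</a></p>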

<h4> Performance </h4>

  <table><tr>
    <th>Model</th>
    <th>AR-AR</th>
    <th>AR-EN</th>
    <th>ES-ES</th>
    <th>ES-EN</th>
    <th>EN-EN</th>
    <th>TR-EN</th>
    <th>EN-DE</th>
    <th>FR-EN</th>
    <th>IT-EN</th>
    <th>NL-EN</th>
    <th>Average</th>
  </tr>
  <tr>
    <td>XLM-RoBERTa mean pooling </td>
    <td align="center">25.7</td>
    <td align="center">17.4</td>
    <td align="center">51.8</td>
    <td align="center">10.9</td>
    <td align="center">50.7</td>
    <td align="center">9.2</td>
    <td align="center">21.3</td>
    <td align="center">16.6</td>
    <td align="center">22.9</td>
    <td align="center">26.0</td>
    <td align="center">25.2</td>
  </tr>
  <tr>
    <td>mBERT mean pooling </td>
    <td align="center">50.9</td>
    <td align="center">16.7</td>
    <td align="center">56.7</td>
    <td align="center">21.5</td>
    <td align="center">54.4</td>
    <td align="center">16.0</td>
    <td align="center">33.9</td>
    <td align="center">33.0</td>
    <td align="center">34.0</td>
    <td align="center">35.6</td>
    <td align="center">35.3</td>
  </tr>
  <tr>
    <td>LASER</td>
    <td align="center">68.9</td>
    <td align="center">66.5</td>
    <td align="center">79.7</td>
    <td align="center">57.9</td>
    <td align="center">77.6</td>
    <td align="center">72.0</td>
    <td align="center">64.2</td>
    <td align="center">69.1</td>
    <td align="center">70.8</td>
    <td align="center">68.5</td>
    <td align="center">69.5</td>
  </tr> 
  <tr>
    <td colspan="12"><b>Sentence Transformer Models</b></td>
  </tr>
  <tr>
  <td>distiluse-base-multilingual-cased</td>
    <td align="center">75.9</td>
    <td align="center">77.6</td>
    <td align="center">85.3</td>
    <td align="center">78.7</td>
    <td align="center">85.4</td>
    <td align="center">75.5</td>
    <td align="center">80.3</td>
    <td align="center">80.2</td>
    <td align="center">80.5</td>
    <td align="center">81.7</td>
    <td align="center">80.1</td>
    </tr>
</table>


<h4> Usage </h4>

```
from sentence_transformers import SentenceTransformer
model_name = "distiluse-base-multilingual-cased"
embedder = SentenceTransformer(model_name)
embeddings = embedder.encode(['Hello World', 'Hallo Welt', 'Hola mundo'])
print(embeddings)
```



<h3> Load multilingual BERT model in SentenceTransformer </h3>

```
# from sentence_transformers import models
from sentence_transformers import SentenceTransformer, models

#Multilingual Model
model_name = 'bert-base-multilingual-uncased'

# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
```



## Test SentenceTransformer Multilingual Model

In [3]:
# Load the multilingual sentence-embedding model by name.
# SentenceTransformer is already imported in the notebook's import cell,
# so it is not re-imported here.
model_name = 'distiluse-base-multilingual-cased'
embedder = SentenceTransformer(model_name)

In [22]:
# Search corpus: twelve English entries followed by twelve Hindi entries
# that mirror them in the same order (24 items in total).
corpus = [
    # --- English ---
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
    "apple",
    "mango",
    "understanding",
] + [
    # --- Hindi ---
    "एक आदमी खाना खा रहा है।",
    "एक आदमी रोटी का टुकड़ा खा रहा है।",
    "लड़की एक बच्चे को ले जा रही है।",
    "एक आदमी घोड़े की सवारी कर रहा है।",
    "एक महिला वायलिन बजा रही है।",
    "दो आदमियों ने गाड़ियों को जंगल में धकेला।",
    "एक आदमी सफेद घोड़े पर एक बंद जमीन पर सवार है।",
    "एक बंदर ढोल बजा रहा है।",
    "एक चीता अपने शिकार के पीछे भाग रहा है।",
    "सेब",
    "आम",
    "समझ",
]

In [6]:
# Embed every corpus sentence once up front; the similarity search further
# down compares each query against this tensor.
corpus_embeddings = embedder.encode(
    corpus,
    convert_to_tensor=True,
)

In [23]:
# Test queries: fourteen Hindi strings followed by one English question.
queries = [
    "एक आदमी खाना खा रहा है।",
    "एक आदमी रोटी का टुकड़ा खा रहा है।",
    "लड़की एक बच्चे को ले जा रही है।",
    "एक आदमी घोड़े की सवारी कर रहा है।",
    "एक महिला वायलिन बजा रही है।",
    "दो आदमियों ने गाड़ियों को जंगल में धकेला।",
    "एक आदमी सफेद घोड़े पर एक बंद जमीन पर सवार है।",
    "एक बंदर ढोल बजा रहा है।",
    "एक बंदर गाना गा रहा है।",
    "एक बंदर खाना खा रहा है।",
    "एक चीता अपने शिकार के पीछे भाग रहा है।",
    "सेब",
    "आम",
    "समझ",
] + [
    "Is monkey an animal?",
]

In [9]:
# For each test query, print the corpus sentences whose embeddings are most
# similar to it under the `embedder` model.
top_k = min(5, len(corpus))  # never request more hits than the corpus holds
for i, query in enumerate(queries):
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus embedding; [0]
    # selects the score row belonging to this single query.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n======================")
    print(colored(f"{i}. Query: {query}", 'red'))
    # Report the k actually used instead of a hard-coded 5.
    print(f"\nTop {top_k} most similar sentences in corpus:")

    # Convert to plain Python floats/ints so list indexing and number
    # formatting do not rely on tensor conversions.
    hits = zip(top_results.values.tolist(), top_results.indices.tolist())
    for score, idx in hits:
        print(colored(f"{corpus[idx]} (Score: {score:.4f})", 'green'))


[31m0. Query: एक आदमी खाना खा रहा है।[0m

Top 5 most similar sentences in corpus:
[32mएक आदमी खाना खा रहा है। Score: 0.9999997019767761)[0m
[32mA man is eating food. Score: 0.9736140966415405)[0m
[32mA man is eating a piece of bread. Score: 0.7029650211334229)[0m
[32mएक आदमी रोटी का टुकड़ा खा रहा है। Score: 0.6472431421279907)[0m
[32mA man is riding a horse. Score: 0.37016773223876953)[0m

[31m1. Query: एक आदमी रोटी का टुकड़ा खा रहा है।[0m

Top 5 most similar sentences in corpus:
[32mएक आदमी रोटी का टुकड़ा खा रहा है। Score: 0.9999998807907104)[0m
[32mA man is eating a piece of bread. Score: 0.9618503451347351)[0m
[32mएक आदमी खाना खा रहा है। Score: 0.6472431421279907)[0m
[32mA man is eating food. Score: 0.5999749898910522)[0m
[32mA monkey is playing drums. Score: 0.3435114026069641)[0m

[31m2. Query: लड़की एक बच्चे को ले जा रही है।[0m

Top 5 most similar sentences in corpus:
[32mलड़की एक बच्चे को ले जा रही है। Score: 0.9999996423721313)[0m
[32mThe girl is c

In [16]:
# Gujarati test queries, each paired with the English sentence that is later
# printed as its translation. The pairs are split back into the two parallel
# lists `gujrati` and `eng`.
gujrati, eng = (
    list(column)
    for column in zip(*[
        ("એક માણસ ખોરાક ખાઈ રહ્યો છે.", "A man is eating food."),
        ("એક માણસ બ્રેડનો ટુકડો ખાઈ રહ્યો છે.", "A man is eating a piece of bread."),
        ("છોકરી બાળકને લઈ જઈ રહી છે.", "The girl is carrying a baby."),
        ("એક માણસ ઘોડા પર સવાર છે.", "A man is riding a horse."),
        ("એક સ્ત્રી વાયોલિન વગાડી રહી છે.", "A woman is playing violin."),
        ("બે માણસોએ ગાડીઓને જંગલમાં ધકેલી દીધી.", "Two men pushed carts through the woods."),
        ("એક માણસ બંધ જમીન પર સફેદ ઘોડા પર સવાર છે.", "A man is riding a white horse on an enclosed ground."),
        ("એક વાંદરો ઢોલ વગાડી રહ્યો છે.", "A monkey is playing drums."),
        ("ચિત્તો તેના શિકારની પાછળ દોડે છે.", "A cheetah is running behind its prey."),
        ("સફરજન", "apple"),
        ("કેરી", "mango"),
        ("સમજવુ", "understanding"),
    ])
)

In [15]:
# For each Gujarati query, print its English translation and the corpus
# sentences whose embeddings are most similar to it under `embedder`.

# The two lists are consumed pairwise, so fail loudly if they drift apart.
assert len(gujrati) == len(eng), "gujrati and eng must be parallel lists"

top_k = min(5, len(corpus))  # never request more hits than the corpus holds
for i, (query, translation) in enumerate(zip(gujrati, eng)):
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus embedding; [0]
    # selects the score row belonging to this single query.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n======================")
    print(colored(f"{i}. Query: {query}", 'red'))
    print(colored(f"{i}. Translation of query: {translation}", 'red'))
    # Report the k actually used instead of a hard-coded 5.
    print(f"\nTop {top_k} most similar sentences in corpus:")

    # Convert to plain Python floats/ints so list indexing and number
    # formatting do not rely on tensor conversions.
    hits = zip(top_results.values.tolist(), top_results.indices.tolist())
    for score, idx in hits:
        print(colored(f"{corpus[idx]} (Score: {score:.4f})", 'green'))


[31m0. Query: એક માણસ ખોરાક ખાઈ રહ્યો છે.[0m
[31m0. Translation of query: A man is eating food.[0m

Top 5 most similar sentences in corpus:
[32mएक आदमी खाना खा रहा है। Score: 0.8792285919189453)[0m
[32mA man is eating food. Score: 0.8619061708450317)[0m
[32mA man is eating a piece of bread. Score: 0.6823359727859497)[0m
[32mएक आदमी रोटी का टुकड़ा खा रहा है। Score: 0.6788792610168457)[0m
[32mएक आदमी घोड़े की सवारी कर रहा है। Score: 0.32669737935066223)[0m

[31m1. Query: એક માણસ બ્રેડનો ટુકડો ખાઈ રહ્યો છે.[0m
[31m1. Translation of query: A man is eating a piece of bread.[0m

Top 5 most similar sentences in corpus:
[32mएक आदमी रोटी का टुकड़ा खा रहा है। Score: 0.5257009863853455)[0m
[32mA man is eating a piece of bread. Score: 0.47255370020866394)[0m
[32mएक बंदर ढोल बजा रहा है। Score: 0.37435203790664673)[0m
[32mएक आदमी सफेद घोड़े पर एक बंद जमीन पर सवार है। Score: 0.37368011474609375)[0m
[32mएक आदमी खाना खा रहा है। Score: 0.36608418822288513)[0m

[31m2. Query:

## Test Multilingual BERT Model

In [17]:
# Wrap the stock multilingual BERT checkpoint in a SentenceTransformer.
# SentenceTransformer and models are already imported in the notebook's
# import cell, so they are not re-imported here.
model_name = 'bert-base-multilingual-uncased'

# Transformer module: maps input tokens to token embeddings.
word_embedding_model = models.Transformer(model_name)

# Mean-pool the token embeddings into one fixed-size sentence vector
# (CLS-token and max pooling are explicitly switched off).
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
# Embed the same corpus with the mBERT-based `model`; stored under a separate
# name so the embeddings computed with `embedder` are not overwritten.
corpus_embeddings_ = model.encode(
    corpus,
    convert_to_tensor=True,
)

In [24]:
# For each test query, print the corpus sentences whose embeddings are most
# similar to it under the mBERT-based `model`.
top_k = min(5, len(corpus))  # never request more hits than the corpus holds
for i, query in enumerate(queries):
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus embedding; [0]
    # selects the score row belonging to this single query.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings_)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n======================")
    print(colored(f"{i}. Query: {query}", 'red'))
    # Report the k actually used instead of a hard-coded 5.
    print(f"\nTop {top_k} most similar sentences in corpus:")

    # Convert to plain Python floats/ints so list indexing and number
    # formatting do not rely on tensor conversions.
    hits = zip(top_results.values.tolist(), top_results.indices.tolist())
    for score, idx in hits:
        print(colored(f"{corpus[idx]} (Score: {score:.4f})", 'green'))


[31m0. Query: एक आदमी खाना खा रहा है।[0m

Top 5 most similar sentences in corpus:
[32mएक आदमी खाना खा रहा है। Score: 1.0)[0m
[32mएक आदमी रोटी का टुकड़ा खा रहा है। Score: 0.8767616152763367)[0m
[32mएक आदमी घोड़े की सवारी कर रहा है। Score: 0.8630039095878601)[0m
[32mएक महिला वायलिन बजा रही है। Score: 0.8284913897514343)[0m
[32mएक बंदर ढोल बजा रहा है। Score: 0.8233563899993896)[0m

[31m1. Query: एक आदमी रोटी का टुकड़ा खा रहा है।[0m

Top 5 most similar sentences in corpus:
[32mएक आदमी रोटी का टुकड़ा खा रहा है। Score: 1.0000008344650269)[0m
[32mएक आदमी घोड़े की सवारी कर रहा है। Score: 0.9081702828407288)[0m
[32mएक चीता अपने शिकार के पीछे भाग रहा है। Score: 0.8989201188087463)[0m
[32mएक आदमी सफेद घोड़े पर एक बंद जमीन पर सवार है। Score: 0.8853155374526978)[0m
[32mएक आदमी खाना खा रहा है। Score: 0.8767614364624023)[0m

[31m2. Query: लड़की एक बच्चे को ले जा रही है।[0m

Top 5 most similar sentences in corpus:
[32mलड़की एक बच्चे को ले जा रही है। Score: 0.999999701976776

 *The issue with multilingual BERT (mBERT), as well as with XLM-RoBERTa, is that they produce rather poor sentence representations out of the box. Furthermore, the vector spaces of different languages are not aligned, i.e., sentences with the same content in different languages are mapped to different locations in the vector space.*