In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
from torch import nn, optim

In [63]:
training_data = [
        ("Veinte paginas".lower().split(), "Spanish"),
        ("நீங்கள் ஆங்கிலம் பேசுகிறீர்களா?".split(),"தமிழ்"),
        ("உங்களை சந்தித்ததில் மகிழ்ச்சி".split(),"தமிழ்"),
        ("உங்கள் பெயர் என்ன?".split(),"தமிழ்"),
        ("காலை வணக்கம்!".split(),"தமிழ்"),
        ("அவள் என் அம்மா".split(),"தமிழ்"),
        ("I will visit the library".lower().split(), "English"),
        ("I am reading a book".lower().split(), "English"),
        ("This is my favourite chapter".lower().split(), "English"),
        ("Estoy en la biblioteca".lower().split(), "Spanish"),
        ("Tengo un libro".lower().split(), "Spanish")
        ]

test_data = [
        ("Estoy leyendo".lower().split(), "Spanish"),
         ("நீங்கள் எனக்கு உதவ முடியுமா?".split() , "தமிழ்"),
        ("This is not my favourite book".lower().split(), "English")
        ]

word_dict = {}
i = 0
for words, language in training_data + test_data:
    for word in words:
        if word not in word_dict:
            word_dict[word] = i
            i += 1
print(word_dict)

{'veinte': 0, 'paginas': 1, 'நீங்கள்': 2, 'ஆங்கிலம்': 3, 'பேசுகிறீர்களா?': 4, 'உங்களை': 5, 'சந்தித்ததில்': 6, 'மகிழ்ச்சி': 7, 'உங்கள்': 8, 'பெயர்': 9, 'என்ன?': 10, 'காலை': 11, 'வணக்கம்!': 12, 'அவள்': 13, 'என்': 14, 'அம்மா': 15, 'i': 16, 'will': 17, 'visit': 18, 'the': 19, 'library': 20, 'am': 21, 'reading': 22, 'a': 23, 'book': 24, 'this': 25, 'is': 26, 'my': 27, 'favourite': 28, 'chapter': 29, 'estoy': 30, 'en': 31, 'la': 32, 'biblioteca': 33, 'tengo': 34, 'un': 35, 'libro': 36, 'leyendo': 37, 'எனக்கு': 38, 'உதவ': 39, 'முடியுமா?': 40, 'not': 41}


In [64]:
corpus_size = len(word_dict)
languages = 3
label_index = {"Spanish": 0, "English": 1 , "தமிழ்" : 2}

In [65]:
class BagofWordsClassifier(nn.Module):  

    def __init__(self, languages, corpus_size):
        super(BagofWordsClassifier, self).__init__()
        self.linear = nn.Linear(corpus_size, languages)

    def forward(self, bow_vec):
        return F.log_softmax(self.linear(bow_vec), dim=1)

In [66]:
def make_bow_vector(sentence, word_index):
    word_vec = torch.zeros(corpus_size)
    for word in sentence:
        word_vec[word_dict[word]] += 1
    return word_vec.view(1, -1)

def make_target(label, label_index):
    return torch.LongTensor([label_index[label]])

In [82]:
languages

3

In [68]:
model = BagofWordsClassifier(languages, corpus_size)

In [69]:
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [70]:
for epoch in range(100):
    for sentence, label in training_data:

        model.zero_grad()

        bow_vec = make_bow_vector(sentence, word_dict)
        target = make_target(label, label_index)


        log_probs = model(bow_vec)

        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()
        
    if epoch % 10 == 0:
        print('Epoch: ',str(epoch+1),', Loss: ' + str(loss.item()))

Epoch:  1 , Loss: 1.292186975479126
Epoch:  11 , Loss: 0.2877197563648224
Epoch:  21 , Loss: 0.1350707858800888
Epoch:  31 , Loss: 0.08645842969417572
Epoch:  41 , Loss: 0.06326258927583694
Epoch:  51 , Loss: 0.049787912517786026
Epoch:  61 , Loss: 0.04100893437862396
Epoch:  71 , Loss: 0.034844618290662766
Epoch:  81 , Loss: 0.03028198517858982
Epoch:  91 , Loss: 0.026770485565066338


In [91]:
def make_predictions(data):

    with torch.no_grad():
        sentence = data
        #label = data[1]
        print("sentence", sentence)
        bow_vec = make_bow_vector(sentence, word_dict)
        log_probs = model(bow_vec)
        print(sentence)
       
        print(np.exp(log_probs))
        a = ((np.argmax(np.exp(log_probs))).item())
        print("Predicted Language is " , list(label_index.keys())[a])
        
        
make_predictions(test_data[0][0])
make_predictions(test_data[1][0])

sentence ['estoy', 'leyendo']
['estoy', 'leyendo']
tensor([[0.5259, 0.0814, 0.3927]])
Predicted Language is  Spanish
sentence ['நீங்கள்', 'எனக்கு', 'உதவ', 'முடியுமா?']
['நீங்கள்', 'எனக்கு', 'உதவ', 'முடியுமா?']
tensor([[0.1250, 0.0386, 0.8364]])
Predicted Language is  தமிழ்


In [92]:
make_predictions(['நீங்கள்', 'எனக்கு','அம்மா'])

sentence ['நீங்கள்', 'எனக்கு', 'அம்மா']
['நீங்கள்', 'எனக்கு', 'அம்மா']
tensor([[0.0419, 0.0174, 0.9408]])
Predicted Language is  தமிழ்
