In [33]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [19]:
# Read input data from files
en_text = pd.read_csv('../data/CONcreTEXT_trial_EN.tsv',sep ='\t')
it_text = pd.read_csv('../data/CONcreTEXT_trial_IT.tsv',sep='\t')

In [20]:
#Add language column to dataframes
en_text['LANGUAGE'] ='ENGLISH'
it_text['LANGUAGE'] =  'ITALIAN'

In [59]:
# Question 1 Concatenate dataframes in English and Italian language
train_data = pd.concat([en_text, it_text], sort=False).reindex()
train_data

Unnamed: 0,TARGET,POS,INDEX,TEXT,MEAN,LANGUAGE
0,achievement,N,3,"Bring up academic achievements , awards , and ...",3.06,ENGLISH
1,achievement,N,9,"Please list people you have helped , your pers...",3.03,ENGLISH
2,activate,V,1,Add activated carbon straight to your vodka .,3.83,ENGLISH
3,activate,V,15,"Place sensors around your garden , and when a ...",5.51,ENGLISH
4,adventure,N,9,Look for a partner that shares your level of a...,2.03,ENGLISH
...,...,...,...,...,...,...
95,verità,N,8,"In un modo o nell' altro , la verità viene sem...",2.53,ITALIAN
96,viaggio,N,2,Organizza dei viaggi nel fine settimana quando...,5.03,ITALIAN
97,viaggio,N,6,Pesa le tue valigie prima del viaggio per evit...,4.84,ITALIAN
98,vista,N,6,è molto importante non perdere di vista la pro...,2.22,ITALIAN


In [60]:
# Question 2 Count vectorize - Builds a dictionary of features and transforms documents to feature vectors
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.TEXT)
X_train_counts.shape

(200, 1330)

In [61]:
X_train_counts

<200x1330 sparse matrix of type '<class 'numpy.int64'>'
	with 2355 stored elements in Compressed Sparse Row format>

In [62]:
count_vect.vocabulary_.get(u'algorithm')

In [63]:
# TfidfTransformer - term frequency times Inverse Document Frequency
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(200, 1330)

In [64]:
# Transform our count-matrix to a tf-idf representation
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(200, 1330)

In [65]:
# Question 3 Train model to predict the language of a sentence
clf = MultinomialNB().fit(X_train_tfidf, train_data.LANGUAGE)

In [67]:
# Question 4 Predict sample sentence
docs_new = ['Why does a rose smell sweet?', 'Pensa ai tuoi sentimenti di amore.']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, category))

'Why does a rose smell sweet?' => ENGLISH
'Pensa ai tuoi sentimenti di amore.' => ITALIAN


In [68]:
# Question 5) Predict Additional sentences 

test_data = ["Ogni individuo ha diritto all'istruzione.", "Everyone has the right to education", "L'istruzione tecnica e professionale deve essere",
            "Everyone has the right freely to participate in the cultural life of the community","Ogni individuo ha diritto di prendere parte liberamente"]
X_new_counts = count_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(test_data, predicted):
     print('%r => %s' % (doc, category))

"Ogni individuo ha diritto all'istruzione." => ITALIAN
'Everyone has the right to education' => ENGLISH
"L'istruzione tecnica e professionale deve essere" => ITALIAN
'Everyone has the right freely to participate in the cultural life of the community' => ENGLISH
'Ogni individuo ha diritto di prendere parte liberamente' => ITALIAN


In [78]:
# Bonus point :: Following sentence is in englisg language but the model predicst Italian
my_sentence = ['Mi chiamo Martin']

X_new_counts = count_vect.transform(my_sentence)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(my_sentence, predicted):
     print('%r => %s' % (doc, category))

'Mi chiamo Martin' => ENGLISH
