### Importing Libraries

In [17]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups

### Question 1: Add a column at the end of the English and Italian data frames named ‘LANGUAGE’ which identifies the language of the ‘TEXT’ column. Create a single data frame by concatenating both the English and Italian data frames into one.

In [18]:
# Load both tab-separated trial files, then tag each frame in place with a
# constant LANGUAGE column so every row stays identifiable after merging.
english = pd.read_csv("CONcreTEXT_trial_EN.tsv", sep="\t")
italian = pd.read_csv("CONcreTEXT_trial_IT.tsv", sep="\t")
for frame, language in ((english, "ENGLISH"), (italian, "ITALIAN")):
    frame["LANGUAGE"] = language

# Stack the English rows on top of the Italian rows. Each frame keeps its own
# row labels, so labels repeat in the merged frame (the Italian rows still end
# at label 99 although the merged frame holds 200 rows).
combined = pd.concat((english, italian), axis=0)
print(combined)

         TARGET POS  INDEX                                               TEXT  \
0   achievement   N      3  Bring up academic achievements , awards , and ...   
1   achievement   N      9  Please list people you have helped , your pers...   
2      activate   V      1     Add activated carbon straight to your vodka .    
3      activate   V     15  Place sensors around your garden , and when a ...   
4     adventure   N      9  Look for a partner that shares your level of a...   
..          ...  ..    ...                                                ...   
95       verità   N      8  In un modo o nell' altro , la verità viene sem...   
96      viaggio   N      2  Organizza dei viaggi nel fine settimana quando...   
97      viaggio   N      6  Pesa le tue valigie prima del viaggio per evit...   
98        vista   N      6  è molto importante non perdere di vista la pro...   
99        vista   N      9  i conigli hanno un ottimo udito e un' ottima v...   

    MEAN LANGUAGE  
0   3.0

### Question 2: Using the sklearn classes CountVectorizer and TfidfTransformer, create a training set using *all rows* of the ‘TEXT’ column from your merged dataframe.

In [19]:
# Learn the vocabulary from every TEXT row (both languages together) and build
# the raw token-count matrix in the same pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(combined.loc[:, "TEXT"])

In [20]:
# Dimensions of the count matrix: one row per TEXT entry, one column per
# vocabulary term learned by the vectorizer.
X_train_counts.shape

(200, 1330)

In [21]:
# Look up the column index assigned to the token 'vista' in the fitted
# vocabulary; .get yields None when the token was never seen during fitting.
count_vect.vocabulary_.get('vista')

1285

In [22]:
# Learn the IDF weights from the raw counts, then re-weight those same counts.
# Fitting and transforming are spelled out as two explicit steps.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(200, 1330)

### Question 3: Train and fit a Multinomial Naive Bayes algorithm on the training data. The target variable is the ‘LANGUAGE’ column in your merged dataframe. 

In [23]:
# Fit the Naive Bayes classifier on the TF-IDF features; the LANGUAGE column
# of the merged frame supplies one label per training row.
clf = MultinomialNB().fit(
    X_train_tfidf,
    combined.loc[:, "LANGUAGE"],
)

### Question 4: Use your model to predict the language of the following two sentences: docs_new = ['Why does a rose smell sweet?', 'Pensa ai tuoi sentimenti di amore.']

In [24]:
# The two sentences required by the question, plus one extra nonsense probe
# ('Brang ip') appended to see how the model handles unfamiliar input.
docs_new = [
    'Why does a rose smell sweet?',
    'Pensa ai tuoi sentimenti di amore.',
    'Brang ip',
]

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform only,
# no refitting) so the new sentences land in the training feature space.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Report each sentence next to its predicted language.
for doc, category in zip(docs_new, predicted):
    print(f"{doc} => {category}")

Why does a rose smell sweet? => ENGLISH
Pensa ai tuoi sentimenti di amore. => ITALIAN
Brang ip => ENGLISH


### Question 5: Test your model on at least 5 other sentences of your own choosing (both English and Italian) and report your results. 

In [25]:
# Five short English phrases chosen as additional test inputs.
docs_new_test = [
    'not be ruled by fear',
    'winning half the battle',
    'nail on the head',
    'hang in there',
    'on the ball',
]

# Project the phrases through the fitted vectorizer and TF-IDF transformer
# (transform only), then classify them.
X_new_counts_test = count_vect.transform(docs_new_test)
X_new_tfidf_test = tfidf_transformer.transform(X_new_counts_test)
predicted_test = clf.predict(X_new_tfidf_test)

# Report each phrase next to its predicted language.
for doc, category in zip(docs_new_test, predicted_test):
    print(f"{doc} => {category}")

not be ruled by fear => ENGLISH
winning half the battle => ENGLISH
nail on the head => ENGLISH
hang in there => ENGLISH
on the ball => ENGLISH


In [26]:
# Five short Italian phrases chosen as additional test inputs.
docs_new_italian_test = [
    'C’è un treno alle',
    'È stato un piacere conoscerla',
    'tutta la colpa e',
    'casa illegale in',
    'amore e gentilezza',
]

# Project the phrases through the fitted vectorizer and TF-IDF transformer
# (transform only), then classify them.
X_new_counts_italian_test = count_vect.transform(docs_new_italian_test)
X_new_tfidf_italian_test = tfidf_transformer.transform(X_new_counts_italian_test)
predicted__italian_test = clf.predict(X_new_tfidf_italian_test)

# Report each phrase next to its predicted language.
for doc, category in zip(docs_new_italian_test, predicted__italian_test):
    print(f"{doc} => {category}")

C’è un treno alle => ITALIAN
È stato un piacere conoscerla => ITALIAN
tutta la colpa e => ITALIAN
casa illegale in => ITALIAN
amore e gentilezza => ITALIAN


### Extra Credit: Bonus points for testing the model on a sentence of your own that the model predicts incorrectly. For example, input an English sentence that the model predicts as Italian, or vice versa.

In [27]:
# A single English sentence picked as a candidate for misclassification.
bonus_test = [
    "a meeting in scenario club",
]

# Project the sentence through the fitted vectorizer and TF-IDF transformer
# (transform only), then classify it.
X_new_counts_test1 = count_vect.transform(bonus_test)
X_new_tfidf_test1 = tfidf_transformer.transform(X_new_counts_test1)
predicted_test1 = clf.predict(X_new_tfidf_test1)

# Report the sentence next to its predicted language.
for doc, category in zip(bonus_test, predicted_test1):
    print(f"{doc} => {category}")

a meeting in scenario club => ITALIAN
