# Imports

In [58]:
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split
import random
from utils import *
from classifiers import *
from preprocess import  preprocess

import matplotlib.pyplot as plt


In [59]:
seed = 42
random.seed(seed)
analyzer = 'char'
raw = pd.read_csv('C:/Users/jordi/Documents/GitHub/MUD_Labs_Git/LangDetect/data/dataset.csv')
voc_size = 1

# Split

In [93]:
# list with all the languages (unique)
languages = set(raw['language']) 

# Split Train and Test sets
X=raw['Text']
y=raw['language']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)


In [94]:
x_test_raw_2 = X_test

In [61]:
print(len(X_train), len(X_test))

17600 4400


In [62]:
# raw['language'].value_counts().plot(kind='pie', autopct='%1.0f%%', figsize=(10, 10))

# Procesar words

In [63]:
# Preprocess text (Word granularity only)
if analyzer == 'word':
    X_train, y_train = preprocess(X_train,y_train)
    X_test, y_test = preprocess(X_test,y_test)

# Procesar features

In [64]:
unigramVectorizer = CountVectorizer(analyzer='char',max_features=500,ngram_range=(1,1))
X_unigram_train_raw = unigramVectorizer.fit_transform(X_train)
X_unigram_test_raw = unigramVectorizer.transform(X_test)
unigramFeatures = unigramVectorizer.get_feature_names()
print(len(unigramFeatures))

500




In [40]:
# for i in X_unigram_train_raw.toarray():
#     print(i)

In [65]:
features = unigramFeatures 
X_train_raw = X_unigram_train_raw
X_test_raw = X_unigram_test_raw

In [66]:
# Aggregate Unigrams per language
def train_lang_dict(X_raw_counts, y_train):
    lang_dict = {}
    for i in range(len(y_train)):
        lang = y_train[i]
        v = np.array(X_raw_counts[i])
        if not lang in lang_dict:
            lang_dict[lang] = v
        else:
            lang_dict[lang] += v
            
    # to relative
    for lang in lang_dict:
        v = lang_dict[lang]
        lang_dict[lang] = v / np.sum(v)
        
    return lang_dict

language_dict_unigram = train_lang_dict(X_unigram_train_raw.toarray(), y_train.values)

# Collect relevant chars per language
def getRelevantCharsPerLanguage(features, language_dict, significance=1e-5):
    relevantCharsPerLanguage = {}
    relevantCharsPerLanguage_w_significance = {}

    for lang in languages:
        chars = []
        significance_chars = {}
        relevantCharsPerLanguage[lang] = chars
        relevantCharsPerLanguage_w_significance[lang] = significance_chars
        v = language_dict[lang]
        for i in range(len(v)):
            if v[i] > significance:
                chars.append(features[i])
                significance_chars[features[i]] = v[i]
    return relevantCharsPerLanguage, relevantCharsPerLanguage_w_significance

relevantCharsPerLanguage, relevantCharsPerLanguage_significance = getRelevantCharsPerLanguage(unigramFeatures, language_dict_unigram)

# Print number of unigrams per language
languages_num_ngrams = {}
for lang in languages:    
    languages_num_ngrams[lang] = len(relevantCharsPerLanguage[lang])


# Order a dictionary by value
def sortDictByValue(d):
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1])}

languages_num_ngrams = sortDictByValue(languages_num_ngrams)

languages_num_ngrams
# plot an histogram with the number of unigrams per language
# plt.bar(languages_num_ngrams.keys(), languages_num_ngrams.values(), color='g')
# plt.show()

{'Dutch': 46,
 'English': 47,
 'Spanish': 53,
 'Portugese': 55,
 'French': 55,
 'Indonesian': 64,
 'Swedish': 66,
 'Russian': 72,
 'Estonian': 77,
 'Turkish': 80,
 'Arabic': 83,
 'Romanian': 84,
 'Persian': 86,
 'Latin': 86,
 'Tamil': 88,
 'Hindi': 90,
 'Urdu': 104,
 'Thai': 117,
 'Pushto': 125,
 'Chinese': 185,
 'Japanese': 203,
 'Korean': 217}

In [54]:
# Plots de significancia de los unigramas por idioma
for i in relevantCharsPerLanguage_significance.keys():
    relevantCharsPerLanguage_significance[i] = {k: v for k, v in sorted(relevantCharsPerLanguage_significance[i].items(), key=lambda item: item[1])}


# for i in relevantCharsPerLanguage_significance.keys():
#     # add title
#     plt.xlabel("Unigrams")
#     plt.ylabel("Significance")
#     plt.title(i)
#     plt.plot(relevantCharsPerLanguage_significance[i].keys(), relevantCharsPerLanguage_significance[i].values(), color='b')
#     plt.savefig('C:/Users/jordi/Documents/GitHub/MUD_Labs_Git/LangDetect/figures/'+i+'.jpg')
#     # plt.show()
#     # save figure


In [67]:
print('========')
print('Number of tokens in the vocabulary:', len(features))
print('Coverage: ', compute_coverage(features, X_test.values, analyzer=analyzer))
print('========')

Number of tokens in the vocabulary: 500
Coverage:  0.960360956129256


In [30]:
# Uni- & Bi-Gram Mixture CountVectorizer for top 1% features
from sklearn.feature_extraction.text import CountVectorizer

top1PrecentMixtureVectorizer = CountVectorizer(analyzer='char', ngram_range=(1,1), min_df=1e-2)
X_top1Percent_train_raw = top1PrecentMixtureVectorizer.fit_transform(X_train)
X_top1Percent_test_raw = top1PrecentMixtureVectorizer.transform(X_test)

language_dict_top1Percent = train_lang_dict(X_top1Percent_train_raw.toarray(), y_train.values)

top1PercentFeatures = top1PrecentMixtureVectorizer.get_feature_names()
print('Length of features', len(top1PercentFeatures))
print('')

#Unique features per language
relevantChars_Top1Percent = getRelevantCharsPerLanguage(top1PercentFeatures, language_dict_top1Percent, 1e-5)
for lang in relevantChars_Top1Percent:
    print("{}: {}".format(lang, len(relevantChars_Top1Percent[lang])))

Length of features 3079



TypeError: tuple indices must be integers or slices, not dict

# Classifier

In [68]:
#Apply Classifier  
X_train, X_test = normalizeData(X_train_raw, X_test_raw)
y_predict = applyNaiveBayes(X_train, y_train, X_test)

print('========')
print('Prediction Results:')    
plot_F_Scores(y_test, y_predict)
print('========')

Prediction Results:
F1: 0.9602272727272727 (micro), 0.9622050216794633 (macro), 0.9620524366626872 (weighted)


In [81]:
y_test.iloc[0]

'Japanese'

In [74]:
y_predict[0]

'Japanese'

In [95]:
for i in range(len(y_predict)):
    if y_test.iloc[i] != y_predict[i]:
        print('Predicted: ', y_predict[i], ' Actual: ', y_test.iloc[i], ' Text: ', x_test_raw_2.iloc[i])
        print('')

Predicted:  English  Actual:  Urdu  Text:  below was the list of the types of sports played in the seag from  the bullet mark • indicates that the sport was played in the respective year

Predicted:  English  Actual:  Spanish  Text:  en febrero de  fue traspasado a los milwaukee bucks junto con ish smith y jj redick a cambio de beno udrih doron lamb y tobias harris[]​ en julio de  los milwaukee bucks no hacen efectiva la extensión de su contrato y decide acordar jugar en los atlanta hawks

Predicted:  English  Actual:  Portugese  Text:  stephen midgley peter stevens ben richardson paul gioia  nicholas lander worldwidewattle - webseite über die akazien mit einem schwerpunkt auf die australischen arten

Predicted:  English  Actual:  Spanish  Text:  como actriz ha trabajado para productoras como evil angel girlfriends films wicked brazzers sweetheart video naughty america mile high pure play media forbidden fruits films mile high digital sin o hard x

Predicted:  English  Actual:  Thai  T

In [None]:
plot_Confusion_Matrix(y_test, y_predict, "Greens") 

In [None]:
#Plot PCA
print('========')
print('PCA and Explained Variance:') 
plotPCA(X_train, X_test,y_test, languages) 
print('========')


In [151]:
unigrams_plot = {'150':0.937,
'250': 0.959,
'500': 0.96022,
'1000':0.95522,
'2000':0.94659,
'4000': 0.93409,
'6816': 0.9245}

In [2]:
unigrams_plot_coverage = {'150':0.8560,
'250': 0.9243,
'500': 0.9603,
'1000': 0.98085,
'2000':0.9928,
'4000': 0.9986,
'6816': 0.9997}

In [None]:
# plot an histogram with unigrams_plot with the y axis from 0.8 to 1
plt.bar(unigrams_plot_coverage.keys(), unigrams_plot_coverage.values(), color='b')
plt.ylim(0.8, 1)
plt.show()


In [None]:
# plot an histogram with unigrams_plot with the y axis from 0.8 to 1
plt.bar(unigrams_plot.keys(), unigrams_plot.values(), color='g')
plt.ylim(0.9, 1)
plt.show()
