In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
from random import shuffle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
import pickle

# model
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score
# import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, confusion_matrix


#### main functions 

In [3]:
# remove some special characters
def remove_special_chars(sen, filter_chars):
    sen = sen.strip()
    sen = sen.lower()
    for each in sen:
        num_ascii = ord(each)
        # delete number, ".", "\", all chars in filter_chars
        if (num_ascii > 47 and num_ascii < 58) or num_ascii == 92 or num_ascii == 46 or (each in filter_chars):
            sen = sen.replace(each, "")
    return sen

In [4]:
# read file csv and convert it to pandasframe
def open_file(name):
    with open('database/{file_name}.csv'.format(file_name = "formatted_data")) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]

    data = []
    for num, each in enumerate(content):
        each = each.split(";")

        if "." in each[1]:
            sentences = each[1].split(".") 
            filter_chars = ['\t', '!', '"', '%', '&', '*', '+', ',', '-', '/', ':', '=', '?', '@', '[', ']', '§', 
                            '«', "”", "\\", ".", '»']
                    
            for number, sen in enumerate(sentences):
                """
                insert remove special characters

                """

                # filter no meaning words
                sen = remove_special_chars(sen, filter_chars)

                # make sure a sentence have len(sentence) > 0
                if len(sen)>0:
                    data.append([each[0], sen, each[2]])

        else:
            data.append(each)

    main_data = data[1:]
    main_data = shuffle(main_data)
    df = pd.DataFrame(main_data, columns = data[0])
    return df

In [5]:
df = open_file('formatted_data')

In [6]:
df.shape

(78160, 3)

In [7]:
df.head(10)

Unnamed: 0,language,text,length_text
0,et,istungi algusse abre la sesión a las,324119
1,cs,zasedání skončilo v schválení zápisu z předcho...,317927
2,da,jeg glemmer bestemt ikke den økonomiske side a...,678400
3,pt,se não fixarmos uma data específica para aplic...,730576
4,en,firstly let me make a statement of fact the fr...,690268
5,pl,horaszatwierdzenie protokołu z poprzedniego p...,317026
6,es,las detenciones las torturas las violaciones d...,733658
7,en,we are also currently examining how we can mos...,690268
8,en,we cannot continue with a situation where inco...,690268
9,el,επίτροπο να εργασθεί και στα οποία του ζητώ να...,523277


In [8]:
# get data in a row
def get_data(df = df, row = 60000):
    return  df.iloc[row][0], df.iloc[row][1]

label, text = get_data(row = 700)
print(label)
print(text)

da
effekten af projekter der er gennemført ved hjælp af unionsstøtte svækkes ofte ved en langsom beslutningstagning og kompliceret forvaltning


#### label encoding

In [9]:
# vectorize sentences and split it in to train and test file
def vectorization(df, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(list(df["text"]), list(df["language"]), test_size=test_size, random_state=42)

    # vectorize sentence X
    count_vectorizer = CountVectorizer(analyzer='char')
    X_train_features = count_vectorizer.fit_transform(X_train)
    X_test_features = count_vectorizer.transform(X_test)

    # vectorize label Y
    label_encoder = preprocessing.LabelEncoder()
    y_train_features = label_encoder.fit_transform(y_train)
    y_test_features = label_encoder.transform(y_test)
    
    # getted features
    features = count_vectorizer.get_feature_names()
    
    # getted labels
    labels = list(label_encoder.classes_)
    
    return X_train_features, y_train_features, X_test_features, y_test_features, features, labels, count_vectorizer

X_train_features, y_train_features, X_test_features, y_test_features, features, labels, count_vectorizer = vectorization(df)

In [10]:
# number of the features
print("features: ", features)
print("\nLen features: ", len(features))

features:  [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '\xad', '·', 'º', '¿', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ā', 'ă', 'ą', 'ć', 'č', 'ď', 'ē', 'ė', 'ę', 'ě', 'ģ', 'ī', 'į', 'ķ', 'ĺ', 'ļ', 'ľ', 'ł', 'ń', 'ņ', 'ň', 'ő', 'ŕ', 'ř', 'ś', 'ş', 'š', 'ţ', 'ť', 'ū', 'ů', 'ű', 'ų', 'ź', 'ż', 'ž', 'ș', 'ț', 'ΐ', 'ά', 'έ', 'ή', 'ί', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ϊ', 'ϋ', 'ό', 'ύ', 'ώ', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ь', 'ю', 'я', 'і', 'љ', 'ћ', '№']

Len features:  168


In [11]:
X_train_features.toarray()[0]

array([9, 8, 1, 4, 2, 9, 0, 3, 2, 7, 1, 5, 1, 2, 3, 8, 3, 0, 3, 6, 3, 0,
       0, 7, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

### build model

#### model MultinomialNB

In [11]:
# create and training model MultinomialNB
modelNB = MultinomialNB()
modelNB.fit(X_train_features, y_train_features)

# model accuracy, how often is the classifier correct?
print("Accuracy: ",f1_score(y_test_features, modelNB.predict(X_test_features), average='macro'))

Accuracy:  0.9273751238636891


In [12]:
# save the classifier
with open('models/modelNB.pkl', 'wb') as fid:
    pickle.dump(modelNB, fid) 
    
# load it again
with open('models/modelNB.pkl', 'rb') as fid:
    modelNB = pickle.load(fid)

In [13]:
# predict from saved model
modelNB.predict(X_test_features)

array([ 3,  9,  3, ...,  5, 12, 14])

#### model Random Forest

In [14]:
# model Random Forest

modelRF=RandomForestClassifier(n_estimators=100)
modelRF.fit(X_train_features,y_train_features)

y_pred = modelRF.predict(X_test_features)

# model accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test_features, y_pred))

Accuracy: 0.9414662231320369


In [15]:
# save the classifier
with open('models/modelRF.pkl', 'wb') as fid:
    pickle.dump(modelRF, fid) 
    
# load it again
with open('models/modelRF.pkl', 'rb') as fid:
    modelRF = pickle.load(fid)

In [16]:
# predict from saved model
modelRF.predict(X_test_features)

array([ 0,  9,  3, ...,  5, 12, 14])

###### optimize parametters for random forest model, accuracy can be better

In [17]:
# tuning model by grid search cv
from sklearn.model_selection import GridSearchCV

param_grid = { 
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=RandomForestClassifier(), n_jobs = -1, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train_features, y_train_features)

# choose the best parametters 
print("Optimal paras: ", CV_rfc.best_params_)

Optimal paras:  {'max_features': 'log2', 'n_estimators': 300}


In [20]:
# Create random forest model with the optimal parametter
optimal_modelRF=RandomForestClassifier(n_estimators=300, max_features= 'log2')

#Train the model using the training sets 
optimal_modelRF.fit(X_train_features,y_train_features)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
# find the accuracy
y_pred = optimal_modelRF.predict(X_test_features)

# model accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test_features, y_pred))

Accuracy: 0.9465839303991812


In [23]:
# save the classifier
with open('models/optimal_modelRF.pkl', 'wb') as fid:
    pickle.dump(clf, fid) 
    
# load it again
with open('models/optimal_modelRF.pkl', 'rb') as fid:
    optimal_modelRF = pickle.load(fid)

#### model support vector machine - SVM

In [24]:
model_SVM = SVC(kernel='linear')
model_SVM.fit(X_train_features, y_train_features)

y_pred = model_SVM.predict(X_test_features)

print(confusion_matrix(y_test_features,y_pred))
print(classification_report(y_test_features,y_pred))

[[ 665    3    0    2    0    0    1    0    0    1    0    0    0    1
     0    0    0   13    0    0    0]
 [   7  478    1    3    0    2    7    0    0    1    1    0    2    1
     1    0    0    1   16    3    0]
 [   0    0 1038    0    0    2    0    0    0    1    0    0    0    4
     7    0    0    0    0    2    8]
 [   2    1   16  890    0    8    2    1    0    0    1    0    4    7
    12    1    0    2    3    1    1]
 [   2    0    1    1  590    0    0    0    0    0    0    0    2    0
     0    0    0    1    0    0    0]
 [   1    1   10   10    0  862    4    0    0    2    1    3    5    4
     7    0    0    1    3    0    0]
 [   1    1    2    0    0   11  878    0    0   12    1    9    5    3
     0    1   25    0    6    0    0]
 [   6    2    4    4    0    0    3  481   10    2    0    0    5    2
     1    0    0    5    0    1    4]
 [   0    0    1    1    0    0    0   14  867    0    0    0    1    0
     0    0    0    0    2    1    0]
 [   2    

#### find the best model

In [12]:
X, Y = list(df["text"]), list(df["language"])

# vectorize sentence X
vectorizer = CountVectorizer(analyzer='char')
X_features = count_vectorizer.fit_transform(X)

# vectorize label Y
label_encoder = preprocessing.LabelEncoder()
Y_features = label_encoder.fit_transform(Y)

###### To save time, We will find the best model from just 4 classifying algorithms RF, SVC, GaussianNB, LogisticRegression

In [13]:
models = [
    RandomForestClassifier(n_estimators=200),
    SVC(),
    GaussianNB(),
    LogisticRegression(),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

In [14]:
X_features.toarray()

array([[ 7,  5,  1, ...,  0,  0,  0],
       [62, 20,  1, ...,  0,  0,  0],
       [20,  5,  2, ...,  0,  0,  0],
       ...,
       [35, 20,  3, ...,  0,  0,  0],
       [32, 17,  1, ...,  0,  0,  0],
       [35, 20,  1, ...,  0,  0,  0]], dtype=int64)

In [15]:
Y_features

array([ 7,  1,  2, ...,  6,  6, 16])

In [16]:
len(X_features.toarray()[0])

169

In [17]:
len(X_features.toarray())

78160

In [18]:
len(Y_features)

78160

In [20]:
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, 
                                 X_features.toarray(), 
                                 Y_features, 
                                 scoring='accuracy', 
                                 cv=CV, 
                                 n_jobs=-1                                
                                 )
    print(accuracies)    
    entries.append([model_name, sum(accuracies)/len(accuracies)])

cv_df = pd.DataFrame(entries, columns=['model_name', 'accuracy'])

[0.94079284 0.9434638  0.9470957  0.9484226  0.937408  ]
[0.91432225 0.91385265 0.91613357 0.92116209 0.913344  ]
[0.81675192 0.82693784 0.82970829 0.81928713 0.81888   ]
[0.93842711 0.93789972 0.9392912  0.94176745 0.930752  ]


In [21]:
cv_df

Unnamed: 0,model_name,accuracy
0,RandomForestClassifier,0.943437
1,SVC,0.915763
2,GaussianNB,0.822313
3,LogisticRegression,0.937627


In [26]:
# testing
def test(test):
    test = [test]
    test_feature = count_vectorizer.transform(test)
    test_feature.shape
    
    global labels
    for num, each in enumerate(labels):
        if num == modelRF.predict(test_feature)[0]:
            return each

test('hello, my name is Giang, I come from Vietnam')

'en'