In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import re
import string

In [5]:
df = pd.read_csv('Language Detection.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10337 entries, 0 to 10336
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      10337 non-null  object
 1   Language  10337 non-null  object
dtypes: object(2)
memory usage: 161.6+ KB


In [6]:
df['Language'].value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [7]:
list_language = df['Language'].unique()

In [8]:
%matplotlib inline
# jumlah stiap jenis hate speech
English = len(df[df['Language'] == 'English'])
Malayalam = len(df[df['Language'] == 'Malayalam'])
Hindi = len(df[df['Language'] == 'Hindi'])
Tamil = len(df[df['Language'] == 'Tamil'])
Portugeese = len(df[df['Language'] == 'Portugeese'])
French = len(df[df['Language'] == 'French'])
Dutch = len(df[df['Language'] == 'Dutch'])
Spanish = len(df[df['Language'] == 'Spanish'])
Greek = len(df[df['Language'] == 'Greek'])
Russian= len(df[df['Language'] == 'Russian'])
Danish = len(df[df['Language'] == 'Danish'])
Italian = len(df[df['Language'] == 'Italian'])
Turkish = len(df[df['Language'] == 'Turkish'])
Sweedish = len(df[df['Language'] == 'Sweedish'])
Arabic = len(df[df['Language'] == 'Arabic'])
German = len(df[df['Language'] == 'German'])
Kannada = len(df[df['Language'] == 'Kannada'])

jenis = ['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada']
count = [English, Malayalam, Hindi, Tamil, Portugeese, French,
       Dutch, Spanish, Greek, Russian, Danish, Italian,
       Turkish, Sweedish, Arabic, German, Kannada]

fig = px.pie(names = jenis,
             values = count, 
             title = 'Jumlah bahasa dalam data sets',
             color_discrete_sequence = px.colors.sequential.Rainbow)
fig.update_traces(textinfo='percent+label')
fig.show()

In [9]:
# splitting
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Language'], test_size=0.2, random_state=266)

X_train_df = pd.DataFrame(X_train)
y_train_df = pd.DataFrame(y_train)

In [10]:
# create dataframe
def frame(input):
    return pd.DataFrame(input)

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class proces_case_folding(BaseEstimator, TransformerMixin):
    
    def __init__(self, create = True):
        self.create = create
        
    def fit(self, X, y=None):
        return self
    
    def process(self, X):
        text = X.lower()
        text = re.sub(f'\d','', text)
        text = text.translate(str.maketrans(" "," ", string.punctuation))
        text = text.strip()
        return text
    
    def transform(self, X):
        X['Case Folding'] = X['Text'].apply(self.process)
        return X['Case Folding']

In [12]:
case_folding = proces_case_folding()
df_Train = case_folding.fit_transform(X_train_df)

In [13]:
# vektorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

# pipeline X
pipeline_X = Pipeline([
    ('case_folding', proces_case_folding()),
    ('cv', CountVectorizer())
])

# pipline y
pipeline_y = Pipeline([
    ('le', LabelEncoder())
])

In [14]:
class ModifiedLabelEncoder(LabelEncoder):

    def fit_transform(self, y, *args, **kwargs):
        return super().fit_transform(y).reshape(-1, 1)

    def transform(self, y, *args, **kwargs):
        return super().transform(y).reshape(-1, 1)
    
    def inverse_transform(self, y):
        return super().inverse_transform(y)

In [15]:
# X
prepared_x = pipeline_X.fit_transform(X_train_df)
print(f'shape Train : {prepared_x.shape}')

#l
#le = LabelEncoder()
encode = ModifiedLabelEncoder()
prepared_y = encode.fit_transform(y_train_df)
print(f'shape y Train : {prepared_y.shape}')

shape Train : (8269, 35075)
shape y Train : (8269, 1)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [16]:
prepared_y.shape

(8269, 1)

## modeling

In [17]:
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [18]:
modelNB = MultinomialNB()
modelNB.fit(prepared_x, prepared_y)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



MultinomialNB()

In [19]:
pred_train = modelNB.predict(prepared_x)
print(f'Classification report\n{classification_report(prepared_y, pred_train)}')

Classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       440
           1       1.00      0.98      0.99       342
           2       1.00      1.00      1.00       429
           3       0.96      1.00      0.98      1095
           4       1.00      1.00      1.00       823
           5       1.00      1.00      1.00       384
           6       1.00      0.99      0.99       297
           7       1.00      0.96      0.98        49
           8       1.00      1.00      1.00       554
           9       1.00      0.97      0.99       284
          10       1.00      0.99      1.00       478
          11       1.00      1.00      1.00       606
          12       1.00      0.99      1.00       550
          13       1.00      0.99      1.00       634
          14       0.99      1.00      1.00       566
          15       1.00      0.99      1.00       364
          16       1.00      0.97      0.98       374

    

In [20]:
def cross_val(model, data, label):
    cross_v = cross_val_score(model, data, label, cv=5, scoring="accuracy")
    
    print('Accurasy - All - Cross Val : ', cross_v)
    print('Accurasy - Mean - Cross Val : ', cross_v.mean())
    print('Accurasy - std - Cross Val : ', cross_v.std())
    
cross_val_logic = cross_val(modelNB, prepared_x, prepared_y)

Accurasy - All - Cross Val :  [0.97218863 0.98125756 0.97521161 0.97762999 0.97943134]
Accurasy - Mean - Cross Val :  0.977143824829137
Accurasy - std - Cross Val :  0.003183684759718408



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



### model inferense

In [21]:
df_test = frame(X_test)
label_test = frame(y_test)

In [22]:
prepared_x_test = pipeline_X.transform(df_test)
prepared_y_test = encode.transform(label_test)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [23]:
# prediction 
y_pred = modelNB.predict(prepared_x_test)
# model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix
ac = accuracy_score(prepared_y_test, y_pred)
cm = confusion_matrix(prepared_y_test, y_pred)

print(f'accuracy : {ac}\nconfusion_matrix : {cm}')

accuracy : 0.980174081237911
confusion_matrix : [[ 95   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0  81   0   2   0   0   0   0   0   0   0   0   0   0   3   0   0]
 [  0   0 112   5   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0 289   1   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   2 189   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   2   0  84   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0  67   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0  14   0   0   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0 143   0   0   0   0   0   0   0   0]
 [  0   0   0   6   0   0   0   0   0  79   0   0   0   0   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0 115   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   1   0   0 131   0   1   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0 142   0   0   0   0]
 [  0   0   0   1   0   0   0

In [24]:
# full pipeline
full_pipeline = Pipeline([
    ('pipe_prepros', pipeline_X),
    ('models', modelNB)
])

full_pipeline.fit(X_train_df, prepared_y)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Pipeline(steps=[('pipe_prepros',
                 Pipeline(steps=[('case_folding', proces_case_folding()),
                                 ('cv', CountVectorizer())])),
                ('models', MultinomialNB())])

In [25]:
label_pipe = Pipeline([
    ('label', encode)
])

label_pipe.fit_transform(y_train_df)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



array([[16],
       [ 3],
       [11],
       ...,
       [16],
       [ 4],
       [ 8]])

In [26]:
# opsi satu fungsi
def all_input(input_str):
    
    list_pred_language = np.array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'])
    
    out = {'Text' : [input_str]}
    out = pd.DataFrame(data = out)
    
    pred_test = full_pipeline.predict(out)
    #print(pred_test)
    
    if pred_test == [3]:
        # model cyber
        print(f'{list_pred_language[pred_test[0]]} Language')
        
    else:
        print(f'sorry this is {list_pred_language[pred_test[0]]}, else Engish Language')

In [27]:
predict_text = input('input text : ')
all_input(predict_text)

English Language


In [65]:
#opsi 2 fungsi menggunakan pipline mixin label

def all_input2(input_str):
    
    out = {'Text' : [input_str]}
    out = pd.DataFrame(data = out)
    
    pred_test = full_pipeline.predict(out)
    print(pred_test)
    lang = encode.inverse_transform(pred_test)
    
    if lang[0] == 'English':
        # model cyber
        print(f'{lang[0]} Language')
        
    else:
        print(f'sorry this is {lang[0]}, else Engish Language')

In [66]:
predict_text = input('input text : ')
all_input2(predict_text)

[3]
English Language


In [68]:
mystring = 'helloworld'
mystring['stringindex']

TypeError: string indices must be integers

In [30]:
# save model
#import joblib
#model_filter = full_pipeline

#joblib.dump(model_filter, "model_filter_language.pkl")

In [31]:
import joblib

In [32]:
pipeline_label = label_pipe

joblib.dump(pipeline_label, "pipeline_label.pkl")

['pipeline_label.pkl']

In [33]:
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf


text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
max_features = 5000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.

# Create the layer.
vectorize_layer = tf.keras.layers.TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

2022-03-07 02:55:34.347417: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-03-07 02:55:34.347449: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (Arsy-Laptop): /proc/driver/nvidia/version does not exist
2022-03-07 02:55:34.348027: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [34]:
text_dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [35]:
vectorize_layer.adapt(text_dataset.batch(64))

In [36]:
vectorize_layer.get_weights()

[array([b'foo', b'baz', b'bar'], dtype=object)]

In [37]:
dir(vectorize_layer)

['_TF_MODULE_IGNORED_PROPERTIES',
 '__abstractmethods__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_activity_regularizer',
 '_adapt_function',
 '_adapt_maybe_build',
 '_add_trackable',
 '_add_trackable_child',
 '_add_variable_with_custom_getter',
 '_auto_track_sub_layers',
 '_autocast',
 '_autographed_call',
 '_batch_input_shape',
 '_build_input_shape',
 '_call_accepts_kwargs',
 '_call_arg_was_passed',
 '_call_fn_arg_defaults',
 '_call_fn_arg_positions',
 '_call_fn_args',
 '_call_full_argspec',
 '_callable_losses',
 '_cast_single_input',
 '_checkpoint_dependencies',
 '_clear_losses',
 '_compute_dtype'

In [56]:
from sklearn.base import BaseEstimator, TransformerMixin
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


class vector_1(BaseEstimator, TransformerMixin):
    
    def __init__(self,X = None):
        self.X = X,
        
        
    def tokenizer(self, X):
        features = 10000
        tokenizer_0 = Tokenizer(num_words = features)
        tokenizer_0.fit_on_texts(X['Text'].tolist())
        return tokenizer_0
    
    def token_trans(self, X):
        token = self.tokenizer(self, X)
        return token.texts_to_sequance(self.text.tolist())

        
    def fit(self, X):
        return self.tokenizer(X)
        
    def transform(self, text):
        trans = self.tokenizer(self.X)
        #print(trans)
        #txt_inp = trans.texts_to_sequences(text)
        #return pad_sequences(txt_inp)

In [57]:
vetor = vector_1()

In [59]:
word = vetor.fit(X_train_df)

In [62]:
len(word.word_index)

41786

In [63]:
vetor.transform(list('hallo i am okey'))

TypeError: tuple indices must be integers or slices, not str