In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Flatten, Dropout, Bidirectional, Input, GRU
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras import optimizers 
import re
import tensorflow as tf 
from nltk import word_tokenize
from google.colab import drive 
import os
from keras import layers

In [2]:
# Mount Google Drive into the Colab filesystem; the dataset CSV is read from there.
drive.mount('/content/drive')
# Path of the CSV that is loaded into `df` with pd.read_csv.
df_path = "/content/drive/MyDrive/AIM_Task/preprocessed_data.csv"

Mounted at /content/drive


In [3]:
# Load the dataset CSV into a DataFrame.
# NOTE(review): the preview of `df` shows an 'Unnamed: 0' column, which usually
# means the CSV was written together with its index. Consider index_col=0 after
# confirming nothing depends on that column.
df = pd.read_csv(df_path)

In [4]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,dialect,text
0,10,اللي حابين يشاركون بالمسابقة
1,10,هل تعلم أنك تشاهد خشمك طوال الوقت لكن عقلك تبر...
2,11,شو رح يقول الحريري بدي ك قلبو لسمير ججعج بس ارجع
3,13,صح صح اذكر كانو يعبون لنا بترول ببلاش
4,9,اساسا ماعندج القدره على الرد ثانيا شو انجازا...
...,...,...
458192,13,اللي يحط دعايات في اغاني اليهال ف يوتيوب الله ...
458193,17,قرف دى ملبن بالمكسرات
458194,10,حنيتو لشتاء ايه الله يعين اندفنا من الغبار و س...
458195,13,بتنتظر كثير لانهم ماهم بكفو يسونها السعايده ...


In [5]:
# Work on a copy so the frame loaded from disk (`df`) is left untouched.
df_copy = df.copy()

In [6]:
# Inputs are the raw 'text' column; targets are the one-hot encoded 'dialect' labels.
X = df_copy['text'].values
y = pd.get_dummies(df_copy['dialect']).values

# The first 20,000 rows form the held-out test split; every later row is training data.
X_test, X_train = X[:20000], X[20000:]
y_test, y_train = y[:20000], y[20000:]

In [7]:
# Build the vocabulary from the training texts only; '<UNK>' is the
# out-of-vocabulary token handed to the tokenizer.
t = Tokenizer(oov_token='<UNK>')
t.fit_on_texts(X_train)
# Register an explicit '<PAD>' entry at index 0 of the word index.
t.word_index.update({'<PAD>': 0})

In [8]:
# Turn each text into its list of integer word ids with the fitted tokenizer
# (training split first, then the test split).
train_sequences, test_sequences = (
    t.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [9]:
# Length that every tokenized text is padded or truncated to.
MAX_SEQUENCE_LENGTH = 1000

# X_train / X_test are rebound here from raw texts to padded integer matrices.
X_train, X_test = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (train_sequences, test_sequences)
)
(X_train.shape, X_test.shape)

((438197, 1000), (20000, 1000))

In [10]:
# Report tokenizer statistics. The word-index length includes the manually
# added '<PAD>' entry.
print("Vocabulary size={}".format(len(t.word_index)))
print("Number of Documents={}".format(t.document_count))

Vocabulary size=461829
Number of Documents=438197


In [11]:
from sklearn.utils import class_weight

# Per-class weights from scikit-learn's 'balanced' mode, one per unique dialect label.
# NOTE(review): the weights are computed from every row of df_copy, including
# the first 20,000 rows that are held out as the test split.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df_copy['dialect']),
    y=df_copy['dialect'],
)

In [12]:
# Convert the weight array into the {class_index: weight} mapping that is
# passed to model.fit(class_weight=...).
# The isinstance guard keeps this cell idempotent: re-running it on the
# already-converted dict would enumerate the dict's keys and silently turn the
# weights into {0: 0, 1: 1, ...}.
# NOTE(review): enumerate() assumes the i-th weight belongs to the i-th one-hot
# column of `y`; confirm the dialect labels are the integers 0..17.
if not isinstance(class_weights, dict):
    class_weights = dict(enumerate(class_weights))
class_weights

{0: 2.7531244742471217,
 1: 2.56425797208511,
 2: 2.2060307556017755,
 3: 1.76357135159269,
 4: 1.6426010769109434,
 5: 1.5729709503113694,
 6: 1.5672570427834558,
 7: 1.3316273743925973,
 8: 0.968180012509086,
 9: 0.9680327383970524,
 10: 0.9486951732591268,
 11: 0.9217289672625154,
 12: 0.9116933092972633,
 13: 0.8193179339176957,
 14: 0.6974270223537327,
 15: 0.604511835685694,
 16: 0.5819438729113641,
 17: 0.441657798752323}

In [13]:
# Size of each embedding vector (second argument of the Embedding layer).
EMBEDDING_DIM = 100
# NOTE(review): not referenced by any cell visible here; confirm it is still needed.
MAX_NB_WORDS = 10000
# Number of entries in the tokenizer's word index, including the '<PAD>' entry.
VOCAB_SIZE = len(t.word_index)
VOCAB_SIZE

461829

In [14]:
# Stacked-LSTM classifier: embedding -> 2 x LSTM -> flatten -> 2 x dense -> softmax.
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=X_train.shape[1]))

# Both LSTMs return the full sequence so that Flatten() receives every time step.
model.add(LSTM(units=128, return_sequences=True, return_state=False))
model.add(Dropout(0.2))

model.add(LSTM(units=64, return_sequences=True, return_state=False))
model.add(Dropout(0.2))

model.add(Flatten())

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

# One output unit per dialect class (the class-weight mapping has 18 entries).
model.add(Dense(18, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         46182900  
                                                                 
 lstm (LSTM)                 (None, 1000, 128)         117248    
                                                                 
 dropout (Dropout)           (None, 1000, 128)         0         
                                                                 
 lstm_1 (LSTM)               (None, 1000, 64)          49408     
                                                                 
 dropout_1 (Dropout)         (None, 1000, 64)          0         
                                                                 
 flatten (Flatten)           (None, 64000)             0         
                                                                 
 dense (Dense)               (None, 128)               8

In [16]:
batch_size = 128

# Early stopping watches validation accuracy: training halts after 3 epochs
# without an improvement of at least 0.001. restore_best_weights=True rolls the
# model back to its best epoch; without it the model keeps the weights of the
# last (worse) epoch, and that is what would be evaluated and saved afterwards.
# NOTE(review): the held-out test split doubles as the validation set here, so
# early stopping is tuned on the same data later reported as "Testing Accuracy".
history = model.fit(X_train, y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_data=(X_test, y_test),
                    class_weight=class_weights,
                    verbose=1,
                    callbacks=[EarlyStopping(monitor='val_accuracy', mode='max',
                                             patience=3, min_delta=0.001,
                                             restore_best_weights=True)])

In [None]:
def plot_history(history):
    """Plot training and validation curves for accuracy and loss side by side.

    Parameters
    ----------
    history : object
        Any object with a ``history`` dict holding the per-epoch series
        ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``
        (for example the value returned by ``model.fit``).

    Raises
    ------
    KeyError
        If one of the four series is missing from ``history.history``.
    """
    # Look up all four series first so a missing key fails before any drawing.
    series = {
        key: history.history[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }
    epochs = range(1, len(series['accuracy']) + 1)

    # (metric key, legend suffix, y-axis label, title suffix) for each panel.
    panels = [
        ('accuracy', 'acc', 'Accuracy', 'accuracy'),
        ('loss', 'loss', 'Loss', 'loss'),
    ]

    plt.figure(figsize=(12, 5))
    for position, (key, legend_name, axis_label, title_name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, series[key], 'b', label='Training ' + legend_name)
        plt.plot(epochs, series['val_' + key], 'r', label='Validation ' + legend_name)
        plt.title('Training and validation ' + title_name)
        # Label the axes so each panel can be read on its own.
        plt.xlabel('Epoch')
        plt.ylabel(axis_label)
        plt.legend()
    # Keep the axis labels of the two panels from overlapping.
    plt.tight_layout()

In [None]:
# Score the fitted model on the training split and then on the test split,
# printing one accuracy line per split, and finally draw the learning curves.
for report_line, eval_inputs, eval_targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(eval_inputs, eval_targets, verbose=False)
    print(report_line.format(accuracy))
plot_history(history)

In [None]:
# best model is model4
# NOTE(review): pickling a Keras model is not guaranteed to work on every
# Keras/TensorFlow version; model.save() is the documented way to persist one.
# Confirm what the loading side expects before switching formats.
import pickle
filename = 'DL_model.pkl'
# The with-block closes the file handle even if pickling fails; the original
# bare open() call left it open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Persist the fitted tokenizer so inference can reuse the same word index.
filename = 'tokenizer.pkl'
# The with-block closes the file handle even if pickling fails; the original
# bare open() call left it open.
with open(filename, 'wb') as tokenizer_file:
    pickle.dump(t, tokenizer_file)