In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,recall_score,accuracy_score,precision_score
from mlxtend.plotting import plot_confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Embedding,Dense,LSTM,Dropout,GlobalAveragePooling1D,Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.utils import plot_model

In [2]:
from google.colab import drive

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
TRAIN="/content/drive/MyDrive/data/train.csv"
TEST="/content/drive/MyDrive/data/test.csv"

In [5]:
data=pd.read_csv(TRAIN,encoding= 'unicode_escape')
testdata=pd.read_csv(TEST,encoding= 'unicode_escape')
data.shape,testdata.shape

((1078, 3), (310, 3))

In [6]:
data.head()

Unnamed: 0,Class Index,Title,Description
0,4,Baalandoor: Liyooneel Mesiin badhaasa Taphataa...,Taphataan sarara fuulduraa Paariis Seent Jerme...
1,3,"'Seeqaan aangoo dabarsa, hoogganaan koo garuu ...",Pireezidantiin Keeniyaa waggoota sagal darbani...
2,2,Hospitaalli fardaa Bishooftuutti baname tajaaj...,Hospitaalli fardaa Itoophiyaatti kan jalqabaat...
3,4,Shamarreen Iraan dirree kubbaa miilaa seente j...,Deeggartuun kubbaa miilaa Iraan torbee dura ma...
4,3,Hoogganaan mormituu Raashiyaa ''suummeffame'' ...,Namni Puutiiniin mormuun beekaman kun akka dee...


In [7]:
data['Class Index'].value_counts()

3    350
2    338
4    280
1    110
Name: Class Index, dtype: int64

In [8]:
#Combining Title and Description --- >
X_train=data['Title']
y_train=data['Class Index'].apply(lambda x:x-1).values

In [11]:
testdata.head()

Unnamed: 0,Class Index,Title,Description
0,1,Biyoonseen walaloo muuziqaa haaraa ibiddarra i...,Weellistuun beekamtuu Ameerikaa Biyoonseen wal...
1,1,Hoomaan simbiraa xiyyaara oyiruu boqqoolloo ke...,Xiyyaarri imaltootaa Raashiyaa erga girrisa si...
2,1,Raapparii fi taatoo beekamaan DMX ganna 50tti ...,"Raapparii fi taatoo beekamaan US, DMX ganna 50..."
3,1,?Simbirroonni Masqalaa? yeroo hunda nu waliin ...,Simbirroonni Masqalaa ji'oota hedduuf erga bad...
4,1,"Artiist Hawwii Tazarraa: ""Ani Qaammeen kan Oro...",Godina Shawaa Lixaa aanaa Ada'aa Bargaa naanno...


In [10]:
X_test=testdata['Title']
y_test=testdata['Class Index'].apply(lambda x:x-1).values

In [11]:
#Max Lenghth Of Sentences in training data-
maxlen=X_train.map(lambda x:len(x.split())).max()

In [12]:
maxlen

18

In [13]:
vocab_size=12000
embed_size=32
#Creating and Fitting Tokenizers
tokenizer=Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train.values)

#Tokenize Data
X_train=tokenizer.texts_to_sequences(X_train)
X_test=tokenizer.texts_to_sequences(X_test)


#Pad Sequences to the max length
X_train=pad_sequences(X_train,maxlen=maxlen)
X_test=pad_sequences(X_test,maxlen=maxlen)

In [14]:
model=Sequential()
model.add(Embedding(vocab_size,embed_size,input_length=maxlen))
model.add(Bidirectional(LSTM(256,return_sequences=True)))
model.add(Bidirectional(LSTM(128,return_sequences=True)))
model.add(GlobalAveragePooling1D())
model.add(Dense(1024))
model.add(Dropout(0.25))
model.add(Dense(512))
model.add(Dropout(0.25))
model.add(Dense(256))
model.add(Dropout(0.25))
model.add(Dense(128))
model.add(Dropout(0.25))
model.add(Dense(64))
model.add(Dropout(0.25))
model.add(Dense(4,activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18, 32)            384000    
                                                                 
 bidirectional (Bidirectiona  (None, 18, 512)          591872    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 18, 256)          656384    
 nal)                                                            
                                                                 
 global_average_pooling1d (G  (None, 256)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1024)              263168    
                                                        

In [15]:
callbacks=[EarlyStopping(
    monitor='val_accuracy',
    min_delta=1e-4,
    patience=4,
    verbose=1
),
ModelCheckpoint(filepath='weights.h5',
                monitor='val_accuracy',
                mode='max',
                save_best_only=True,
                save_weights_only=True,
                verbose=1)]

In [16]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [30]:
model.fit(X_train,y_train,batch_size=256,validation_data=(X_test,y_test),epochs=25,callbacks=callbacks)

Epoch 1/25
Epoch 1: val_accuracy did not improve from 0.68710
Epoch 2/25
Epoch 2: val_accuracy did not improve from 0.68710
Epoch 3/25
Epoch 3: val_accuracy did not improve from 0.68710
Epoch 4/25
Epoch 4: val_accuracy did not improve from 0.68710
Epoch 5/25
Epoch 5: val_accuracy did not improve from 0.68710
Epoch 6/25
Epoch 6: val_accuracy did not improve from 0.68710
Epoch 7/25
Epoch 7: val_accuracy did not improve from 0.68710
Epoch 8/25
Epoch 8: val_accuracy improved from 0.68710 to 0.69677, saving model to weights.h5
Epoch 9/25
Epoch 9: val_accuracy did not improve from 0.69677
Epoch 10/25
Epoch 10: val_accuracy did not improve from 0.69677
Epoch 11/25
Epoch 11: val_accuracy did not improve from 0.69677
Epoch 12/25
Epoch 12: val_accuracy did not improve from 0.69677
Epoch 12: early stopping


<keras.callbacks.History at 0x7f8260fd8c10>

In [31]:
model.load_weights('weights.h5')
model.save('model.hdf5')

In [32]:
def modelDemo(news_text):

  #News Labels
  labels = ['entertainment', 'health', 'politics', 'sports']

  test_seq = pad_sequences(tokenizer.texts_to_sequences(news_text), maxlen=maxlen)

  test_preds = [labels[np.argmax(i)] for i in model.predict(test_seq)]

  for news, label in zip(news_text, test_preds):
      # print('{} - {}'.format(news, label))
      print('{} - {}'.format(colored(news, 'green'), colored(label, 'red')))

In [33]:
from termcolor import colored

In [36]:
modelDemo(["Itoophiyaatti callaqeen geengoo aduutti marsee nama raaje 'Sun Halo' maali?"])

Itoophiyaatti callaqeen geengoo aduutti marsee nama raaje 'Sun Halo' maali? - health


In [37]:
testdata['Title'][6]

"Holqa 'Mana Waaqaa': Baaleetti holqi guddaan karra 47 qabu argame"

In [38]:
modelDemo(['Holqa Mana Waaqaa: Baaleetti holqi guddaan karra 47 qabu argame'])

Holqa Mana Waaqaa: Baaleetti holqi guddaan karra 47 qabu argame - entertainment


In [40]:
modelDemo(['Waraana Tigiraay: Magaalaa Dabre Taaboritti haleellaa meeshaa guddaan gaggeeffameen miseensa maatii tokkoo keessaa shan ajjeefaman'])

Waraana Tigiraay: Magaalaa Dabre Taaboritti haleellaa meeshaa guddaan gaggeeffameen miseensa maatii tokkoo keessaa shan ajjeefaman - politics


In [41]:
modelDemo(['US meeshaa ammayyaa haleellaa qilleensarraa ittisu Yukireeniif erguuf'])

US meeshaa ammayyaa haleellaa qilleensarraa ittisu Yukireeniif erguuf - politics


In [42]:
modelDemo(['Liivarpuul peenaalitiin 5-4 Cheelsii injifatee Suppar Kaappii fudhate'])

Liivarpuul peenaalitiin 5-4 Cheelsii injifatee Suppar Kaappii fudhate - sports


In [43]:
preds = [np.argmax(i) for i in model.predict(X_test)]



In [44]:
print("Recall of the model is {:.2f}".format(recall_score(y_test, preds, average='micro')))
print("Precision of the model is {:.2f}".format(precision_score(y_test, preds, average='micro')))
print("Accuracy of the model is {:.2f}".format(accuracy_score(y_test, preds)))   

Recall of the model is 0.70
Precision of the model is 0.70
Accuracy of the model is 0.70
