In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import warnings
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('english-train.csv', encoding="utf-8")

In [3]:
def count_length():
    df['word_count'] = df['headline'].apply(lambda x: len(str(x).split(" ")))

In [4]:
count_length()

In [5]:
df.head(10)

Unnamed: 0,headline,Label,word_count
0,Former New Zealand cricketer Chris Cairns diag...,Sports,10
1,American skater Nathan Chen dazzles in his Oly...,Sports,9
2,La Liga: Enes Unal scores brace to lead Getafe...,Sports,13
3,U-19 World Cup: Australia beat Afghanistan to ...,Sports,10
4,ICC U-19 World Cup India vs England final Live...,Sports,15
5,Brainsqueeze: Know your Snow,Sports,4
6,Justin Langer steps down as Australia coach,Sports,7
7,From Ratnakar Shetty’s memoirs: Apparently Vir...,Sports,16
8,I think ODI revolution happened with 1996 Worl...,Sports,15
9,U-19 World Cup: Boxer’s son Nishant Sindhu wit...,Sports,11


In [6]:
df['label_id'] = df.Label.factorize()[0]

def preprocess_text(sen,flg_lemm=True, lst_stopwords=stopwords.words('english')):
    # Lowercase
    sentence = sen.lower()
    
    # Remove punctuations and numbers
    sentence = re.sub('[0-9]', ' ', sentence)
    sentence = re.sub(r'[^\w\s]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\b[a-zA-Z]\b", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
    lst_text = sentence.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)

    return text
 

In [7]:
df['headline'] = df.headline.apply(lambda x: preprocess_text(x))
count_length()

In [8]:
df.head(10)

Unnamed: 0,headline,Label,word_count,label_id
0,former new zealand cricketer chris cairn diagn...,Sports,9,0
1,american skater nathan chen dazzle olympic return,Sports,7,0
2,la liga ene unal score brace lead getafe win l...,Sports,10,0
3,world cup australia beat afghanistan claim rd ...,Sports,8,0
4,icc world cup india v england final live strea...,Sports,10,0
5,brainsqueeze know snow,Sports,3,0
6,justin langer step australia coach,Sports,5,0
7,ratnakar shetty memoir apparently virat unhapp...,Sports,9,0
8,think odi revolution happened world cup tendul...,Sports,10,0
9,world cup boxer son nishant sindhu deadly left,Sports,8,0


In [9]:
# set parameters:
max_features = 75000
maxlen = 24
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 5

In [10]:
X = df['headline'].values
y = df['label_id']
y = to_categorical(y, num_classes=5)
y.shape

(37395, 5)

In [11]:
print('Loading data...')
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
#x_train = tokenizer.texts_to_matrix(x_train, mode='tfidf')
x_test  = tokenizer.texts_to_sequences(x_test)
#x_test = tokenizer.texts_to_matrix(x_test, mode='tfidf')

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test  =  pad_sequences(x_test, padding='post', maxlen=maxlen)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')

x_train = pad_sequences(x_train, maxlen=maxlen)
x_test  =  pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
29916 train sequences
7479 test sequences
Pad sequences (samples x time)
x_train shape: (29916, 24)
x_test shape: (7479, 24)


In [12]:
test_classes = []
for i in range(len(y_test)):
    for j in range(len(y_test[i])):
        if y_test[i][j] == 1:
            if j==0:
                test_classes.append(0)
            if j==1:
                test_classes.append(1)
            if j==2:
                test_classes.append(2)
            if j==3:
                test_classes.append(3)
            if j==4:
                test_classes.append(4)

In [13]:
import json
tokenizer_json = tokenizer.to_json()
with open('eng_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=True))

In [14]:
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(5))
model.add(Activation('sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

Build model...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 24, 50)            3750000   
                                                                 
 dropout (Dropout)           (None, 24, 50)            0         
                                                                 
 conv1d (Conv1D)             (None, 22, 250)           37750     
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 250)               62750     
                                                                 
 dropout_1 (Dropout)         (None, 250)               0         
                                         

In [15]:
history = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
loss, Accuracy = model.evaluate(x_test, y_test)

print("Test Loss:", loss)
print("Test Accuracy:", round(Accuracy*100))

Test Loss: 0.3025665581226349
Test Accuracy: 94


In [18]:
model.save('Eng_CNN.h5')
model.save_weights('Eng_CNN_weights.h5')

In [19]:
predicted_categories = []
preds = model.predict(x_test)
for i in preds:
    pred_classes = np.argsort(i)[-1:][::-1]
    predicted_categories.append(pred_classes[0])

In [20]:
report = classification_report(test_classes, predicted_categories)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1473
           1       0.90      0.93      0.92      1423
           2       0.93      0.92      0.92      1548
           3       0.97      0.94      0.95      1570
           4       0.95      0.95      0.95      1465

    accuracy                           0.94      7479
   macro avg       0.94      0.94      0.94      7479
weighted avg       0.94      0.94      0.94      7479



In [21]:
mat = confusion_matrix(predicted_categories, test_classes)
mat

array([[1392,   18,   15,   27,   28],
       [  25, 1327,   81,   12,   32],
       [   8,   53, 1423,   42,    7],
       [  11,    8,   24, 1471,    9],
       [  37,   17,    5,   18, 1389]], dtype=int64)

In [22]:
import json
from keras_preprocessing.text import tokenizer_from_json

def load():
    model = load_model('Eng_CNN.h5')
    maxlen = 400
    id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

    with open('eng_tokenizer.json') as f:
        data = json.load(f)
        tokenizer = tokenizer_from_json(data)
load()

In [28]:
# Here's how to generate a prediction on individual examples
#text_labels = encoder.classes_ 
id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

my_input =["Strandja Memorial Boxing: Nandini ends with bronze after semifinal loss"]

input_sequences = tokenizer.texts_to_sequences(my_input)
input_pad = pad_sequences(input_sequences, padding='post', maxlen=maxlen)

preds = model.predict(input_pad)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result[c] = round(p*100,2)
result

{'sports': 100.0}

In [29]:
# Here's how to generate a prediction on individual examples
#text_labels = encoder.classes_ 
id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

my_input2 =["Big hit on India: ‘Higher oil, food prices; duty rollback may help"]

input_sequences = tokenizer.texts_to_sequences(my_input2)
input_pad = pad_sequences(input_sequences, padding='post', maxlen=maxlen)

preds = model.predict(input_pad)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result2 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result2[c] = round(p*100,2)
result2

{'business': 100.0}

In [30]:
# Here's how to generate a prediction on individual examples
#text_labels = encoder.classes_ 
id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

my_input3 =["Anushka Sharma dresses daughter Vamika as a fairy for Halloween, Soha Ali Khan turns Inaaya into a unicorn"]

input_sequences = tokenizer.texts_to_sequences(my_input3)
input_pad = pad_sequences(input_sequences, padding='post', maxlen=maxlen)

preds = model.predict(input_pad)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result3 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result3[c] = round(p*100,2)
result3

{'entertainment': 100.0}

In [31]:
# Here's how to generate a prediction on individual examples
#text_labels = encoder.classes_ 
id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

my_input4 =["Apple is expected to launch the iPhone SE 3 later this year."]

input_sequences = tokenizer.texts_to_sequences(my_input4)
input_pad = pad_sequences(input_sequences, padding='post', maxlen=maxlen)

preds = model.predict(input_pad)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result4 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result4[c] = round(p*100,2)
result4

{'tech': 100.0}

In [32]:
# Here's how to generate a prediction on individual examples
#text_labels = encoder.classes_ 
id_to_category = {0:'sports', 1:'tech', 2:'business', 3:'politics',
                  4:'entertainment'}

my_input5 =["Jammu and Kashmir Delimitation Commission Rejects Some Submissions by BJP, NC Members"]
input_sequences = tokenizer.texts_to_sequences(my_input5)
input_pad = pad_sequences(input_sequences, padding='post', maxlen=maxlen)

preds = model.predict(input_pad)[0]

pred_classes = np.argsort(preds)[-1:][::-1]

classes = [id_to_category[i] for i in pred_classes]
props   = preds[pred_classes]

result5 = {}
for c, p in zip(classes, props):
    #result.append("{} {:.2f} %".format(c,p*100))
    result5[c] = round(p*100,2)
result5

{'politics': 100.0}