In [1]:
# https://www.kaggle.com/rohan9889/predict-news-category

In [1]:
import pandas as pd
import re
import spacy
import numpy as np

In [2]:
df = pd.read_excel("archive/Data_Train.xlsx")
df

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3
...,...,...
7623,"Karnataka has been a Congress bastion, but it ...",0
7624,"The film, which also features Janhvi Kapoor, w...",2
7625,The database has been created after bringing t...,1
7626,"The state, which has had an uneasy relationshi...",0


In [3]:
df_test = pd.read_excel("archive/Data_Test.xlsx")
df_test

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...
...,...
2743,"According to researchers, fraud in the mobile ..."
2744,The iPhone XS and XS Max share the Apple A12 c...
2745,"On the photography front, the Note 5 Pro featu..."
2746,UDAY mandated that discoms bring the gap betwe...


In [4]:
# Clean
all_stopwords = spacy.load('en_core_web_sm').Defaults.stop_words

In [5]:
def clean(text, stopwords):
    text = text.lower()
    
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")

    text = re.sub('\s+',' ',text)
    
    # Remove stop words
    text = " ".join([word for word in text.split(" ") if word not in stopwords])
    
    # Remove punctuation and all
    text = re.sub(r"[^\w\s]", "", text)
    
    return text

In [6]:
df.STORY = df.STORY.apply(lambda x : clean(x, all_stopwords))
df_test.STORY = df_test.STORY.apply(lambda x : clean(x, all_stopwords))
df

Unnamed: 0,STORY,SECTION
0,painful huge reversal fee income unheard priva...,3
1,formidable opposition alliance congress jharkh...,0
2,asian currencies trading lower today south kor...,3
3,want answer question click answer clicking ans...,1
4,global markets gold prices edged today disappo...,3
...,...,...
7623,karnataka congress bastion gave bjp government...,0
7624,film features janhvi kapoor revolve singing gh...,2
7625,database created bringing criminal records sta...,1
7626,state uneasy relationship mainland days late p...,0


In [7]:
# Lemmatize the the text
# Stemmer - reduces text to non dictionary form
# Lemmatizer - reduces text but to dictionary form

In [8]:
from nltk.stem import WordNetLemmatizer

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    new_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(" ")])
    
    return new_text

In [9]:
df["LEMMATIZED_STORY"] = df.STORY.apply(lambda x : lemmatize(x))
df_test["LEMMATIZED_STORY"] = df_test.STORY.apply(lambda x : lemmatize(x))

df

Unnamed: 0,STORY,SECTION,LEMMATIZED_STORY
0,painful huge reversal fee income unheard priva...,3,painful huge reversal fee income unheard priva...
1,formidable opposition alliance congress jharkh...,0,formidable opposition alliance congress jharkh...
2,asian currencies trading lower today south kor...,3,asian currency trading lower today south korea...
3,want answer question click answer clicking ans...,1,want answer question click answer clicking ans...
4,global markets gold prices edged today disappo...,3,global market gold price edged today disappoin...
...,...,...,...
7623,karnataka congress bastion gave bjp government...,0,karnataka congress bastion gave bjp government...
7624,film features janhvi kapoor revolve singing gh...,2,film feature janhvi kapoor revolve singing gho...
7625,database created bringing criminal records sta...,1,database created bringing criminal record stat...
7626,state uneasy relationship mainland days late p...,0,state uneasy relationship mainland day late pr...


In [10]:
# Convert text to embeddings

In [11]:
from gensim.models import Word2Vec

In [12]:
all_sentences_list = [sentence.split(" ") for sentence in df.LEMMATIZED_STORY]
all_sentences_list.extend([sentence.split(" ") for sentence in df_test.LEMMATIZED_STORY])

In [13]:
len(all_sentences_list)

10376

In [14]:
# Remove any nans
# all_sentences_list = [x for x in all_sentences_list if str(x) != 'nan']

In [15]:
# Create Word2Vec for the same
wtov = Word2Vec(all_sentences_list, min_count=1)

In [16]:
vectors = wtov.wv.vectors

In [17]:
# vectors = vectors.tolist()
# vectors.append(vectors[-1])

In [18]:
wtov.__dict__

{'max_final_vocab': None,
 'callbacks': (),
 'load': <function gensim.utils.call_on_class_only(*args, **kwargs)>,
 'wv': <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f092e077810>,
 'vocabulary': <gensim.models.word2vec.Word2VecVocab at 0x7f092e077850>,
 'trainables': <gensim.models.word2vec.Word2VecTrainables at 0x7f092e0778d0>,
 'sg': 0,
 'alpha': 0.025,
 'window': 5,
 'random': RandomState(MT19937) at 0x7F0930A2B7C0,
 'min_alpha': 0.0001,
 'hs': 0,
 'negative': 5,
 'ns_exponent': 0.75,
 'cbow_mean': 1,
 'compute_loss': False,
 'running_training_loss': 0.0,
 'min_alpha_yet_reached': 0.00011535851966075483,
 'corpus_count': 10376,
 'corpus_total_words': 629198,
 'vector_size': 100,
 'workers': 3,
 'epochs': 5,
 'train_count': 1,
 'total_train_time': 4.273290828015888,
 'batch_words': 10000,
 'model_trimmed_post_training': False}

In [19]:
# Convert all the text in the sentences to tokens for feeding to vectors

In [20]:
from keras_preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence, text

In [21]:
len(list(df_test.LEMMATIZED_STORY))

2748

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_sentences_list)

In [23]:
df["TOKENS"] = tokenizer.texts_to_sequences(df.LEMMATIZED_STORY)
df_test["TOKENS"] = tokenizer.texts_to_sequences(df_test.LEMMATIZED_STORY)

In [24]:
df

Unnamed: 0,STORY,SECTION,LEMMATIZED_STORY,TOKENS
0,painful huge reversal fee income unheard priva...,3,painful huge reversal fee income unheard priva...,"[8023, 705, 6589, 1488, 395, 24136, 488, 459, ..."
1,formidable opposition alliance congress jharkh...,0,formidable opposition alliance congress jharkh...,"[4758, 262, 94, 9, 1954, 10608, 6245, 14205, 1..."
2,asian currencies trading lower today south kor...,3,asian currency trading lower today south korea...,"[1042, 276, 314, 341, 157, 215, 1073, 238, 259..."
3,want answer question click answer clicking ans...,1,want answer question click answer clicking ans...,"[79, 1260, 439, 1808, 1260, 3007, 1260, 1127, ..."
4,global markets gold prices edged today disappo...,3,global market gold price edged today disappoin...,"[107, 15, 343, 26, 5426, 157, 3779, 154, 1758,..."
...,...,...,...,...
7623,karnataka congress bastion gave bjp government...,0,karnataka congress bastion gave bjp government...,"[562, 9, 2397, 814, 10, 21, 1941, 16, 2620, 28..."
7624,film features janhvi kapoor revolve singing gh...,2,film feature janhvi kapoor revolve singing gho...,"[17, 31, 5546, 963, 7166, 4689, 4756, 2042, 23..."
7625,database created bringing criminal records sta...,1,database created bringing criminal record stat...,"[2132, 740, 1231, 1438, 426, 16, 799, 7834, 13..."
7626,state uneasy relationship mainland days late p...,0,state uneasy relationship mainland day late pr...,"[16, 11259, 1216, 9524, 37, 547, 20861, 39, 20..."


In [25]:
train = sequence.pad_sequences(df.TOKENS, maxlen=256)
test = sequence.pad_sequences(df_test.TOKENS, maxlen=256)
# train = df.TOKENS
# test = df_test.TOKENS

train.shape, test.shape

((7628, 256), (2748, 256))

In [26]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

labels = to_categorical(df.SECTION, num_classes=len(set(df.SECTION)))

x_train, x_test, y_train, y_test = train_test_split(train, labels, test_size=0.2)

In [27]:
x_train

array([[   0,    0,    0, ..., 1105, 1702, 2374],
       [   0,    0,    0, ..., 1404, 1437,  305],
       [   0,    0,    0, ..., 5726, 8891,  285],
       ...,
       [   0,    0,    0, ..., 6496,  525, 2963],
       [   0,    0,    0, ..., 1746,  474,  440],
       [   0,    0,    0, ..., 2448, 9196, 2115]], dtype=int32)

In [28]:
# Create a model

In [29]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Flatten

In [32]:
model = Sequential()

vocab = int(vectors.shape[0]) + 1

model.add(Embedding(vocab, vectors.shape[1], weights = [vectors].append([vectors][-1]), input_length = 256))
# model.add(Flatten())

model.add(LSTM(128, dropout=0.2))

model.add(Dense(512, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(256, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(128, activation="relu"))
model.add(Dropout(0.2))

# model.add(Flatten(64))

model.add(Dense(4, activation="softmax"))

In [33]:
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [34]:
model.fit(x_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f08cbd7e810>

In [35]:
model.evaluate(x_test, y_test)



[0.19010424613952637, 0.9560943841934204]

In [37]:
pred_list = model.predict(test)

In [39]:
final_pred = []
for pred_arr in pred_list:
    pred_cat = list(pred_arr).index(max(pred_arr))
    final_pred.append(pred_cat)

final_pred

[1,
 2,
 1,
 0,
 1,
 1,
 1,
 2,
 1,
 2,
 0,
 3,
 2,
 1,
 2,
 1,
 3,
 2,
 3,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 3,
 3,
 0,
 1,
 3,
 2,
 0,
 2,
 2,
 2,
 2,
 0,
 1,
 0,
 1,
 3,
 2,
 2,
 1,
 1,
 1,
 0,
 1,
 3,
 2,
 1,
 2,
 2,
 0,
 1,
 1,
 0,
 1,
 1,
 2,
 3,
 2,
 1,
 1,
 2,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 2,
 2,
 3,
 0,
 3,
 0,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 0,
 2,
 2,
 2,
 1,
 0,
 1,
 3,
 1,
 1,
 2,
 0,
 2,
 3,
 1,
 1,
 2,
 2,
 1,
 1,
 3,
 3,
 2,
 1,
 1,
 3,
 2,
 2,
 0,
 3,
 3,
 3,
 2,
 1,
 1,
 2,
 1,
 0,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 2,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 0,
 0,
 0,
 2,
 0,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 3,
 1,
 2,
 1,
 1,
 3,
 1,
 2,
 2,
 1,
 3,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 3,
 2,
 0,
 2,
 2,
 3,
 1,
 1,
 2,
 1,
 1,
 2,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 3,
 1,
 2,
 2,
 0,
 2,
 1,
 2,
 2,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 3,


In [41]:
new_df = pd.DataFrame(final_pred, columns=["SECTION"])

In [42]:
new_df.to_excel("")

Unnamed: 0,SECTION
0,1
1,2
2,1
3,0
4,1
...,...
2743,1
2744,1
2745,1
2746,0


In [43]:
ss = pd.read_excel("archive/Sample_submission.xlsx")
ss

Unnamed: 0,SECTION
0,3
1,3
2,3
3,3
4,3
...,...
2743,2
2744,2
2745,2
2746,2
