In [95]:
from __future__ import print_function

import os
import sys
import numpy as np

import pickle

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras import layers, Sequential
from keras.utils import to_categorical
from keras.initializers import Constant

import pandas as pd

In [2]:
# --- Text / model hyper-parameters ---------------------------------------
MAX_NUM_WORDS = 1000        # vocabulary cap passed to the Tokenizer / embedding matrix
MAX_SEQUENCE_LENGTH = 200   # every document is padded/truncated to this many token ids
EMBEDDING_DIM = 100         # width of one embedding row (the GloVe file loaded is the 100d one)
VALIDATION_SPLIT = 0.2      # NOTE(review): not referenced by any cell visible here

# --- Filesystem layout -----------------------------------------------------
# An empty base directory means "relative to the notebook's working directory".
GLOVE_DIR = BASE_DIR = ''
# NOTE(review): not referenced by any cell visible here (data comes from dataset.csv).
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')

In [3]:
print('Indexing word vectors.')

# Build a {token: vector} lookup from the pre-trained GloVe text file.
embeddings_index = {}
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can fail (or silently mis-decode) on the
# non-ASCII tokens in the vocabulary, e.g. under cp1252 on Windows.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ..."; split off the token only once
        # and let NumPy parse the remaining space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [50]:
# Load the scraped articles; the path is relative to the working directory.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

In [51]:
# Peek at the first five rows to check the columns that were loaded.
dataset.head(n=5)

Unnamed: 0,title,summary,text,author,link,published,category,id
0,Watching My Cousin’s Death Go Viral,My cousin Salahuddin was a victim of unforgiva...,The date in the grainy video footage says “Jul...,Bilal Anwar,https://www.buzzfeednews.com/article/bilalanwa...,"Mon, 06 Jan 2020 18:56:35 -0500",health,0.0
1,17 Real Weight Loss Tips From People Who Lost ...,"Real tips, real people, and real results that'...",Submissions have been edited for length and cl...,Spencer Althouse,https://www.buzzfeed.com/spenceralthouse/weigh...,"Tue, 07 Jan 2020 05:25:37 -0500",health,1.0
2,I Reviewed Popular Blenders On The Market Toda...,"I compared Oster, Ninja, and Vitamix.",The Vitamix blender was given to BuzzFeed for ...,Krista Torres,https://www.buzzfeed.com/kristatorres/i-tested...,"Fri, 03 Jan 2020 11:25:46 -0500",health,2.0
3,Disinformation For Hire: How A New Breed Of PR...,One firm promised to “use every tool and take ...,This story was reported in partnership with th...,Craig Silverman,https://www.buzzfeednews.com/article/craigsilv...,"Tue, 07 Jan 2020 11:25:45 -0500",politics,2.0
4,Disinformation For Hire: How A New Breed Of PR...,One firm promised to “use every tool and take ...,This story was reported in partnership with th...,Craig Silverman,https://www.buzzfeednews.com/article/craigsilv...,"Tue, 07 Jan 2020 11:25:45 -0500",tech,2.0


In [52]:
# Keep a copy of the frame with all of its columns before the next cell drops most of them.
dataset.to_csv(path_or_buf='dataset_full.txt')

In [54]:
# Only `text` (model input) and `category` (target) are used from here on.
# Note: this rebinds `dataset`, so re-running the cell without reloading the
# CSV raises KeyError because the columns are already gone.
dataset = dataset.drop(
    columns=['title', 'summary', 'author', 'link', 'published', 'id'],
)

In [152]:
# Confirm that only the text and category columns remain.
dataset.head(n=5)

Unnamed: 0,text,category
0,The date in the grainy video footage says “Jul...,health
1,Submissions have been edited for length and cl...,health
2,The Vitamix blender was given to BuzzFeed for ...,health
3,This story was reported in partnership with th...,politics
4,This story was reported in partnership with th...,tech


In [126]:
# Raw documents and their class names as NumPy string arrays.
# astype('str') also turns missing values into the literal string 'nan'.
texts = dataset['text'].values.astype('str')
labels = dataset['category'].values.astype('str')

# Sorted list of distinct class names; its length sizes the softmax layer.
# Derive it from the string-cast `labels` (not the raw column) so it always
# matches what the LabelEncoder sees: a missing category would otherwise be a
# class ('nan') in `labels` but make np.unique fail on mixed str/float values.
labels_index = np.unique(labels).tolist()

In [127]:
# Fit the vocabulary on the whole corpus. `num_words` limits which token ids
# are emitted when texts are converted to sequences; `word_index` itself still
# lists every token seen (hence the count printed below exceeds MAX_NUM_WORDS).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

print('Found %s unique tokens.' % (len(word_index),))

Found 27310 unique tokens.


In [128]:
# Persist the fitted tokenizer so the same text -> id mapping can be reloaded later.
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [129]:
# Fixed-length integer matrix: one row of MAX_SEQUENCE_LENGTH token ids per document.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Class names -> integer codes -> one-hot rows.
# Note: `labels` is rebound from strings to a one-hot matrix, so this cell
# cannot be re-run without first re-running the cell that creates `labels`.
le = LabelEncoder()
labels = to_categorical(le.fit_transform(labels))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (1250, 200)
Shape of label tensor: (1250, 5)


In [130]:
# First hold out 20% of all samples as the test set ...
(X_train, X_test,
 y_train, y_test) = train_test_split(data, labels, test_size=0.2, random_state=42)

# ... then carve 20% of the remaining training data off as the validation set.
# The fixed random_state keeps both splits reproducible.
(X_train, X_val,
 y_train, y_val) = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## Creating Embedding Matrix

In [116]:
# Row i of the matrix holds the GloVe vector of the token whose id is i.
# Row 0 and tokens without a GloVe entry stay all-zero.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for token, token_id in word_index.items():
    # Ids at or above the vocabulary cap have no row in the matrix.
    if token_id < MAX_NUM_WORDS:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[token_id] = vector

In [154]:
# Persist the assembled embedding matrix next to the tokenizer pickle.
with open('embeddings.pickle', mode='wb') as embeddings_file:
    pickle.dump(embedding_matrix, embeddings_file)

## Scikit-learn's SVC

In [104]:
X = []
for row in X_train:
    arr = []
    for foo in row:
        arr.extend(embedding_matrix[foo])
    X.append(arr)
X_train = np.array(X)

In [105]:
X = []
for row in X_test:
    arr = []
    for foo in row:
        arr.extend(embedding_matrix[foo])
    X.append(arr)
X_test = np.array(X)

In [106]:
X_test.shape

(250, 20000)

In [108]:
X_train.shape

(800, 20000)

In [117]:
model = SVC()

In [118]:
model.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [119]:
y_pred = model.predict(X_test)

In [120]:
accuracy_score(y_test, y_pred)

0.192

## Keras CNN (TensorFlow)

In [145]:
# Frozen lookup layer initialised from the pre-computed embedding matrix.
embedding_layer = layers.Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    input_shape=(MAX_SEQUENCE_LENGTH,),
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# 1-D CNN: three conv stages (the first two followed by max-pooling), a global
# max-pool over the remaining time steps, then a dense classifier head.
model = Sequential()
model.add(embedding_layer)
model.add(layers.Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=5))
model.add(layers.Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=5))
model.add(layers.Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(units=128, activation='relu'))
# One softmax output per distinct category.
model.add(layers.Dense(units=len(labels_index), activation='softmax'))

In [146]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [150]:
# Train for 10 epochs, reporting loss/accuracy on the validation split after each one.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1,
)

Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1a3f4f2e50>

In [151]:
# evaluate() returns the loss followed by the single compiled metric ('acc').
test_metrics = model.evaluate(X_test, y_test)
loss, acc = test_metrics
print('Accuracy is', acc)

Accuracy is 0.6600000262260437


In [155]:
# Write the trained network to a single HDF5 file in the working directory.
model.save(filepath='model.h5')