<a href="https://colab.research.google.com/github/yashdusing/100-Days-Of-ML-Code/blob/master/LSTM_and_CNN_fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import random

def load_kagglefakenews():
    #load training data and put into arrays
    df = pd.read_csv('train.csv', encoding='utf8', engine='python', error_bad_lines=False) # be sure to point to wherever you put your file
    train_data = df['text'].values.tolist() #'text' column contains articles
    train_labels = df['label'].values.tolist() #'label' column contains labels

    #Randomly shuffle data and labels together
    combo = list(zip(train_data, train_labels))
    random.shuffle(combo)
    train_data, train_labels = zip(*combo)
    del df #clear up memory

    return np.asarray(train_data).tolist(), np.asarray(train_labels).tolist()

In [27]:
train_data, train_labels = load_kagglefakenews()

Skipping line 12954: unexpected end of data


In [30]:
print(train_labels[:5])

[1, 0, 0, 1, 1]


In [34]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.utils import to_categorical
import pickle

MAX_NB_WORDS=50000 #dictionary size
MAX_SEQUENCE_LENGTH=1500 #max word length of each individual article
EMBEDDING_DIM=300 #dimensionality of the embedding vector (50, 100, 200, 300)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')

def tokenize_trainingdata(texts, labels):
    tokenizer.fit_on_texts(texts)
    pickle.dump(tokenizer, open('Models/tokenizer.p', 'wb'))

    sequences = tokenizer.texts_to_sequences(texts)
    print(sequences[0])

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(labels, num_classes=len(set(labels)))

    return data, labels, word_index

#and run it
X, Y, word_index = tokenize_trainingdata(train_data, train_labels)   

[17574, 24, 6219, 138, 6, 99, 7340, 1670, 1, 16196, 5525, 12195, 5452, 159, 1082, 10580, 28, 5453, 15, 189, 3, 1, 1849, 322, 866, 5, 208, 7, 5454, 75, 736, 2419, 139, 261, 4, 534, 772, 1750, 9, 123, 114, 197, 1849, 66, 4681, 119, 94, 347, 39, 12196, 3073, 2, 123, 114, 9, 510, 18, 10580, 10, 389, 577, 145, 2, 2777, 12, 3450, 37, 54, 6572, 39, 831, 627, 2, 475, 2, 5146, 3055, 4681, 1, 119, 10, 479, 123, 114, 68, 2239, 166, 39, 2567, 897, 568, 4177, 3073, 2, 1, 123, 114, 9, 5, 510, 136, 93, 1026, 10, 1, 911, 10, 491, 610, 66, 3073, 11335, 39, 10, 25, 3451, 485, 297, 39, 13, 5, 22428, 1000, 3, 627, 11, 179, 2, 5, 331, 18, 3073, 150, 38, 911, 380, 1, 93, 1237, 2, 1245, 21, 1, 9568, 1167, 11, 9494, 1, 119, 700, 3073, 590, 5, 331, 22429, 39, 297, 7747, 2239, 39, 8, 5, 644, 1000, 3, 1, 80, 3, 627, 42, 41, 162, 516, 2, 2120, 1, 196, 4453, 4, 821, 6, 69, 924, 1, 721, 1762, 173, 1, 98, 87, 4, 627, 8, 453, 10, 1495, 1630, 1495, 966, 4, 5, 1495, 7035, 11070, 893, 7, 68, 2239, 39, 4, 33, 41, 483, 2,

In [0]:
train_data = X[:int(len(X)*0.9)]
train_labels = Y[:int(len(X)*0.9)]
test_data = X[int(len(X)*0.9):int(len(X)*0.95)]
test_labels = Y[int(len(X)*0.9):int(len(X)*0.95)]
valid_data = X[int(len(X)*0.95):]
valid_labels = Y[int(len(X)*0.95):]

In [47]:
def load_embeddings(word_index, embeddingsfile='wordEmbeddings/glove.6B.%id.txt' %EMBEDDING_DIM):
    embeddings_index = {}
    f = open(embeddingsfile, 'r', encoding='utf8')
    for line in f:
        #here we parse the data from the file
        values = line.split(' ') #split the line by spaces
        word = values[0] #each line starts with the word
        coefs = np.asarray(values[1:], dtype='float32') #the rest of the line is the vector
        embeddings_index[word] = coefs #put into embedding dictionary
    f.close()

    print('Found %s word vectors.' % len(embeddings_index))

    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer
    
#and build the embedding layer
embedding_layer = load_embeddings(word_index)

Found 400000 word vectors.


In [0]:
from keras import Sequential, Model, Input
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Flatten, Dense, GlobalAveragePooling1D, Dropout, LSTM, CuDNNLSTM, RNN, SimpleRNN, Conv2D, GlobalMaxPooling1D
from keras import callbacks

def baseline_model(sequence_input, embedded_sequences, classes=2):
    x = Conv1D(64, 5, activation='relu')(embedded_sequences)
    x = MaxPooling1D(5)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(5)(x)
    x = Conv1D(256, 2, activation='relu')(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(2048, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(classes, activation='softmax')(x)

    model = Model(sequence_input, preds)
    return model

In [50]:
#put embedding layer into input of the model
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

model = baseline_model(sequence_input, embedded_sequences, classes=2)

model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['acc'])

print(model.summary())

model.fit(train_data, train_labels, validation_data=(valid_data, valid_labels), epochs=25, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1500)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1500, 300)         56991900  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1496, 64)          96064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 299, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 297, 128)          24704     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 59, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 58, 256)           65792     
__________

<keras.callbacks.History at 0x7fc84ef9c4e0>

In [51]:
model.evaluate(test_data, test_labels)



[0.18419567021855085, 0.9783950617283951]