In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [2]:
tf.__version__

'2.0.0'

In [3]:
train_df = pd.read_csv("news_train.csv")#here we have the dataset we extracted
train_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
Y = train_df['label']

In [5]:
group1 = pd.DataFrame(train_df.groupby(['author']).size()).reset_index()
group1.head()

Unnamed: 0,author,0
0,# 1 NWO Hatr,17
1,-NO AUTHOR-,54
2,10 Habits That Will Make Your Life Easier &amp...,1
3,10 More Beautiful Images That Remind You We St...,1
4,10 Movies That Could Change Your Understanding...,1


In [6]:
group2 = pd.DataFrame(train_df.groupby(['author'])['label'].sum()).reset_index()
group2.head()

Unnamed: 0,author,label
0,# 1 NWO Hatr,17
1,-NO AUTHOR-,54
2,10 Habits That Will Make Your Life Easier &amp...,1
3,10 More Beautiful Images That Remind You We St...,1
4,10 Movies That Could Change Your Understanding...,1


In [7]:
group1.columns = ['author','count']
group1.sort_values(by=['count'], ascending=False).head()

Unnamed: 0,author,count
2944,Pam Key,243
3929,admin,193
1762,Jerome Hudson,166
724,Charlie Spiering,141
1857,John Hayward,140


In [8]:
group1[group1['author']=='Starkman']

Unnamed: 0,author,count
3518,Starkman,84


In [9]:
group2.sort_values(by=['label'], ascending=False).head()

Unnamed: 0,author,label
3929,admin,193
2939,Pakalert,86
1111,Eddy Lavine,85
3518,Starkman,84
1376,Gillian,82


In [10]:
merge_groups = pd.merge(group1,group2, on='author')
merge_groups.head()

Unnamed: 0,author,count,label
0,# 1 NWO Hatr,17,17
1,-NO AUTHOR-,54,54
2,10 Habits That Will Make Your Life Easier &amp...,1,1
3,10 More Beautiful Images That Remind You We St...,1,1
4,10 Movies That Could Change Your Understanding...,1,1


In [11]:
merge_groups['prob_fake'] = merge_groups['label']/merge_groups['count']
merge_groups['prob_fake'].describe()

count    4201.000000
mean        0.470771
std         0.499016
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: prob_fake, dtype: float64

In [12]:
merge_groups[merge_groups['author']=='Pam Key']['prob_fake']

2944    0.004115
Name: prob_fake, dtype: float64

In [13]:
train_df['title_lower'] = train_df["title"].str.lower()
train_df['title_no_punctuation'] = train_df['title_lower'].str.replace('[^\w\s]','')
train_df['title_no_punctuation'] = train_df["title_no_punctuation"].fillna("fillna")

train_df['text_lower'] = train_df["text"].str.lower()
train_df['text_no_punctuation'] = train_df['text_lower'].str.replace('[^\w\s]','')
train_df['text_no_punctuation'] = train_df["text_no_punctuation"].fillna("fillna")

train_df['author_lower'] = train_df["author"].str.lower()
train_df['author_no_spaces'] = train_df['author_lower'].str.replace(' ','_')

In [14]:
train_df.head()

Unnamed: 0,id,title,author,text,label,title_lower,title_no_punctuation,text_lower,text_no_punctuation,author_lower,author_no_spaces
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide: we didn’t even see comey’s let...,house dem aide we didnt even see comeys letter...,house dem aide: we didn’t even see comey’s let...,house dem aide we didnt even see comeys letter...,darrell lucus,darrell_lucus
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"flynn: hillary clinton, big woman on campus - ...",flynn hillary clinton big woman on campus bre...,ever get the feeling your life circles the rou...,ever get the feeling your life circles the rou...,daniel j. flynn,daniel_j._flynn
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,why the truth might get you fired,why the truth might get you fired,"why the truth might get you fired october 29, ...",why the truth might get you fired october 29 2...,consortiumnews.com,consortiumnews.com
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 civilians killed in single us airstrike hav...,15 civilians killed in single us airstrike hav...,videos 15 civilians killed in single us airstr...,videos 15 civilians killed in single us airstr...,jessica purkiss,jessica_purkiss
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jailed for fictional unpublished...,iranian woman jailed for fictional unpublished...,print \nan iranian woman has been sentenced to...,print \nan iranian woman has been sentenced to...,howard portnoy,howard_portnoy


In [15]:
train_df['author_no_spaces'].head()

0         darrell_lucus
1       daniel_j._flynn
2    consortiumnews.com
3       jessica_purkiss
4        howard_portnoy
Name: author_no_spaces, dtype: object

In [16]:
max_features=5000 #we set maximum number of words to 5000
maxlen=400 #we set maximum sequence length to 400

In [17]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step
tok.fit_on_texts(list(train_df['text_no_punctuation'])+list(train_df['title_no_punctuation'])+list(train_df['author_no_spaces'].astype(str))) #fit to cleaned text

In [18]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1

216068


In [19]:
text_df = tok.texts_to_sequences(list(train_df['text_no_punctuation'])) #this is how we create sequences
text_df = tf.keras.preprocessing.sequence.pad_sequences(text_df, maxlen=maxlen) #let's execute pad step

title_df = tok.texts_to_sequences(list(train_df['title_no_punctuation'])) #this is how we create sequences
title_df = tf.keras.preprocessing.sequence.pad_sequences(title_df, maxlen=maxlen)

author_df = tok.texts_to_sequences(list(train_df['author_no_spaces'].astype(str))) #this is how we create sequences
author_df = tf.keras.preprocessing.sequence.pad_sequences(author_df, maxlen=maxlen)

In [20]:
train_df = author_df + title_df + text_df

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [22]:
embedding_dim = 50

In [25]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 50)           10803450  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 400, 100)          40400     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 64)                34048     
_________________________________________________________________
dense_3 (Dense)              (None, 50)                3250      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51        
Total params: 10,881,199
Trainable params: 10,881,199
Non-trainable params: 0
_________________________________________________________________


In [26]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',#no more categorical_crossentropy
              metrics=['accuracy'])

In [27]:
model.fit(np.array(X_train), np.array(y_train), epochs=1)

Train on 18720 samples


<tensorflow.python.keras.callbacks.History at 0x7fe3202dbc50>