### Extract and clean sample data

In this notebook, we analyze a sample dataset of only 2,000 news
articles and build a neural network that classifies each article as
fake or reliable.

In [11]:
import pandas as pd
import numpy as np

from words_clean_function import denoise_text, normalize
from nltk import word_tokenize

from gensim import corpora, models

In [15]:
# Directory holding the cleaned news CSV files (trailing separator included)
data_path = 'D:\\PycharmProjects\\springboard\\data\\'

# Sample window: start at row 924500, where fake and reliable news are
# well mixed, and take 2000 rows from there
skiprows = 924500
nrows = 2000

# Load only the sample window, naming the three columns explicitly
df = pd.read_csv(
    f'{data_path}news_clean_1.csv',
    names=['index', 'type', 'content'],
    index_col=False,
    skiprows=skiprows,
    nrows=nrows,
)

# Discard the stored index column, then show the class balance of the sample
df = df.drop(columns='index')
df['type'].value_counts()

reliable    1001
fake         999
Name: type, dtype: int64

In [16]:
# Run the cleaning steps over every article, in order:
# denoise_text -> word_tokenize -> normalize
# (denoise_text and normalize come from words_clean_function)
df['content'] = (
    df['content']
    .map(denoise_text)
    .map(word_tokenize)
    .map(normalize)
)

df.head()

Unnamed: 0,type,content
0,fake,"[know, liberal, still, scratching, head, aanyo..."
1,fake,"[account, suspended, contact, hosting, provide..."
2,fake,"[frankenas, nonapology, senate, floor, roy, mo..."
3,fake,"[0, washington, dc, youare, hard, time, afford..."
4,fake,"[0, boston, massachusetts, weave, heard, overw..."


Gensim is the package of choice for the vectorization step because it
scales well to the larger data set.
Moreover, Gensim has a save-to-disk feature, which is quite neat!

In [94]:
from gensim.matutils import corpus2dense, corpus2csc

# Build the word lexicon from the tokenized articles
dictionary = corpora.Dictionary(df.content)
print(dictionary)

# Bag of words: one doc2bow result per article
bow = [dictionary.doc2bow(tokens) for tokens in df.content]

# Term-frequency matrix, kept in sparse form
tf_sparse_array = corpus2csc(bow)

# Notes on alternatives that were tried:
#   Word2Vec - this does not work
#   tf-idf   - again, this does not work with deep learning input!

Dictionary(52800 unique tokens: ['45th', 'aanyonea', 'abinders', 'action', 'afascista']...)


Separating train and test sets.
**TODO:** split the data into train and test sets once the model runs smoothly.

In [99]:
# Setting X and y
X = tf_sparse_array
y = df.type.astype('category').cat.codes
print(X.shape)

(52800, 2000)


### Simple Deep Learning models

In [108]:
# Model builds
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=52800))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X.T,y, epochs=5)

Epoch 1/5


InvalidArgumentError:  TypeError: 'SparseTensor' object is not subscriptable
Traceback (most recent call last):

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\ops\script_ops.py", line 241, in __call__
    return func(device, token, args)

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\ops\script_ops.py", line 130, in __call__
    ret = self._func(*args)

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 309, in wrapper
    return func(*args, **kwargs)

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 513, in py_method
    return [slice_array(inp) for inp in flat_inputs]

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 513, in <listcomp>
    return [slice_array(inp) for inp in flat_inputs]

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 512, in slice_array
    contiguous=contiguous)

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\keras\engine\training_utils.py", line 391, in slice_arrays
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\keras\engine\training_utils.py", line 391, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

  File "d:\pycharmprojects\springboard\venv\lib\site-packages\tensorflow\python\keras\engine\training_utils.py", line 391, in <listcomp>
    entries = [[x[i:i + 1] for i in indices] for x in arrays]

TypeError: 'SparseTensor' object is not subscriptable


	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_2338]

Function call stack:
train_function


In [105]:
from sklearn.linear_model import LogisticRegression

# Baseline: fit a default logistic regression on the transposed matrix, then
# report its accuracy on that same data (i.e. this is a training score)
clf = LogisticRegression().fit(X.T, y)
clf.score(X.T, y)

1.0

In [7]:
# Write the sample frame to a CSV file in the data directory
df.to_csv(path_or_buf=f'{data_path}news_clean_sample.csv')

