In [1]:
import pandas as pd
import csv
import matplotlib.pyplot as plt
import numpy as np
import scipy
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from random import sample
import re
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
print('Loading and Processing Data ...')
# df = pd.read_csv('../data/filtered_all.csv')
# df2 = df.groupby('user_screen_name').head(1000).reset_index(drop=True)
df2 = pd.read_csv('../data/media_1k_per.csv')
coor = pd.read_csv('../data/Media_coordinate.csv')


Loading and Processing Data ...


In [2]:
def assign_coor(df, coor):
    media2bias = dict(zip(coor.Source, coor.Bias))
    df['bias'] = df['user_screen_name'].map(media2bias)
    df['bias'] = df['bias']/df['bias'].abs().max()
    media2qual = dict(zip(coor.Source, coor.Quality))
    df['quality'] = df['user_screen_name'].map(media2qual)
    df['quality'] = df['quality'] - df['quality'].abs().min()
    df['quality'] = df['quality']/df['quality'].abs().max()
    return df
    
df = assign_coor(df2, coor)
articles = []
labels = []
df = df.sample(frac=1,replace=True).reset_index(drop=True)
def remove_urls (vTEXT):
    vTEXT = re.sub(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '', vTEXT, flags=re.MULTILINE)
    return(vTEXT)

for index, row in df.iterrows():
    labels.append(row[['bias','quality']])
    article = row['text']
    for word in STOPWORDS:
        token = ' ' + word + ' '
        article = article.replace(token, ' ')
        article = article.replace(' ', ' ')
        article = remove_urls(article)
    articles.append(article)
    
# print(len(labels))
# print(len(articles))
# print((articles[0],labels[0]))

print('Loading and Processing Data Complete!')
vocab_size = 5000
embedding_dim = 100
max_length = 100
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

Loading and Processing Data Complete!


In [3]:
train_size = .5

train_size0 = len(articles) * train_size
train_articles = articles[0: int(train_size0)]
# train_index = sample(list(range(len(articles))), int(train_size0))
# train_articles = [articles[i] for i in train_index]
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index


train_labels = labels[0: int(train_size0)]
# train_labels = [labels[i] for i in train_index]
train_sequences = tokenizer.texts_to_sequences(train_articles)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
validation_articles = articles[int(train_size0):]
# validation_articles = articles
validation_labels = labels[int(train_size0):]
# validation_labels = labels
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_label_seq = np.array(train_labels).astype('float32')
validation_label_seq = np.array(validation_labels).astype('float32')

In [4]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Dense(2)
])

model.compile(optimizer='adam', loss="mse")
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=1)
# model.fit(train_padded, training_label_seq, epochs=num_epochs, verbose=1)

Train on 32063 samples, validate on 32064 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [5]:
sequences = tokenizer.texts_to_sequences(articles)
sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
output = model.predict(sequences)
bias_pred = output[:,0]
qual_pred = output[:,1]
# df['bias_pred'] = bias_pred
# df['qual_pred'] = qual_pred

In [6]:
df_test = df
# df_test['bias_pred'] = df_test['bias']
# df_test['qual_pred'] = df_test['quality']
# df_test['bias_pred'].iloc[int(train_size0):] = bias_pred[int(train_size0):]
# df_test['qual_pred'].iloc[int(train_size0):] = qual_pred[int(train_size0):]

df_test['bias_pred'] = bias_pred
df_test['qual_pred'] = qual_pred

# df_test['qual_pred'] = df_test['quality']
# df_test['qual_pred'].iloc[:int(train_size0)] = qual_pred[:int(train_size0)]
# coor = coor.sort_values(by = ['Source'])
df_test = df_test[int(train_size0):]

average_bias = df_test.groupby(['user_screen_name']).bias_pred.mean()
average_qual = df_test.groupby(['user_screen_name']).qual_pred.mean()
bias = df_test.groupby(['user_screen_name']).bias.mean()
quality = df_test.groupby(['user_screen_name']).quality.mean()

corr_bias = scipy.stats.pearsonr(average_bias.tolist(), bias.tolist())[0]
corr_quality = scipy.stats.pearsonr(average_qual.tolist(), quality.tolist())[0]

print(corr_bias, corr_quality)

0.8124385631814912 0.8245349749685975


In [122]:
model0 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Activation('sigmoid')
])

model0.compile(optimizer='adam', loss="mse")
num_epochs = 10
# history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=1)
model0.fit(train_padded, training_label_seq[:,0], epochs=num_epochs, verbose=1)

model1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Activation('sigmoid')
])

model1.compile(optimizer='adam', loss="mse")
num_epochs = 10
# history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=1)
model1.fit(train_padded, training_label_seq[:,1], epochs=num_epochs, verbose=1)

ValueError: A target array with shape (32063, 1) was passed for an output of shape (None, 100) while using as loss `mean_squared_error`. This loss expects targets to have the same shape as the output.

In [112]:
bias_pred = model0.predict(sequences)
qual_pred = model1.predict(sequences)

In [113]:
df_test = df
df_test['bias_pred'] = bias_pred
df_test['qual_pred'] = qual_pred

# coor = coor.sort_values(by = ['Source'])
# df_test = df_test[int(train_size0):]

average_bias = df_test.groupby(['user_screen_name']).bias_pred.mean()
average_qual = df_test.groupby(['user_screen_name']).qual_pred.mean()
bias = df_test.groupby(['user_screen_name']).bias.mean()
quality = df_test.groupby(['user_screen_name']).quality.mean()

corr_bias = scipy.stats.pearsonr(average_bias.tolist(), bias.tolist())[0]
corr_quality = scipy.stats.pearsonr(average_qual.tolist(), quality.tolist())[0]

print(corr_bias, corr_quality)

nan nan




In [125]:
model0 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    tf.keras.layers.Activation('sigmoid')
])
model0.output

<tf.Tensor 'activation_4/Identity:0' shape=(None, 100) dtype=float32>

In [126]:
embedding_dim

100