## Climate Change Misinformation Detection

### 1) Reading the Data 

External data from Buzzfeed has been added as negative training data.

In [46]:
import json
import numpy as np

def get_data_from_file(filepath,ty):
    with open(filepath) as jsonfile:
        data = json.load(jsonfile)
    sentences = []
    labels = []
    size = len(data)
    for i in range(size):
        sentences.append(data[ty+'-'+str(i)]['text'])
        if'label' in data[ty+'-'+str(i)].keys():
            labels.append(data[ty+'-'+str(i)]['label'])
    return sentences,labels 

train_sents_posi,y_train_posi = get_data_from_file("train.json","train")
train_added,y_train_added = get_data_from_file("buzzfeed.json","train")
train_sents = train_sents_posi + train_added       
y_train = y_train_posi + y_train_added

dev_sents,y_dev = get_data_from_file("dev.json","dev")
test_sents,y_test = get_data_from_file("test-unlabelled.json","test")

y_train = np.array(y_train)
y_dev = np.array(y_dev)
print(len(train_sents))

2548


### 2) Preprocessing the Data

In [11]:
#import spacy
#from spacy.lang.en import English 
# Preprocess the documents
# Tokenize the documents and remove the stopwords and find lemma
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Preprocess the data
#nlp = spacy.load("en_core_web_sm")
stopwords = set(stopwords.words('english'))
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
  
def get_wordnet_pos(word):
    tag = nltk.tag.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def get_lemma(word):
    lemma = lemmatizer.lemmatize(word.lower(),get_wordnet_pos(word))
    return lemma
      
def get_token(text):
    processed_tokens = []
    tokens = nltk.word_tokenize(text)
    for token in tokens:
        if token.lower() in stopwords:
            continue
        else:
            processed_tokens.append(token)
    return processed_tokens


def preprocess_data(text):
    tokens = get_token(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

### 3) Topic Modelling on the training documents

In [48]:
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

clean_doc = [preprocess_data(doc) for doc in train_sents]
dictionary = corpora.Dictionary(clean_doc)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_doc]


In [61]:
import time
from gensim.models.ldamodel import LdaModel as Lda

start_time = time.time()
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)
train_time = time.time() - start_time
print("Training Time --- %s seconds " % (round(train_time, 2)))

Training Time --- 459.09 seconds 


In [64]:
print('\nPerplexity: ', ldamodel.log_perplexity(doc_term_matrix))


Perplexity:  -8.25125414101154


In [63]:
#Print all the topics
for topic in ldamodel.print_topics(num_topics=5, num_words=10):
    print(topic[0]," ",topic[1],"\n")

0   0.025*"climate" + 0.013*"change" + 0.007*"global" + 0.007*"warm" + 0.007*"energy" + 0.006*"temperature" + 0.005*"would" + 0.005*"carbon" + 0.005*"emission" + 0.005*"science" 

1   0.034*"clinton" + 0.034*"trump" + 0.015*"debate" + 0.012*"hillary" + 0.011*"campaign" + 0.008*"donald" + 0.008*"state" + 0.007*"presidential" + 0.006*"voter" + 0.006*"republican" 

2   0.008*"obama" + 0.008*"state" + 0.007*"president" + 0.007*"would" + 0.005*"country" + 0.005*"security" + 0.005*"court" + 0.004*"people" + 0.004*"government" + 0.004*"syria" 

3   0.017*"police" + 0.008*"people" + 0.007*"officer" + 0.007*"adult" + 0.007*"black" + 0.006*"hawkins" + 0.006*"thing" + 0.005*"charlotte" + 0.005*"young" + 0.004*"video" 

4   0.028*"trump" + 0.010*"republican" + 0.008*"would" + 0.007*"donald" + 0.007*"president" + 0.007*"photo" + 0.006*"people" + 0.005*"white" + 0.005*"caption" + 0.005*"national" 



In [110]:
# Get the topic of the document by the trained model
def categorize_topic(doc):
    clean_for_lda = preprocess_data(doc)
    doc_bow = dictionary.doc2bow(clean_for_lda)
    topic = ldamodel.get_document_topics(doc_bow)
    return topic

def classify_topic(docs):
    labels = {}
    for i in range(len(docs)):
        topic = categorize_topic(docs[i])
        if len(topic) < 2:
            if topic[0][0] != 0:
                labels[i] = 0
        else:
            max = 0
            for j in range(1,len(topic)):
                if topic[j][1] > topic[max][1]:
                    max = j
            if max != 0:
                labels[i] = 0
                
    return labels


dev_topic_label = classify_topic(dev_sents)
test_topic_label = classify_topic(test_sents)
#print(test_topic_label)


print(categorize_topic(test_sents[0]))

[(0, 0.6309511), (2, 0.1567901), (3, 0.08145625), (4, 0.12921995)]


In [125]:
# Get the remaining unclassified documents
def get_the_rest_docs(original_docs,assigned_topics):
    docs_idx = []
    docs = []
    for i in range(len(original_docs)):
        if i not in assigned_topics.keys():
            docs_idx.append(i)
            docs.append(original_docs[i])
    return docs_idx, docs
        
remaining_dev_idx, remaining_dev = get_the_rest_docs(dev_sents,dev_topic_label)
remaining_test_idx, remaining_test = get_the_rest_docs(test_sents,test_topic_label)
#print(len(remaining_dev_idx))
#print(remaining_dev)

### 4) Build a classifier

In [137]:
from keras.preprocessing.text import Tokenizer

test1 = train_sents
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(test1)
x_train = tokenizer.texts_to_matrix(test1, mode="count") 
x_dev = tokenizer.texts_to_matrix(remaining_dev, mode="count")
x_test = tokenizer.texts_to_matrix(remaining_test, mode="count")

print(x_train)
vocab_size = x_train.shape[1]
print("Vocab size =", vocab_size)


[[ 0.  0. 10. ...  0.  0.  0.]
 [ 0.  0. 60. ...  0.  0.  0.]
 [ 0.  0. 32. ...  0.  0.  0.]
 ...
 [ 0.  0. 83. ...  0.  0.  0.]
 [ 0.  0. 11. ...  0.  0.  0.]
 [ 0.  0. 32. ...  1.  1.  1.]]
Vocab size = 48567


In [141]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(x_train, y_train)
y_dev_remaining = classifier.predict(x_dev)
#score = classifier.score(x_dev, y_dev)

y_test_remaining = classifier.predict(x_test)



82


In [7]:
# Fit a feed forward neural network model
from keras.models import Sequential
from keras import layers

vocab_size = xx_test.shape[1]
model = Sequential(name="feedforward-bow-input")
model.add(layers.Dense(10, input_dim=vocab_size, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

#since it's a binary classification problem, we use a binary cross entropy loss here
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


Instructions for updating:
Colocations handled automatically by placer.
Model: "feedforward-bow-input"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 10)                485680    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 485,691
Trainable params: 485,691
Non-trainable params: 0
_________________________________________________________________


In [9]:
#training
model.fit(xx_test, y_train, epochs=20, verbose=True, validation_data=(xx_dev, y_dev), batch_size=10)

loss, accuracy = model.evaluate(xx_dev, y_dev, verbose=False)
print("\nTesting Accuracy:  {:.4f}".format(accuracy))

Instructions for updating:
Use tf.cast instead.
Train on 2548 samples, validate on 100 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Testing Accuracy:  0.6700


In [20]:
y_result = model.predict(x_test)
y_result = y_result.tolist()
y_test = []
for it in y_result:
    for i in it:
        y_test.append(round(i))
#print(y_test)

### Output the results

In [None]:
# Compute the result labels
def collect_labels(docs_length,topic_label,classified_idx,classified_label):
    y_result = [1]*docs_length
    for i in range(len(y_result)):
        if i in topic_label.keys():
            y_result[i] = topic_label[i]
        if i in classified_idx:
            y_result[i] = classified_label[classified_idx.index(i)]
    return y_result

y_dev_result = collect_labels(len(dev_sents),dev_topic_label,remaining_dev_idx, y_dev_remaining)
#y_test = collect_labels(len(test_sents),test_topic_label,remaining_test_idx,y_test_remaining)
#print(y_test)

In [153]:
# Output file
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)
        
def output_file(cat,labels):
    dict1 = {}
    dict2 = {}
    for i in range(len(labels)):
        dict2["label"] = labels[i]
        emu = cat + "-" + str(i)
        dict1[emu] = dict(dict2)
    out_file = open(cat + "-output.json","w")
    json.dump(dict1,out_file,cls=NpEncoder)
    out_file.close()
    
    
#output_file("test",y_test)
output_file("dev",y_dev_result)
