# Answers to coding questions in deep learning
Peeter Niidas

In [1]:
# Make some preparations

import glob
import json
import pandas as pd
import numpy as np
import re
import spacy
from gensim.models import Word2Vec
import os
import sys
from time import time

import matplotlib.pyplot as plt
#%tensorflow_version 1.x

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

1. Download either the PHEME5 or PHEME9 dataset.
2. Choose any one event from the dataset and load its first 100 JSON files into
python memory.
3. Extract and load attributes such as source tweets, reply tweets, and favorites
count, labels of the source tweets.

In [2]:
# Get filenames and load data

def read_rename_data (dirr, length):
    path = dirr
    df = pd.DataFrame()
    for filename in glob.iglob(path + '**/*.json', recursive=True):
        if 'source-tweet' in filename:
            i = 1
            f = open(filename)
            data = json.load(f)
            new_row = {'source_tweet': data['text'], 'favorite_count': data['favorite_count'], 'retweet_count':data['retweet_count']}
            df = df.append(new_row, ignore_index=True)
            f.close()
        elif 'reactions' in filename:
            f = open(filename)
            data = json.load(f)
            new_name = 'reply_' + str(i)
            df.loc[df.index[-1], new_name] = data['text']
            i += 1
        if df.shape[0] == length + 1:
            df.drop(df.tail(1).index,inplace=True)
            return df
    return df

In [3]:
df_true = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/non-rumours/', 50)
df_true.shape

  self.obj[key] = infer_fill_value(value)


(50, 165)

In [4]:
df_rumors = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/rumours/', 50)
df_rumors.shape

(50, 57)

Data cleaning for word2vec conversion

In [6]:
# Dropping numeric features

df_cleaning_true_w2v = df_true.drop(df_true.columns[[1, 2]], axis=1)
df_cleaning_true_w2v.shape

(50, 163)

In [8]:
df_cleaning_rumors_w2v = df_rumors.drop(df_rumors.columns[[1, 2]], axis=1)
df_cleaning_rumors_w2v.shape

(50, 55)

In [11]:
# Applying lemmatization and tokenizing

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def finalise_cleaning(db):
    df_out = pd.DataFrame()
    def cleaner2(doc):
        txt = [token.lemma_ for token in doc if not token.is_stop]
        if len(txt) > 2:
            return ' '.join(txt)
    for i in range(db.shape[1]):
        print(i)
        brief_clean = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in db.iloc[:, i])
        txt = [cleaner2(doc) for doc in nlp.pipe(brief_clean, batch_size=5000, n_process=-1)]
        df_part = pd.DataFrame({'reply_'+str(i): txt})
        df_out = pd.concat([df_out, df_part], axis=1)
        df_out.columns = ['source_tweet' if x=='reply_0' else x for x in df_out.columns]
    return df_out

In [12]:
t = time()
df_true_clean = finalise_cleaning(df_cleaning_true_w2v)
print('Time to clean up true posts: {} mins'.format(round((time() - t) / 60, 2)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
Time to clean up true answers: 250.92 mins


In [15]:
df_true_clean.shape

(50, 163)

In [16]:
t = time()
df_rumors_clean = finalise_cleaning(df_cleaning_rumors_w2v)
df_rumors_clean.shape
print('Time to clean up rumors: {} mins'.format(round((time() - t) / 60, 2)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
Time to clean up rumors: 84.6 mins


In [17]:
df_rumors_clean.shape

(50, 55)

In [18]:
# Saving cleaned data for time-saving purposes in future implementations

os.chdir('./raw_data/')
df_true_clean.to_csv('cleaned_true_data.csv', index=False)
df_rumors_clean.to_csv('cleaned_rumors_data.csv', index=False)

In [None]:
# Loading data from file
df_true_clean = pd.read_csv('./raw_data/cleaned_true_data.csv')
df_true_clean.shape

In [None]:
df_rumors_clean = pd.read_csv('./raw_data/cleaned_rumors_data.csv')
df_rumors_clean.shape

In [22]:
# As for word2vec model we are using true values and rumors together, I will use one common wordlist

df_all = pd.concat([df_true_clean, df_rumors_clean], axis=0)
df_all.shape

(100, 163)

In [23]:
df_all.tail()

Unnamed: 0,source_tweet,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,reply_8,reply_9,...,reply_153,reply_154,reply_155,reply_156,reply_157,reply_158,reply_159,reply_160,reply_161,reply_162
45,break number people appear flee scene sydney...,sema beach iamnotshoute nicholasdole abcnew ...,kaitjb iamnotshoute nicholasdole abcnews mas...,sema beach kaitjb nicholasdole abcnews fuck ...,sema beach kaitjb nicholasdole abcnew thank ...,iamnotshoute kaitjb nicholasdole abcnew yeah...,,iamnotshoute kaitjb nicholasdole abcnew thin...,sema beach iamnotshoute nicholasdole abcnew ...,sema beach iamnotshoute nicholasdole abcnew ...,...,,,,,,,,,,
46,rt australian siege situation currently unfold...,,,,,,,,,,...,,,,,,,,,,
47,hostage involve sydneysiege text ok mother htt...,abhishek jusy like terrorist difference civi...,abhishek muslims terrorist doggggs,bbcworld think guy young brother kasab,ibedaboss yes extent u r correct u hv accept...,ibedaboss point religion fr terrorism bt ur ...,make sad bbcworld hostage involve sydneysiege ...,peeyushmalhotra bbcworld tht wht guess wait ...,abhishek watch backladh muslims australia fi...,ibedaboss correct bt muslim r not terrorist ...,...,,,,,,,,,,
48,breaking report hostage escape live ch sydne...,skynewsaust break report hostage escape live...,joseph m skynewsaust http t co qmgulquobv st...,,,skynewsaust jordansekulow default islamophob...,skynewsaust aus need idf help advice deal gu...,jesuspaddy minimumwade skynewsaust tired ter...,skynewsaust http t co qmgulquobv stop violen...,minimumwade joseph m agree,...,,,,,,,,,,
49,man haron monis id'd gunman hold captive sydne...,foxnew slimey islamist,foxnew kimmie tie isis take long,foxnew background dark light glass,foxnew ugh bearded piece shit,foxnew sniper,foxnew hang ass,foxnew treat way treat lesson aussie kill ba...,foxnew gee cute quick hillaryclinton terrori...,foxnew iranian refugee hold hostage australi...,...,,,,,,,,,,


In [44]:
# Using bigrams to catch something more

from gensim.models.phrases import Phrases, Phraser

#Creating list of words

def df_to_wordlist(df):
    sent_all = []
    for i in range(df.shape[1]):
        try:
            sent = [row.split() for row in df.iloc[:, i]]
        except:
            sent = ''
        sent_all.extend(sent)
    return sent_all

In [45]:
wordlist = df_to_wordlist(df_all)

In [47]:
# Creating relevant phrases

phrases = Phrases(wordlist, min_count=30, progress_per=10000)

In [48]:
bigram = Phraser(phrases)

In [50]:
# Transforming according to bigrams detected

sentences = bigram[wordlist]

In [52]:
# A little sanity check
# Most frequent words

from collections import defaultdict

word_freq = defaultdict(int)
for wordlist in sentences:
    for i in wordlist:
        word_freq[i] += 1
len(word_freq)

551

In [53]:
# ... and most popular words

sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['http_t',
 'co',
 'sydney',
 'sydneysiege',
 'hostage',
 'cafe',
 'gunman',
 'police',
 'hold',
 'break']

Constructing word2vec model

In [54]:
import multiprocessing
from gensim.models import Word2Vec

In [56]:
cores = multiprocessing.cpu_count()
cores

16

In [58]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores-1)

In [59]:
# Building vocabulary table

t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocabulary table: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocabulary table: 0.0 mins


In [60]:
# Training the model

t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.0 mins


In [61]:
# Storing model for future use

w2v_model.save("word2vec.model")

In [62]:
# Storing vectors for words

from gensim.models import KeyedVectors

In [64]:
word_vectors = w2v_model.wv
word_vectors.save("word2vec.wordvectors")

In [None]:
# Loading wordvectors

w2v_model = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [None]:
# vector = w2v_model.wv['word']

Modifying databases so that every word has it's own cell. Later replacing words with vectors

In [80]:
# Adding labels for data, 1 - truth, 0 - rumor

df_true_labelled = df_true_clean.copy()
df_true_labelled['Label'] = '1'
df_true_labelled.shape

(50, 164)

In [81]:
df_rumors_labelled = df_rumors_clean.copy()
df_rumors_labelled['Label'] = '0'
df_rumors_labelled.shape

(50, 56)

In [90]:
# Joining labelled datasets

df_all_labelled = pd.concat([df_true_labelled, df_rumors_labelled], axis=0)
df_all_labelled.reset_index(drop=True)
df_all_labelled.shape

(100, 164)

In [91]:
df_all_labelled.head()

Unnamed: 0,source_tweet,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,reply_8,reply_9,...,reply_154,reply_155,reply_156,reply_157,reply_158,reply_159,reply_160,reply_161,reply_162,Label
0,verse quran want hold hostage chocolate shop s...,withnodriver shit sherlock real say decade t...,withnodriver basznocz religion not developme...,fury jen thetweetofgod blasphemer,runitright thetweetofgod thor,usairam mohame like fuck little aisha like y...,rstarovich thetweetofgod hahahha right plent...,usairam basznocz kid make hold islamic flag ...,nselby exactly hold rte thetweetofgod end ac...,withnodriver basznocz religion religion peac...,...,,,,,,,,,,1
1,mosque synagogue church invite public pray syd...,elronxenu surprised take hostage outnumber b...,teamoyeniyi specifically guy way big event d...,teamoyeniyi long kill hostage excellent chan...,elronxenu hole big event happen,abcnewssydney sydneysiege disturbing news de...,abcnewssydney jarrodmckenna amen,elronxenu inclined think outside square smel...,abcnewssydney oh cool need religion,teamoyeniyi big event postpone foil hole sta...,...,,,,,,,,,,1
2,illridewithyou viral aussie solidarity amid ...,melonrouge ride u ur mom,sirmomonothomo melonrouge lol tweet say earl...,sirmomonothomo rt com truth hurt dog breath,rhu rt com u look like ur wait cum shot ugly...,sirmomonothomo iwon'tridewithyou,rt com abbot shirtfront terrorist,rt com isis islam terror,rt com islam hijack isis illridewithyou,rt com oh bloody good world know isis amp pa...,...,,,,,,,,,,1
3,continue monitor situation sydney closely touc...,,,,,,,,,,...,,,,,,,,,,1
4,love illridewithyou initiative spread love com...,katsiecris agree important remember today ev...,amnestynsw quote thing think hear illridewit...,,,,,,,,...,,,,,,,,,,1


In [96]:
def word_separator(df):
    df_out = pd.DataFrame()
    for i in range(df.shape[0]):
        print(str(i))
        k_max = int(df.iloc[:,i].str.split().str.len().max()) # max No of words in set column
        name = str(df.columns[i])
        listing = df.iloc[:,i]
        df_part = pd.DataFrame()
        for j in range(df.shape[1]):
            if j < len(listing):
                sentence = listing[j]
            else:
                sentence = []
            df_part_0 = pd.DataFrame()
            for k in range(k_max):
                subname = name + '_' + str(k)
                words = str(sentence).split()
                if k < len(words):
                    word = words[k]
                else:
                    word = ''
                df_part_1 = pd.DataFrame({subname: [word]})
                df_part_0 = pd.concat([df_part_0, df_part_1], axis=1)
            df_part = pd.concat([df_part, df_part_0], axis=0)
        df_out = pd.concat([df_out, df_part], axis=1)
    return df_out

In [97]:
t = time()
df_all_separate = word_separator(df_true_labelled)
df_all_separate.shape
print('Time to reformat dataframe: {} mins'.format(round((time() - t) / 60, 2)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
Time to reformat dataframe: 1.95 mins


In [98]:
df_all_separate.head()


Unnamed: 0,source_tweet_0,source_tweet_1,source_tweet_2,source_tweet_3,source_tweet_4,source_tweet_5,source_tweet_6,source_tweet_7,source_tweet_8,source_tweet_9,...,reply_49_5,reply_49_6,reply_49_7,reply_49_8,reply_49_9,reply_49_10,reply_49_11,reply_49_12,reply_49_13,reply_49_14
0,verse,quran,want,hold,hostage,chocolate,shop,sydney,terrorist,fuck,...,,,,,,,,,,
0,mosque,synagogue,church,invite,public,pray,sydney,hostage,tonight,sydneysiege,...,,,,,,,,,,
0,illridewithyou,viral,aussie,solidarity,amid,sydneysiege,islamic,state,link,http,...,,,,,,,,,,
0,continue,monitor,situation,sydney,closely,touch,juliebishopmp,express,thought,prayer,...,,,,,,,,,,
0,love,illridewithyou,initiative,spread,love,compassion,thought,affect,sydney,http,...,,,,,,,,,,
