# Answers to coding questions in deep learning
Peeter Niidas

In [1]:
# Make some preparations

import glob
import json
import pandas as pd
import numpy as np
import re
import spacy
from gensim.models import Word2Vec
import os
import sys
from time import time

import matplotlib.pyplot as plt
#%tensorflow_version 1.x

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

1. Download either the PHEME5 or PHEME9 dataset.
2. Choose any one event from the dataset and load its first 100 JSON files into
python memory.
3. Extract and load attributes such as source tweets, reply tweets, and favorites
count, labels of the source tweets.

In [2]:
# Get filenames and load data

def read_rename_data (dirr, length):
    path = dirr
    df = pd.DataFrame()
    for filename in glob.iglob(path + '**/*.json', recursive=True):
        if 'source-tweet' in filename:
            i = 1
            f = open(filename)
            data = json.load(f)
            new_row = {'source_tweet': data['text'], 'favorite_count': data['favorite_count'], 'retweet_count':data['retweet_count']}
            df = df.append(new_row, ignore_index=True)
            f.close()
        elif 'reactions' in filename:
            f = open(filename)
            data = json.load(f)
            new_name = 'reply_' + str(i)
            df.loc[df.index[-1], new_name] = data['text']
            i += 1
        if df.shape[0] == length + 1:
            df.drop(df.tail(1).index,inplace=True)
            return df
    return df

In [3]:
df_true = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/non-rumours/', 50)
df_true.shape

  self.obj[key] = infer_fill_value(value)


(50, 165)

In [4]:
df_rumors = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/rumours/', 50)
df_rumors.shape

(50, 57)

In [5]:
# Adding labels for data, 1 - truth, 0 - rumor

df_true_labelled = df_true.copy()
df_true_labelled['Label'] = '1'
df_true_labelled.shape

(50, 166)

In [6]:
df_rumors_labelled = df_rumors.copy()
df_rumors_labelled['Label'] = '0'
df_rumors_labelled.shape

(50, 58)

In [7]:
# Joining labelled datasets

df_all_labelled = pd.concat([df_true_labelled, df_rumors_labelled], axis=0)
df_all_labelled.reset_index(drop=True)
df_all_labelled.shape

(100, 166)

In [8]:
# Saving datasets

os.chdir('./raw_data/')
df_true_labelled.to_csv('raw_true_labelled.csv', index=False)
df_rumors_labelled.to_csv('raw_rumors_labelled.csv', index=False)
df_all_labelled.to_csv('raw_all_labelled.csv', index=False)

In [9]:
# Loading datasets

df_true_labelled = pd.read_csv('raw_true_labelled.csv')
df_true_labelled.shape

(50, 166)

In [10]:
df_rumors_labelled = pd.read_csv('raw_rumors_labelled.csv')
df_rumors_labelled.shape

(50, 58)

In [11]:
df_all_labelled = pd.read_csv('raw_all_labelled.csv')
df_all_labelled.shape

(100, 166)

Data cleaning for word2vec conversion

In [12]:
# Dropping numeric features

df_cleaning_w2v = df_all_labelled.drop(df_all_labelled.columns[[1, 2, -1]], axis=1)
df_cleaning_w2v.shape

(100, 163)

In [25]:
# removing URL's, user mentions, hashtags and e-mail addresses

def data_cleaning(x):
    exclusion_list = ['#', '@']
    x = str(x).lower()
    cleaned_url = re.sub(r"https?://\S+", "", x)
    words = str(cleaned_url).split()
    cleaned = [word for word in words if all(ch not in word for ch in exclusion_list)]
    return ' '.join(cleaned)


Unnamed: 0,source_tweet,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,reply_8,reply_9,...,reply_153,reply_154,reply_155,reply_156,reply_157,reply_158,reply_159,reply_160,reply_161,reply_162
0,There are no verses in the Quran about Me want...,@withnodriver no shit sherlock. are you for re...,@withnodriver @basznocz Religion doesnt have t...,@fury_jen @TheTweetOfGod blasphemer.,@runitright1 @TheTweetOfGod Sometime also Thor..,@Usairam1 mohamed liked to fuck little aisha. ...,@RStarovich @TheTweetOfGod Hahahha...you're ri...,@Usairam1 @basznocz you're kidding they're mak...,@nselby exactly. I'm holding off RTing this @T...,@withnodriver @basznocz Our religion is religi...,...,,,,,,,,,,
1,"Mosques, synagogues and churches are inviting ...",@elronxenu I'm surprised he hasn't been taken ...,@TeamOyeniyi Specifically: guy was on his way ...,@TeamOyeniyi So long as he doesn't kill any ho...,@elronxenu So is he just holed up until the bi...,@abcnewsSydney #sydneysiege Very disturbing ne...,@abcnewsSydney @jarrodmckenna Amen.,@elronxenu I'm inclined to think outside the s...,"@abcnewsSydney Oh cool, just what we need... M...","@TeamOyeniyi No, the bigger event is postponed...",...,,,,,,,,,,
2,#illridewithyou: Viral Aussie solidarity amid ...,@MelonRouge I'll ride u and ur mom,@sirmomonothomo @MelonRouge \nLol\nKeep tweeti...,@sirmomonothomo @RT_com truth hurts dogs breath,@rhu71 @RT_com u look like ur waiting for a cu...,@sirmomonothomo #iwon'tridewithyou,@RT_com Will Abbot shirtfront terrorist?,@RT_com ISIS is Islam is terror,@RT_com ISLAM is being hijacked by ISIS. #ill...,@RT_com Oh bloody good! The whole world knows ...,...,,,,,,,,,,
3,We continue to monitor the situation in #Sydne...,,,,,,,,,,...,,,,,,,,,,
4,We love the #illridewithyou initiative. Spread...,@KatSiecris we agree. It's important to rememb...,@AmnestyNSW this quote was the first thing I t...,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,#breaking a number of people appear to have ju...,@Sema4beach @iamnotshouting @NicholasDole @ABC...,@kaitjb @iamnotshouting @NicholasDole @ABCNews...,@Sema4beach @kaitjb @NicholasDole @ABCNews24 f...,@Sema4beach @kaitjb @NicholasDole @ABCNews24 ...,@iamnotshouting @kaitjb @NicholasDole @ABCNews...,@NicholasDole,@iamnotshouting @kaitjb @NicholasDole @ABCNews...,@Sema4beach @iamnotshouting @NicholasDole @ABC...,@Sema4beach @iamnotshouting @NicholasDole @ABC...,...,,,,,,,,,,
96,RT @australian: Siege situation currently unfo...,,,,,,,,,,...,,,,,,,,,,
97,"Hostage involved in #SydneySiege texts ""I'm ok...",@abhishek2928 jusy like terrorists say there i...,@abhishek2928 muslims and terrorists. Doggggs,@BBCWorld I think this guy is younger brother ...,"@ibedaboss123 yes, to some extent u r correct....",@ibedaboss123 we can't point out to any religi...,Makes me sad. “@BBCWorld: Hostage involved in ...,@peeyushmalhotra @BBCWorld thts wht. I guess w...,@abhishek2928 watch the backladh muslims get i...,@ibedaboss123 may be correct bt all Muslims r ...,...,,,,,,,,,,
98,#BREAKING: Reports that some of the hostages h...,“@SkyNewsAust: #BREAKING: Reports that some of...,@Joseph_M20 @SkyNewsAust http://t.co/QmgulquOb...,@SkyNewsAust,@SkyNewsAust,@SkyNewsAust @JordanSekulow \n\nDefault #Islam...,@SkyNewsAust .AUS need to get some IDF HELP/AD...,@jesuspaddy @minimumwade @skynewsaust I'm tire...,@SkyNewsAust http://t.co/QmgulquObV STOP THE V...,@MinimumWade @Joseph_M20 I agree with him,...,,,,,,,,,,


In [14]:
df_cleaning_w2v = df_cleaning_w2v.applymap(data_cleaning)

In [17]:
# Applying lemmatization and tokenizing

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def finalise_cleaning(db):
    df_out = pd.DataFrame()
    def cleaner2(doc):
        txt = [token.lemma_ for token in doc if not token.is_stop]
        if len(txt) > 2:
            return ' '.join(txt)
    for i in range(db.shape[1]):
        brief_clean = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in db.iloc[:, i])
        txt = [cleaner2(doc) for doc in nlp.pipe(brief_clean, batch_size=5000, n_process=-1)]
        df_part = pd.DataFrame({'reply_'+str(i): txt})
        df_out = pd.concat([df_out, df_part], axis=1)
        df_out.columns = ['source_tweet' if x=='reply_0' else x for x in df_out.columns]
    return df_out

In [18]:
t = time()
df_clean = finalise_cleaning(df_cleaning_w2v)
print('Time to clean up all posts: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up all posts: 246.93 mins


In [19]:
df_clean.shape

(100, 163)

In [21]:
# Saving cleaned data for time-saving purposes in future implementations

df_clean.to_csv('cleaned_data.csv', index=False)

In [22]:
# Loading data from file

df_clean = pd.read_csv('cleaned_data.csv')
df_clean.shape

(100, 163)

In [24]:
# Using bigrams to catch something more

from gensim.models.phrases import Phrases, Phraser

#Creating list of words

def df_to_wordlist(df):
    sent_all = []
    for i in range(df.shape[1]):
        try:
            sent = [row.split() for row in df.iloc[:, i]]
        except:
            sent = ''
        sent_all.extend(sent)
    return sent_all

In [26]:
wordlist = df_to_wordlist(df_clean)

In [27]:
# Creating relevant phrases

phrases = Phrases(wordlist, min_count=30, progress_per=10000)

In [28]:
bigram = Phraser(phrases)

In [29]:
# Transforming according to bigrams detected

sentences = bigram[wordlist]

In [30]:
# A little sanity check
# Most frequent words

from collections import defaultdict

word_freq = defaultdict(int)
for wordlist in sentences:
    for i in wordlist:
        word_freq[i] += 1
len(word_freq)

407

In [31]:
# ... and most popular words

sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['sydney',
 'hostage',
 'cafe',
 'gunman',
 'police',
 'hold',
 'people',
 'situation',
 'siege',
 'flag']

Constructing word2vec model

In [32]:
import multiprocessing
from gensim.models import Word2Vec

In [33]:
cores = multiprocessing.cpu_count()
cores

16

In [34]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores-1)

In [35]:
# Building vocabulary table

t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocabulary table: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocabulary table: 0.0 mins


In [36]:
# Training the model

t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.0 mins


In [37]:
# Storing model for future use

w2v_model.save("word2vec.model")

In [38]:
# Storing vectors for words

from gensim.models import KeyedVectors

In [39]:
word_vectors = w2v_model.wv
word_vectors.save("word2vec.wordvectors")

In [None]:
# Loading wordvectors

w2v_model = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [None]:
# vector = w2v_model.wv['word']

Modifying databases so that every word has it's own cell. Later replacing words with vectors

In [43]:
# Adding labels to the dataset

labels = df_all_labelled['Label']
df_clean_labelled = df_clean.copy()
df_clean_labelled = df_clean_labelled.join(labels)
df_clean_labelled.shape

(100, 164)

In [47]:
def word_separator(df):
    df_out = pd.DataFrame()
    for i in range(df.shape[0]):
        df_line = pd.DataFrame({'ind':[i]})
        df_row = pd.DataFrame()
        for j in range(df.shape[1]):
            name = str(df.columns[j])
            try:
                sentence = str(df.iloc[:, j][i])
            except:
                sentence = []
            try:
                k_max = int(df.iloc[:,j].str.split().str.len().max())
            except:
                k_max = 1
            df_cell = pd.DataFrame()
            for k in range(k_max):
                subname = name + '_' + str(k)
                try:
                    words = sentence.split()
                except:
                    words = []
                if k < len(words):
                    word = words[k]
                else:
                    word = ''
                df_part = pd.DataFrame({subname: [word]})
                df_cell = pd.concat([df_cell, df_part], axis=1)
            df_row = pd.concat([df_row, df_cell], axis=1)
        df_line = pd.concat([df_line, df_row], axis=1)
        df_line.set_index('ind', inplace=True)
        df_out = pd.concat([df_out, df_line], axis=0, ignore_index=True)
    return df_out

In [48]:
t = time()
df_all_separate = word_separator(df_clean_labelled)
df_all_separate.shape
print('Time to reformat dataframe: {} mins'.format(round((time() - t) / 60, 2)))

Time to reformat dataframe: 6.85 mins


In [49]:
df_all_separate.shape

(100, 1412)

In [51]:
df_all_separate.to_csv('cleaned_separate_all_data.csv', index=False)

Converting words into vectors

In [137]:
def words_2_vec(x):
    vector = w2v_model.wv[x]
    return x

In [138]:
t = time()
df_vectorised_all = df_all_separate.applymap(words_2_vec)
print('Time to vectorise dataframe: {} mins'.format(round((time() - t) / 60, 2)))

KeyError: "Key 'verse' not present"

In [None]:
df_vectorised_all.head()