# Answers to coding questions in deep learning
Peeter Niidas

In [1]:
# Make some preparations

import glob
import json
import pandas as pd
import numpy as np
import re
import spacy
from gensim.models import Word2Vec
import os
import sys
from time import time

import matplotlib.pyplot as plt
#%tensorflow_version 1.x

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

1. Download either the PHEME5 or PHEME9 dataset.
2. Choose any one event from the dataset and load its first 100 JSON files into
python memory.
3. Extract and load attributes such as source tweets, reply tweets, and favorites
count, labels of the source tweets.

In [2]:
# Get filenames and load data

def read_rename_data (dirr, length):
    path = dirr
    df = pd.DataFrame()
    for filename in glob.iglob(path + '**/*.json', recursive=True):
        if 'source-tweet' in filename:
            i = 1
            f = open(filename)
            data = json.load(f)
            new_row = {'source_tweet': data['text'], 'favorite_count': data['favorite_count'], 'retweet_count':data['retweet_count']}
            df = df.append(new_row, ignore_index=True)
            f.close()
        elif 'reactions' in filename:
            f = open(filename)
            data = json.load(f)
            new_name = 'reply_' + str(i)
            df.loc[df.index[-1], new_name] = data['text']
            i += 1
        if df.shape[0] == length + 1:
            df.drop(df.tail(1).index,inplace=True)
            return df
    return df

In [68]:
df_true = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/non-rumours/', 2)
df_true.shape

(2, 73)

In [None]:
df_rumors = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/rumours/', 100)
df_rumors.shape

In [69]:
df_true.head()

Unnamed: 0,source_tweet,favorite_count,retweet_count,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,...,reply_61,reply_62,reply_63,reply_64,reply_65,reply_66,reply_67,reply_68,reply_69,reply_70
0,There are no verses in the Quran about Me want...,4533.0,4465.0,@withnodriver no shit sherlock. are you for re...,@withnodriver @basznocz Religion doesnt have t...,@fury_jen @TheTweetOfGod blasphemer.,@runitright1 @TheTweetOfGod Sometime also Thor..,@Usairam1 mohamed liked to fuck little aisha. ...,@RStarovich @TheTweetOfGod Hahahha...you're ri...,@Usairam1 @basznocz you're kidding they're mak...,...,@nselby @TheTweetOfGod no one has guns? It sa...,"@basznocz islam is a lie, just a suppressive m...",@withnodriver @basznocz Haha. All the science ...,@cIeopatrio @TheTweetOfGod you know it was a j...,@timpoliti @runitright1 @TheTweetOfGod Allah t...,@fury_jen @TheTweetOfGod his voicemail is ful...,@withnodriver @basznocz Were WW1 WW2 and many ...,@basznocz @TheTweetOfGod religion is a complet...,@weisblumen @IncredulousMark @withnodriver @ba...,@TheTweetOfGod sorry god but you're wrong. hav...
1,"Mosques, synagogues and churches are inviting ...",116.0,157.0,@elronxenu I'm surprised he hasn't been taken ...,@TeamOyeniyi Specifically: guy was on his way ...,@TeamOyeniyi So long as he doesn't kill any ho...,@elronxenu So is he just holed up until the bi...,@abcnewsSydney #sydneysiege Very disturbing ne...,@abcnewsSydney @jarrodmckenna Amen.,@elronxenu I'm inclined to think outside the s...,...,,,,,,,,,,


Data cleaning for word2vec conversion

In [70]:
# Dropping numeric features

df_cleaning_w2v = df_true.drop(df_true.columns[[1, 2]], axis=1)
df_cleaning_w2v.head()



Unnamed: 0,source_tweet,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,reply_8,reply_9,...,reply_61,reply_62,reply_63,reply_64,reply_65,reply_66,reply_67,reply_68,reply_69,reply_70
0,There are no verses in the Quran about Me want...,@withnodriver no shit sherlock. are you for re...,@withnodriver @basznocz Religion doesnt have t...,@fury_jen @TheTweetOfGod blasphemer.,@runitright1 @TheTweetOfGod Sometime also Thor..,@Usairam1 mohamed liked to fuck little aisha. ...,@RStarovich @TheTweetOfGod Hahahha...you're ri...,@Usairam1 @basznocz you're kidding they're mak...,@nselby exactly. I'm holding off RTing this @T...,@withnodriver @basznocz Our religion is religi...,...,@nselby @TheTweetOfGod no one has guns? It sa...,"@basznocz islam is a lie, just a suppressive m...",@withnodriver @basznocz Haha. All the science ...,@cIeopatrio @TheTweetOfGod you know it was a j...,@timpoliti @runitright1 @TheTweetOfGod Allah t...,@fury_jen @TheTweetOfGod his voicemail is ful...,@withnodriver @basznocz Were WW1 WW2 and many ...,@basznocz @TheTweetOfGod religion is a complet...,@weisblumen @IncredulousMark @withnodriver @ba...,@TheTweetOfGod sorry god but you're wrong. hav...
1,"Mosques, synagogues and churches are inviting ...",@elronxenu I'm surprised he hasn't been taken ...,@TeamOyeniyi Specifically: guy was on his way ...,@TeamOyeniyi So long as he doesn't kill any ho...,@elronxenu So is he just holed up until the bi...,@abcnewsSydney #sydneysiege Very disturbing ne...,@abcnewsSydney @jarrodmckenna Amen.,@elronxenu I'm inclined to think outside the s...,"@abcnewsSydney Oh cool, just what we need... M...","@TeamOyeniyi No, the bigger event is postponed...",...,,,,,,,,,,


In [11]:
#Manual, step-by step

from time import time  # To time our operations
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [16]:
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaning(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

In [28]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df_cleaning_w2v['source_tweet'])

In [29]:
t = time()
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1)]
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 1.53 mins


In [30]:
df_cleaned = pd.DataFrame({'clean': txt})
df_cleaned.shape

(100, 1)

In [31]:
df_cleaned


Unnamed: 0,clean
0,verse quran want hold hostage chocolate shop s...
1,mosque synagogue church invite public pray syd...
2,illridewithyou viral aussie solidarity amid ...
3,continue monitor situation sydney closely touc...
4,love illridewithyou initiative spread love com...
...,...
95,pray hostage family sydney
96,thinking sydney today x
97,sydney harbour bridge close accident cahill ex...
98,break news sydneysiege accord police http t co...


In [59]:
# Superslow

def cleaner(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

def data_cleaning(x):
    exclusion_list = ['#', '@']
    words = str(x).split()
    pre_cleaned = [word for word in words if all(ch not in word for ch in exclusion_list)] #removing hashtags, mentions, mail addresses
    brief_clean = (re.sub("[^A-Za-z']+", ' ', str(pre_cleaned)).lower())
    # print(brief_clean)
    txt = [cleaner(doc) for doc in nlp.pipe(brief_cleaning, batch_size=1000, n_process=-1)]
    return txt

In [60]:
t = time()
df_true_cleaned = df_cleaning_w2v.applymap(data_cleaning)
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Process Process-706:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/spacy/language.py", line 2256, in _apply_pipes
    texts_with_ctx = receiver.get()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/queues.py", line 97, in get
    res = self._recv_bytes()
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File 

KeyboardInterrupt: 

In [63]:
df_true_cleaned



NameError: name 'df_true_cleaned' is not defined

In [132]:
# Data cleaning trying to speed it up

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed

def cleaner(doc):
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

def data_cleansing(x):
    exclusion_list = ['#', '@']
    words = str(x).split()
    pre_cleaned = [word for word in words if all(ch not in word for ch in exclusion_list)] #removing hashtags, mentions, mail addresses
    return ' '.join(pre_cleaned)

In [133]:
df_true_precean = df_cleaning_w2v.applymap(data_cleansing)
df_true_precean


Unnamed: 0,source_tweet,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,reply_8,reply_9,...,reply_61,reply_62,reply_63,reply_64,reply_65,reply_66,reply_67,reply_68,reply_69,reply_70
0,There are no verses in the Quran about Me want...,no shit sherlock. are you for real? and you sa...,Religion doesnt have to do anything with devel...,blasphemer.,Sometime also Thor..,mohamed liked to fuck little aisha. he liked 6...,Hahahha...you're right. But we're doing plenty...,you're kidding they're making them hold islami...,exactly. I'm holding off RTing this it may end...,Our religion is religion of peace. God does no...,...,no one has guns? It says he's armed.,"islam is a lie, just a suppressive measure for...",Haha. All the science you have rn is due to th...,"you know it was a joke, right?","Allah traslate to ""The God"" in Arabic so yeah.",his voicemail is full. Please try later.If you...,Were WW1 WW2 and many other wars that killed m...,religion is a complete fallacy,Actually this verse was intended for those ido...,sorry god but you're wrong. haven't you read y...
1,"Mosques, synagogues and churches are inviting ...",I'm surprised he hasn't been taken down by the...,Specifically: guy was on his way to some bigge...,"So long as he doesn't kill any hostages, he ha...",So is he just holed up until the bigger event ...,Very disturbing news and my deepest sympathy w...,Amen.,I'm inclined to think outside the square on th...,"Oh cool, just what we need... MORE religion!","No, the bigger event is postponed or foiled. H...",...,,,,,,,,,,


In [143]:
# Applying lemmatization and tokenizing

def finalise_cleaning(db):
    df_out = pd.DataFrame()
    def cleaner2(doc):
        txt = [token.lemma_ for token in doc if not token.is_stop]
        if len(txt) > 2:
            return ' '.join(txt)
    for i in range(db.shape[1]):
        print(i)
        brief_clean = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in db.iloc[:, i])
        txt = [cleaner2(doc) for doc in nlp.pipe(brief_clean, batch_size=5000, n_process=-1)]
        df_part = pd.DataFrame({'reply_'+str(i): txt})
        df_out = pd.concat([df_out, df_part], axis=1)
        df_out.columns = ['source_tweet' if x=='reply_0' else x for x in df_out.columns]
    return df_out

In [190]:
# Making short df for testing

df_test_df = df_true_precean.loc[:, ['source_tweet', 'reply_1']]
df_test_df.shape

(2, 2)

In [191]:
df_test_df.iloc[:, 0]

0    There are no verses in the Quran about Me want...
1    Mosques, synagogues and churches are inviting ...
Name: source_tweet, dtype: object

In [192]:
t = time()
df_test = finalise_cleaning(df_test_df)
print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

0
0    There are no verses in the Quran about Me want...
1    Mosques, synagogues and churches are inviting ...
Name: source_tweet, dtype: object
<generator object finalise_cleaning.<locals>.<genexpr> at 0x7fb09300e7b0>
['verse quran want hold hostage chocolate shop sydney terrorist fuck', 'mosque synagogue church invite public pray sydney hostage']
                                                col0
0  verse quran want hold hostage chocolate shop s...
1  mosque synagogue church invite public pray syd...
(2, 1)
1
0    no shit sherlock. are you for real? and you sa...
1    I'm surprised he hasn't been taken down by the...
Name: reply_1, dtype: object
<generator object finalise_cleaning.<locals>.<genexpr> at 0x7fb09300e740>
['shit sherlock real say decade try millenia learn history', 'surprised take hostage outnumber badly']
                                                col1
0  shit sherlock real say decade try millenia lea...
1             surprised take hostage outnumber badly
(2, 2

In [238]:
df_test

Unnamed: 0,col0,col1
0,verse quran want hold hostage chocolate shop s...,shit sherlock real say decade try millenia lea...
1,mosque synagogue church invite public pray syd...,surprised take hostage outnumber badly


In [206]:
# Df -> Df of modules
#df_test.col0.str.len().max() # finds max value of symbols in column
df_test.iloc[:,0].str.split().str.len().max() # Find longest sentence in column

10

In [269]:
#str(df_test.iloc[:1,0][0]).split()[0]
str(df_test.iloc[:,0][0]).split()

['verse',
 'quran',
 'want',
 'hold',
 'hostage',
 'chocolate',
 'shop',
 'sydney',
 'terrorist',
 'fuck']

In [345]:
def word_separator(df):
    df_out = pd.DataFrame()
    for i in range(df.shape[0]):
        k_max = df.iloc[:,i].str.split().str.len().max() # max No of words in set column
        print(str(k_max))
        name = str(df.columns[i])
        listing = df.iloc[:,i]
        df_part = pd.DataFrame()
        for j in range(df.shape[1]):
            sentence = listing[j]
            print(str(sentence))
            df_part_0 = pd.DataFrame()
            for k in range(k_max):
                subname = name + '_' + str(k)
                words = str(sentence).split()
                if k < len(words):
                    word = words[k]
                else:
                    word = ''
                print(str(subname))
                print(str(word))
                df_part_1 = pd.DataFrame({subname: [word]})
                df_part_0 = pd.concat([df_part_0, df_part_1], axis=1)
                print(df_part_0.shape)
            print(df_part_0.shape)
            #return df_part_0
            df_part = pd.concat([df_part, df_part_0], axis=0)
        #return df_part
        df_out = pd.concat([df_out, df_part], axis=1)
    return df_out


In [346]:
df_ttest = word_separator(df_test)

10
verse quran want hold hostage chocolate shop sydney terrorist fuck
col0_0
verse
(1, 1)
col0_1
quran
(1, 2)
col0_2
want
(1, 3)
col0_3
hold
(1, 4)
col0_4
hostage
(1, 5)
col0_5
chocolate
(1, 6)
col0_6
shop
(1, 7)
col0_7
sydney
(1, 8)
col0_8
terrorist
(1, 9)
col0_9
fuck
(1, 10)
(1, 10)
mosque synagogue church invite public pray sydney hostage
col0_0
mosque
(1, 1)
col0_1
synagogue
(1, 2)
col0_2
church
(1, 3)
col0_3
invite
(1, 4)
col0_4
public
(1, 5)
col0_5
pray
(1, 6)
col0_6
sydney
(1, 7)
col0_7
hostage
(1, 8)
col0_8

(1, 9)
col0_9

(1, 10)
(1, 10)
9
shit sherlock real say decade try millenia learn history
col1_0
shit
(1, 1)
col1_1
sherlock
(1, 2)
col1_2
real
(1, 3)
col1_3
say
(1, 4)
col1_4
decade
(1, 5)
col1_5
try
(1, 6)
col1_6
millenia
(1, 7)
col1_7
learn
(1, 8)
col1_8
history
(1, 9)
(1, 9)
surprised take hostage outnumber badly
col1_0
surprised
(1, 1)
col1_1
take
(1, 2)
col1_2
hostage
(1, 3)
col1_3
outnumber
(1, 4)
col1_4
badly
(1, 5)
col1_5

(1, 6)
col1_6

(1, 7)
col1_7

(1, 8)
col1_

In [347]:
df_ttest

Unnamed: 0,col0_0,col0_1,col0_2,col0_3,col0_4,col0_5,col0_6,col0_7,col0_8,col0_9,col1_0,col1_1,col1_2,col1_3,col1_4,col1_5,col1_6,col1_7,col1_8
0,verse,quran,want,hold,hostage,chocolate,shop,sydney,terrorist,fuck,shit,sherlock,real,say,decade,try,millenia,learn,history
0,mosque,synagogue,church,invite,public,pray,sydney,hostage,,,surprised,take,hostage,outnumber,badly,,,,


In [None]:
#Creating list of words

sent_all = []
for i in range(df_test.shape[1]):
    sent2 = [row.split() for row in df_test.iloc[:, i]]
    print(sent2)
    sent_all.extend(sent2)
    print(sent_all)

In [169]:
sent_all2 = sent_all

In [179]:
sent_combined
#sent3


In [180]:
sent_all.extend(sent_all2)
sent_all

['surprised',
 'take',
 'hostage',
 'outnumber',
 'badly',
 ['verse',
  'quran',
  'want',
  'hold',
  'hostage',
  'chocolate',
  'shop',
  'sydney',
  'terrorist',
  'fuck'],
 ['mosque',
  'synagogue',
  'church',
  'invite',
  'public',
  'pray',
  'sydney',
  'hostage'],
 ['shit',
  'sherlock',
  'real',
  'say',
  'decade',
  'try',
  'millenia',
  'learn',
  'history'],
 ['surprised', 'take', 'hostage', 'outnumber', 'badly'],
 ['verse',
  'quran',
  'want',
  'hold',
  'hostage',
  'chocolate',
  'shop',
  'sydney',
  'terrorist',
  'fuck'],
 ['mosque',
  'synagogue',
  'church',
  'invite',
  'public',
  'pray',
  'sydney',
  'hostage'],
 ['shit',
  'sherlock',
  'real',
  'say',
  'decade',
  'try',
  'millenia',
  'learn',
  'history'],
 ['surprised', 'take', 'hostage', 'outnumber', 'badly']]

In [181]:
phrases = Phrases(sent_all, min_count=30, progress_per=10000)

INFO - 12:57:54: collecting all words and their counts
INFO - 12:57:54: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 12:57:54: collected 103 token types (unigram + bigrams) from a corpus of 98 words and 13 sentences
INFO - 12:57:54: merged Phrases<103 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 12:57:54: Phrases lifecycle event {'msg': 'built Phrases<103 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.01s', 'datetime': '2022-08-16T12:57:54.932667', 'gensim': '4.2.0', 'python': '3.8.6 (v3.8.6:db455296be, Sep 23 2020, 13:31:39) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [182]:

bigram = Phraser(phrases)

INFO - 12:57:58: exporting phrases from Phrases<103 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 12:57:58: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<0 phrases, min_count=30, threshold=10.0> from Phrases<103 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.00s', 'datetime': '2022-08-16T12:57:58.925418', 'gensim': '4.2.0', 'python': '3.8.6 (v3.8.6:db455296be, Sep 23 2020, 13:31:39) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [183]:
sentences = bigram[sent_all]

TypeError: unhashable type: 'list'

In [175]:
word_freq = defaultdict(int)
for sent_all in sentences:
    for i in sent_all:
        word_freq[i] += 1
len(word_freq)

29

In [176]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['hostage',
 'sydney',
 'verse',
 'quran',
 'want',
 'hold',
 'chocolate',
 'shop',
 'terrorist',
 'fuck']

In [None]:

from gensim.models.phrases import Phrases, Phraser

In [34]:
sent = [row.split() for row in df_cleaned['clean']]

In [149]:
sent

['send',
 'love',
 'sydney',
 'illridewithyou',
 'sydneysiege',
 'http_t',
 'co',
 'cfdtyzehdp']

In [None]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [36]:
bigram = Phraser(phrases)

INFO - 15:44:35: exporting phrases from Phrases<1480 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 15:44:35: FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<2 phrases, min_count=30, threshold=10.0> from Phrases<1480 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000> in 0.00s', 'datetime': '2022-08-15T15:44:35.333348', 'gensim': '4.2.0', 'python': '3.8.6 (v3.8.6:db455296be, Sep 23 2020, 13:31:39) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [37]:
sentences = bigram[sent]

In [38]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

572

In [39]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['sydneysiege',
 'http_t',
 'co',
 'sydney',
 'hostage',
 'illridewithyou',
 'muslim',
 'police',
 'people',
 'cafe']