# Answers to coding questions in deep learning
Peeter Niidas

In [1]:
# Make some preparations

import glob
import json
import pandas as pd
import numpy as np
import re
import spacy
from gensim.models import Word2Vec
import os
import sys
from time import time

import matplotlib.pyplot as plt
#%tensorflow_version 1.x

import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

1. Download either the PHEME5 or PHEME9 dataset.
2. Choose any one event from the dataset and load its first 100 JSON files into
python memory.
3. Extract and load attributes such as source tweets, reply tweets, and favorites
count, labels of the source tweets.

In [2]:
# Get filenames and load data

def read_rename_data (dirr, length):
    path = dirr
    df = pd.DataFrame()
    for filename in glob.iglob(path + '**/*.json', recursive=True):
        if 'source-tweet' in filename:
            i = 1
            f = open(filename)
            data = json.load(f)
            new_row = {'source_tweet': data['text'], 'favorite_count': data['favorite_count'], 'retweet_count':data['retweet_count']}
            df = df.append(new_row, ignore_index=True)
            f.close()
        elif 'reactions' in filename:
            f = open(filename)
            data = json.load(f)
            new_name = 'reply_' + str(i)
            df.loc[df.index[-1], new_name] = data['text']
            i += 1
        if df.shape[0] == length:
            return df
    return df

In [3]:
df_true = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/non-rumours/', 100)
df_true.shape

  self.obj[key] = infer_fill_value(value)


(100, 344)

In [4]:
df_rumors = read_rename_data('./data/pheme-rnr-dataset/sydneysiege/rumours/', 100)
df_rumors.shape

(100, 94)

In [5]:
df_true.head()

Unnamed: 0,source_tweet,favorite_count,retweet_count,reply_1,reply_2,reply_3,reply_4,reply_5,reply_6,reply_7,...,reply_332,reply_333,reply_334,reply_335,reply_336,reply_337,reply_338,reply_339,reply_340,reply_341
0,There are no verses in the Quran about Me want...,4533.0,4465.0,@withnodriver no shit sherlock. are you for re...,@withnodriver @basznocz Religion doesnt have t...,@fury_jen @TheTweetOfGod blasphemer.,@runitright1 @TheTweetOfGod Sometime also Thor..,@Usairam1 mohamed liked to fuck little aisha. ...,@RStarovich @TheTweetOfGod Hahahha...you're ri...,@Usairam1 @basznocz you're kidding they're mak...,...,,,,,,,,,,
1,"Mosques, synagogues and churches are inviting ...",116.0,157.0,@elronxenu I'm surprised he hasn't been taken ...,@TeamOyeniyi Specifically: guy was on his way ...,@TeamOyeniyi So long as he doesn't kill any ho...,@elronxenu So is he just holed up until the bi...,@abcnewsSydney #sydneysiege Very disturbing ne...,@abcnewsSydney @jarrodmckenna Amen.,@elronxenu I'm inclined to think outside the s...,...,,,,,,,,,,
2,#illridewithyou: Viral Aussie solidarity amid ...,85.0,148.0,@MelonRouge I'll ride u and ur mom,@sirmomonothomo @MelonRouge \nLol\nKeep tweeti...,@sirmomonothomo @RT_com truth hurts dogs breath,@rhu71 @RT_com u look like ur waiting for a cu...,@sirmomonothomo #iwon'tridewithyou,@RT_com Will Abbot shirtfront terrorist?,@RT_com ISIS is Islam is terror,...,,,,,,,,,,
3,We continue to monitor the situation in #Sydne...,68.0,138.0,,,,,,,,...,,,,,,,,,,
4,We love the #illridewithyou initiative. Spread...,106.0,156.0,@KatSiecris we agree. It's important to rememb...,@AmnestyNSW this quote was the first thing I t...,,,,,,...,,,,,,,,,,


Cleaning data

In [6]:
df_source = df_true[['source_tweet']].copy()

In [10]:
df_source.head()

Unnamed: 0,source_tweet
0,There are no verses in the Quran about Me want...
1,"Mosques, synagogues and churches are inviting ..."
2,#illridewithyou: Viral Aussie solidarity amid ...
3,We continue to monitor the situation in #Sydne...
4,We love the #illridewithyou initiative. Spread...


In [11]:
sent = [row.split(' ') for row in df_source['source_tweet']]

In [14]:
model = Word2Vec(sent, min_count=1,vector_size= 50,workers=3, window =3, sg = 1)

In [18]:
from gensim.models.phrases import Phrases, Phraser

In [19]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [20]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [21]:
bigram = Phraser(phrases)

In [22]:
sentences = bigram[sent]

In [24]:
from collections import defaultdict

In [25]:
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
len(word_freq)

851

In [26]:
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['the',
 'to',
 '#sydneysiege',
 'in',
 'of',
 'is',
 'for',
 'Sydney',
 'and',
 'hostages']

In [27]:
import multiprocessing
from gensim.models import Word2Vec

In [28]:
cores = multiprocessing.cpu_count()

In [29]:
cores

16

In [31]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores-1)

In [33]:
from time import time
import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [34]:
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 00:04:33: collecting all words and their counts
INFO - 00:04:33: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 00:04:33: collected 851 word types from a corpus of 1635 raw words and 100 sentences
INFO - 00:04:33: Creating a fresh vocabulary
INFO - 00:04:33: Word2Vec lifecycle event {'msg': 'effective_min_count=20 retains 8 unique words (0.94% of original 851, drops 843)', 'datetime': '2022-08-11T00:04:33.505392', 'gensim': '4.2.0', 'python': '3.8.6 (v3.8.6:db455296be, Sep 23 2020, 13:31:39) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 00:04:33: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 285 word corpus (17.43% of original 1635, drops 1350)', 'datetime': '2022-08-11T00:04:33.506368', 'gensim': '4.2.0', 'python': '3.8.6 (v3.8.6:db455296be, Sep 23 2020, 13:31:39) \n[Clang 6.0 (clang-600.0.57)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO

Time to build vocab: 0.0 mins


In [38]:
w2v_model.wv.most_similar(positive=["and"])

KeyError: "Key 'and' not present in vocabulary"